# Zodiac Race

## Initial setup

In [106]:
#Import the relevant libraries
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Read the files

First, we import the csv file and determine the column that contains 'ABCDE'

In [107]:
#Import csv files
z_file = "./The_Zodiac_Race_Challenge_Dataset.csv"
z_df = pd.read_csv(z_file)

In [108]:
z_df.head()

Unnamed: 0,Desert,Mountain,Water,Land,Forest,Snow,Average_Temperature,Is_Raining,Had_A_Good_Sleep,Winner,Winner_Time,Second,Second_Time,Third,Third_Time
0,7376,4387,7769,7374,814,2677,18,Yes,Yes,rat,218,dog,242,dragon,245
1,7911,5771,4993,1855,9605,283,18,Yes,Yes,tiger,216,snake,224,rat,239
2,8948,9631,1855,4913,3531,3746,39,Yes,No,ox,191,rabbit,217,horse,240
3,1735,6643,6066,9976,9119,9912,24,Yes,No,goat,319,rooster,322,horse,334
4,2001,8287,4062,8763,9751,2820,26,Yes,Yes,horse,255,goat,259,rooster,263


In [109]:
# Find column that contains'ABCDE'
bad_cols = [col for col in z_df.columns if 'ABCDE' in z_df[col].values]
print(bad_cols)

['Had_A_Good_Sleep']


  


In [110]:
# Remove bad col
z_df.drop(bad_cols, axis=1, inplace=True)

In [111]:
# Separate input variables (X) from output variables (Y) 

X = z_df.iloc[:, :8]
Y = z_df.iloc[:, 8:]

In [112]:
X.head()

Unnamed: 0,Desert,Mountain,Water,Land,Forest,Snow,Average_Temperature,Is_Raining
0,7376,4387,7769,7374,814,2677,18,Yes
1,7911,5771,4993,1855,9605,283,18,Yes
2,8948,9631,1855,4913,3531,3746,39,Yes
3,1735,6643,6066,9976,9119,9912,24,Yes
4,2001,8287,4062,8763,9751,2820,26,Yes


In [113]:
Y.head()

Unnamed: 0,Winner,Winner_Time,Second,Second_Time,Third,Third_Time
0,rat,218,dog,242,dragon,245
1,tiger,216,snake,224,rat,239
2,ox,191,rabbit,217,horse,240
3,goat,319,rooster,322,horse,334
4,horse,255,goat,259,rooster,263


In [114]:
animal_list = ['rat', 'ox', 'tiger', 'rabbit', 'dragon', 'snake', 
               'horse', 'goat', 'monkey', 'rooster', 'dog', 'pig']

top_3 = ['Winner', 'Second', 'Third']

#Generate df (one-hot encoded) indicating whether each animal (column) finishes in the top 3

Y_top3 = pd.DataFrame({animal:z_df[top_3].isin([animal]).apply(sum, axis=1) for animal in animal_list}).astype(int)

In [115]:
Y_top3.head()

Unnamed: 0,rat,ox,tiger,rabbit,dragon,snake,horse,goat,monkey,rooster,dog,pig
0,1,0,0,0,1,0,0,0,0,0,1,0
1,1,0,1,0,0,1,0,0,0,0,0,0
2,0,1,0,1,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,1,0,1,0,0
4,0,0,0,0,0,0,1,1,0,1,0,0


# Data preprocessing

## Engineer Features

In [116]:
# Total distance:
X.eval('Total_distance = Desert + Mountain + Water + Land + Forest + Snow', inplace= True)
# Convert Yes/No to 1/0
X['Is_Raining'] = X['Is_Raining'].map({'Yes':1, 'No':0}) 

# scale data
X_scaled = (X - X.min()) / (X.max() - X.min())
# X_scaled = (X - X.mean())/X.std()

In [117]:
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
import copy

In [118]:
Y.head()

Unnamed: 0,Winner,Winner_Time,Second,Second_Time,Third,Third_Time
0,rat,218,dog,242,dragon,245
1,tiger,216,snake,224,rat,239
2,ox,191,rabbit,217,horse,240
3,goat,319,rooster,322,horse,334
4,horse,255,goat,259,rooster,263


In [119]:
def to_time_df(finisher_col, time_col): 
    return pd.get_dummies(finisher_col).apply( lambda x: x*time_col)

time_map = {'Winner': 'Winner_Time',
            'Second': 'Second_Time',
            'Third': 'Third_Time'
           }

Y_time = sum([to_time_df(Y[pos_name], Y[pos_time]) for (pos_name, pos_time )in time_map.items()])[animal_list]

assert sum((Y_time > 0).sum(axis=1) != 3) ==0
Y_time.head()

Unnamed: 0,rat,ox,tiger,rabbit,dragon,snake,horse,goat,monkey,rooster,dog,pig
0,218,0,0,0,245,0,0,0,0,0,242,0
1,239,0,216,0,0,224,0,0,0,0,0,0
2,0,191,0,217,0,0,240,0,0,0,0,0
3,0,0,0,0,0,0,334,319,0,322,0,0
4,0,0,0,0,0,0,255,259,0,263,0,0


## 1st stage:
### Predict whether each animal finishes in top 3

In [120]:
#split into training and test sets
X_train, X_test, Y_top3_train, Y_top3_test, Y_time_train, Y_time_test = train_test_split(X_scaled, Y_top3.astype(int), Y_time, random_state=0)

In [121]:
classifiers = [
    KNeighborsClassifier(3),
#     SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(random_state=0),
#     AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
#     QuadraticDiscriminantAnalysis(),
    LogisticRegression()
    ]


clf_names = [clf.__class__.__name__ for clf in classifiers]

In [166]:
class model_matrix(object):
    def __init__(self, model_list, animal_list):
        '''
        Args:
        model_list: list of initialised models (regressors/ classifiers)
        animal_list: list of animals
        '''
        self.fitted = False
        
        self.animal_list = animal_list
        self.mode_list = model_list
        self.model_names = [model.__class__.__name__ for model in model_list]
        
        self.uniqname= 'mod' + str(abs(hash(frozenset(model_list))
                                  + hash(frozenset(animal_list))))
        
        def add_animal_attr(model, animal_name):
            model_ = copy.deepcopy(model)
            model_.animal = animal_name
            return model_
        
        self.model_df = pd.DataFrame([[add_animal_attr(model, animal) 
                                       for animal in animal_list] 
                                       for model in model_list],
                                       columns = animal_list,
                                       index = self.model_names)
        
    
    def helper(self, model, X, Y):
                #returns fitted classifier
                print('fitting {} to {} \n'.format(model.__class__.__name__, 
                                                   model.animal))
                #fit inplace
                model.fit(X,Y[model.animal])
    
    
    
    def train(self, X, Y, refit = False):
        
    
        if refit == True:
            #function to apply to each element (instantiated model)
            


            self.model_df.applymap(lambda model: self.helper(model, X, Y))
            self.fitted = True

            pickle.dump(self.model_df, open(self.uniqname, 'wb'))
        
        else:
            
            self.model_df= pickle.load(open(self.uniqname, 'rb'))
            
        self.fitted = True
    
    def score(self, X, Y):
        assert self.fitted== True
        return self.model_df.applymap( lambda model: model.score(X, Y[model.animal]))

        


In [124]:
clf_matrix = model_matrix(classifiers, animal_list)

clf_matrix.train(X_train, Y_top3_train, False)

In [129]:
clf_score_matrix =clf_matrix.score(X_test, Y_top3_test)

clf_score_matrix

Unnamed: 0,rat,ox,tiger,rabbit,dragon,snake,horse,goat,monkey,rooster,dog,pig
KNeighborsClassifier,0.8972,0.9364,0.9082,0.9409,0.9247,0.8777,0.9061,0.9166,0.9053,0.9475,0.9036,0.9273
DecisionTreeClassifier,0.924,0.9826,0.9193,0.9793,0.9245,0.907,0.922,0.926,0.9175,0.9578,0.9292,0.9587
RandomForestClassifier,0.9456,0.9868,0.9421,0.9854,0.9484,0.9266,0.9465,0.946,0.9438,0.9688,0.9483,0.9711
GradientBoostingClassifier,0.945,0.9851,0.9423,0.9852,0.955,0.9241,0.9459,0.9493,0.934,0.9695,0.9453,0.9667
GaussianNB,0.8443,0.9502,0.8654,0.9471,0.896,0.8157,0.863,0.8792,0.8481,0.9496,0.8519,0.882
LinearDiscriminantAnalysis,0.7804,0.9445,0.8348,0.9379,0.9125,0.7187,0.8676,0.9,0.8527,0.9441,0.7933,0.869
LogisticRegression,0.7794,0.9467,0.8354,0.9408,0.9123,0.719,0.8683,0.9002,0.8533,0.9504,0.7932,0.8683


### Compare classifiers

In [130]:
# %%debug

# clf_score_matrix=clf_score_matrix.swaplevel(axis=1)
# clf_score_matrix.reindex(sorted(clf_score_matrix.columns), axis=1)
best_clfs = clf_score_matrix.idxmax()

name_clf_dict = {clf.__class__.__name__:clf for clf in classifiers}
best_clfs.value_counts()

best_clf = name_clf_dict[best_clfs.mode().loc[0]]
print('best clf: {}'.format(best_clf.__class__.__name__))

best clf: RandomForestClassifier


## Fit Regressors

In [133]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LogisticRegression

In [142]:
regressors = [
#               GradientBoostingRegressor(),
              GradientBoostingRegressor(n_estimators=1000, max_depth=5),
#               RandomForestRegressor(), 
#               AdaBoostRegressor(),
#               LogisticRegression()
             ]



In [186]:
#For each animal, train regressors only on rows where there is a timing 

class special_rgg_matrix(model_matrix):
    def helper(self, model, X, Y):
            #returns fitted model
            print('fitting {} to {} \n'.format(model.__class__.__name__, 
                                               model.animal))
            #fit inplace
            y = Y[model.animal]
            model.fit(X[y>0],y[y>0])

    def score(self, X, Y):
            assert self.fitted== True
            return self.model_df.applymap(lambda model: 
                                          model.score(X[Y[model.animal]>0], 
                                                      Y[model.animal][Y[model.animal]>0]))

In [192]:
rgg_matrix = special_rgg_matrix(regressors, animal_list)
rgg_matrix.train(X_train, Y_time_train, True)

rgg_score_matrix = rgg_matrix.score(X_test, Y_time_test)


fitting GradientBoostingRegressor to rat 

fitting GradientBoostingRegressor to rat 

fitting GradientBoostingRegressor to ox 

fitting GradientBoostingRegressor to tiger 

fitting GradientBoostingRegressor to rabbit 

fitting GradientBoostingRegressor to dragon 

fitting GradientBoostingRegressor to snake 

fitting GradientBoostingRegressor to horse 

fitting GradientBoostingRegressor to goat 

fitting GradientBoostingRegressor to monkey 

fitting GradientBoostingRegressor to rooster 

fitting GradientBoostingRegressor to dog 

fitting GradientBoostingRegressor to pig 



In [193]:
rgg_score_matrix

Unnamed: 0,rat,ox,tiger,rabbit,dragon,snake,horse,goat,monkey,rooster,dog,pig
GradientBoostingRegressor,0.999332,0.997174,0.998598,0.996353,0.999029,0.999433,0.998579,0.998522,0.99907,0.997644,0.999231,0.997433


In [194]:
best_rggs = rgg_score_matrix.idxmax()

name_rgg_dict = {rgg.__class__.__name__:rgg for rgg in regressors}
best_rggs.value_counts()

best_rgg = name_rgg_dict[best_rggs.mode().loc[0]]
print('best rgg: {}'.format(best_rgg.__class__.__name__))

best rgg: GradientBoostingRegressor


## Predict top3 using regression
### Compare results vs using classifer technique to predict top 3

In [237]:
def pred_time(X, best_rgg):
# returns dataframe of predicted timings for all animals given input data X 
    return pd.DataFrame([rgg.predict(X) for rgg in 
                        rgg_matrix.model_df.loc[best_rgg.__class__.__name__]], \
                        index=animal_list,
                        columns=X.index).T
    



In [238]:
def pred_top3_rgg(X, best_rgg):
# returns dataframe Y of (one-hot) indicator of whether animal finished in top3
    return pred_time(X_test, best_rgg).apply(lambda row: row.nsmallest(3) ,axis=1).notnull().reindex(animal_list, axis=1)
    

def pred_top3_clf(X, best_clf):
# returns dataframe Y of (one-hot) indicator of whether animal finished in top3
    return pd.DataFrame([clf_matrix.model_df.loc[best_clf.__class__.__name__,animal]
                         .predict(X) for animal in animal_list],
                        index=animal_list,
                        columns=X.index
                       ).T

In [239]:
pred_top3_clf_df = pred_top3_clf(X_test, best_clf)
pred_top3_rgg_df= pred_top3_rgg(X_test,best_rgg).astype(int)

In [241]:
print("Accuracy in predicting whether animal finishes in top3: \n \
      Using Regression: {} \n \
      Using Classification: {} \n "
      .format((Y_top3_test == pred_top3_rgg_df).mean().mean(),
              (Y_top3_test == pred_top3_clf_df).mean().mean())
     )

Accuracy in predicting whether animal finishes in top3: 
       Using Regression: 0.8401166666666665 
       Using Classification: 0.9549499999999999 
 


## Predict top 3 finishers in order using:


### Method 1
Pure regression: regression, then sort by timings. Pick only top 3. Compare against original top3 finishers.

### Method 2
Classification to pick animals predicted to finish in top3, then sort by timings 
If < 3 finishers are predicted to be in top3, choose next fastest timing from animal not predicted in top3

In [146]:
#Generate test ranked top3 set

#Correct rankings
Y_rank_test = Y_time_test[Y_time_test>0].rank(axis=1)[animal_list]
X_rank_test = X_test

In [None]:
#Method 1

#Dataframe of predicted rankings using regression
pred_rank_rgg = pred_time(X_rank_test, best_rgg).rank(axis=1).apply(lambda row: row.nsmallest(3) ,axis=1).reindex(animal_list, axis=1)

In [157]:
#number of positions predicted correctly for each entry
correct_counts = (pred_rank_rgg==Y_rank_test).sum(axis=1).value_counts()
print(correct_counts_m1)
print("Proportion of all 3 correct using m1", correct_counts_m1[3]/correct_counts_m1.sum())

#proportion of times where all top3 rankings where predicted correctly
rank_acc_rgg = ((pred_rank_rgg==Y_rank_test).sum(axis=1)==3).mean()

0    3535
3    3009
1    2207
2    1249
dtype: int64
Proportion of all 3 correct  0.3009


In [149]:
#Method 2

def pred_rank_m2(X, best_rgg, best_clf):
    pred_rank = pred_time(X, best_rgg).rank(axis=1)
    pred_top3_clf_df = pred_top3_clf(X, best_clf)
    
    #ranking(using timings from rgg) of animals predicted by clf to finish in top 3 
    intop3_ranked = pred_rank[pred_top3_clf_df.astype(bool)]
    outtop3_ranked = pred_rank[~pred_top3_clf_df.astype(bool)] + 12
    
    return  intop3_ranked.fillna(outtop3_ranked).apply(lambda row: row.nsmallest(3) ,axis=1) \
            .reindex(animal_list, axis=1).rank(axis=1)


In [150]:
pred_rank_m2_df = pred_rank_m2(X_rank_test, best_rgg, best_clf)

In [158]:
correct_counts_m2 = (pred_rank_m2_df==Y_rank_test).sum(axis=1).value_counts()
print(correct_counts_m2)
print("Proportion of all 3 correct using m2:", correct_counts_m2[3]/correct_counts_m2.sum())

3    5328
1    2242
2    1971
0     459
dtype: int64
Proportion of all 3 correct  0.5328


In [159]:
rank_acc_m2 = ((pred_rank_m2_df==Y_rank_test).sum(axis=1)==3).mean()

In [161]:
clfs_fitted_df = pd.DataFrame([[helper(clf, animal, X_scaled, Y_top3) for clf in copy.deepcopy([best_clf])] 
                                for animal in animal_list],
                              columns= [best_clf.__class__.__name__],
                              index = animal_list)

rggs_fitted_df = pd.DataFrame([[helper_rgg(rgg, animal, X_scaled, Y_time) for rgg in copy.deepcopy([best_rgg])] 
                                for animal in animal_list]
                               ,columns= [best_rgg.__class__.__name__]
                               ,index = animal_list)



fitting RandomForestClassifier to rat 





fitting RandomForestClassifier to ox 





fitting RandomForestClassifier to tiger 





fitting RandomForestClassifier to rabbit 





fitting RandomForestClassifier to dragon 





fitting RandomForestClassifier to snake 





fitting RandomForestClassifier to horse 





fitting RandomForestClassifier to goat 





fitting RandomForestClassifier to monkey 





fitting RandomForestClassifier to rooster 





fitting RandomForestClassifier to dog 





fitting RandomForestClassifier to pig 





fitting GradientBoostingRegressor to rat 

fitting GradientBoostingRegressor to ox 

fitting GradientBoostingRegressor to tiger 

fitting GradientBoostingRegressor to rabbit 

fitting GradientBoostingRegressor to dragon 

fitting GradientBoostingRegressor to snake 

fitting GradientBoostingRegressor to horse 

fitting GradientBoostingRegressor to goat 

fitting GradientBoostingRegressor to monkey 

fitting GradientBoostingRegressor to rooster 

fitting GradientBoostingRegressor to dog 

fitting GradientBoostingRegressor to pig 



In [162]:
X_submit = pd.DataFrame([[2345, 8764, 6689, 7332, 1050, 3741, 40]], columns=X_test.columns[:7])
X_submit['Is_Raining']=0
X_submit['Total_distance']= X_submit.loc[::7].sum(axis=1)

pred_rank_m2(X_submit, best_rgg, best_clf)

Unnamed: 0,rat,ox,tiger,rabbit,dragon,snake,horse,goat,monkey,rooster,dog,pig
0,,3.0,,1.0,,,,,,,,2.0
