In [12]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [4]:
# Location of the raw dataset on the local machine.
input_file = "C:/Users/Willy/Documents/Zophia/zenpli/backend-dev-data-dataset.txt"
# Read these two columns with the generic 'object' dtype instead of letting
# pandas infer a type for them.
column_types = dict.fromkeys(['cont_10', 'disc_6'], 'object')
df = pd.read_csv(input_file, dtype=column_types)

In [5]:
# Hold out 20% of the rows as the test split; rows are shuffled with a fixed
# seed so the same partition is produced on every run.
train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

In [18]:
# Persist both splits as tab-separated files (no index column) so the model
# can be reproduced from exactly the same rows.
split_outputs = (
    (train, r"C:/Users/Willy/Documents/Zophia/zenpli/train.csv"),
    (test, r"C:/Users/Willy/Documents/Zophia/zenpli/test.csv"),
)
for split_frame, out_path in split_outputs:
    split_frame.to_csv(out_path, sep='\t', index=False)

In [28]:
# Names of the feature columns fed to the model.
cols = [
    'cont_3',
    'cont_4',
    'disc_5',
    'disc_6',
    'cat_7',
    'cont_9',
]

In [29]:
# Training set: feature matrix X (the selected columns) and target y ('cat_8').
X, y = train[cols], train['cat_8']

In [30]:
# Test-set feature matrix: the same feature columns (cols), taken from the
# held-out `test` split.
X_test = test[cols]

In [23]:
# One-hot encoder used for the categorical features.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see while fitting, instead of raising. Without it, a rare level
# that is absent from a training fold but present in the validation fold (or in
# the held-out test split) makes transform() fail during cross-validation.
ohe = OneHotEncoder(handle_unknown='ignore')

In [24]:
# Column transformer for the pipeline: the listed columns go through the
# one-hot encoder, every remaining column is passed through unchanged.
encoded_cols = ['disc_5', 'disc_6', 'cat_7']
ct = make_column_transformer((ohe, encoded_cols), remainder='passthrough')

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Random forest classifier: tree depth capped at 2, fixed seed for repeatable fits.
rf = RandomForestClassifier(
    random_state=0,
    max_depth=2,
)

In [31]:
# Chain the column transformer and the random forest into one pipeline, fit it
# on the training split (fit returns the pipeline itself), then show the
# predictions for the held-out features.
pipe1 = make_pipeline(ct, rf).fit(X, y)
pipe1.predict(X_test)

array(['happy', 'happy', 'happy', ..., 'happy', 'happy', 'happy'],
      dtype=object)

In [32]:
# Mean accuracy of pipe1 over a 10-fold cross-validation on the training split.
from sklearn.model_selection import cross_val_score
fold_accuracy = cross_val_score(pipe1, X, y, scoring='accuracy', cv=10)
fold_accuracy.mean()

0.4925

## Hyperparameters

In [40]:
# Search space for the random forest step of the pipeline. Keys use the
# "<step name>__<parameter>" form. Only the split criterion is varied;
# oob_score has a single candidate value.
params1 = {
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__oob_score': [True],
}
params1

{'randomforestclassifier__criterion': ['gini', 'entropy'],
 'randomforestclassifier__oob_score': [True]}

In [41]:
# Exhaustive search over params1 for pipe1, scored by accuracy with 10-fold
# cross-validation on the training split.
from sklearn.model_selection import GridSearchCV
grid1 = GridSearchCV(
    estimator=pipe1,
    param_grid=params1,
    scoring='accuracy',
    cv=10,
)
grid1.fit(X, y)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         ['disc_5',
                                                                          'disc_6',
                                                                          'cat_7'])])),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(max_depth=2,
                                                               random_state=0))]),
             param_grid={'randomforestclassifier__criterion': ['gini',
                                                               'entropy'],
                         'ran

In [42]:
# Tabulate the grid search's cross-validation results, one row per candidate.
cv_summary = grid1.cv_results_
results1 = pd.DataFrame(cv_summary)
results1


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestclassifier__criterion,param_randomforestclassifier__oob_score,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,33.342239,0.823368,0.718284,0.05459,gini,True,"{'randomforestclassifier__criterion': 'gini', ...",0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.0,1
1,34.9826,0.749923,0.741824,0.023483,entropy,True,{'randomforestclassifier__criterion': 'entropy...,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.0,1


In [43]:
# Grid-search results for the random forest, ordered by rank (best first).
results1.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestclassifier__criterion,param_randomforestclassifier__oob_score,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,33.342239,0.823368,0.718284,0.05459,gini,True,"{'randomforestclassifier__criterion': 'gini', ...",0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.0,1
1,34.9826,0.749923,0.741824,0.023483,entropy,True,{'randomforestclassifier__criterion': 'entropy...,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.4925,0.0,1


In [44]:
# Best score found by the grid search (scoring='accuracy', cv=10).
grid1.best_score_

0.4925

In [45]:
# Parameter combination the grid search selected as best.
# NOTE(review): both candidates share rank 1 in the results table, so this
# choice reflects a tie rather than a real difference in score.
grid1.best_params_

{'randomforestclassifier__criterion': 'gini',
 'randomforestclassifier__oob_score': True}

In [49]:
# Count how often each class is predicted for the held-out features.
test_predictions = grid1.predict(X_test)
pd.DataFrame(test_predictions).value_counts()

happy    200000
dtype: int64