In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

import scipy.stats as st

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
gender_submission = pd.read_csv('./data/gender_submission.csv')

In [3]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
def transform_dataset(df):
    df_Embarked = pd.get_dummies(df['Embarked'], prefix = 'Embarked')
    df_Sex = pd.get_dummies(df['Sex'], prefix = 'Sex')
    df_pclass = pd.get_dummies(df['Pclass'], prefix = 'Pclass')
    df["Deck"] = df["Cabin"].str.slice(0,1)
    df["Room"] = df["Cabin"].str.slice(1,5).str.extract("([0-9]+)", expand=False).astype("float")
    df["Deck"] = df["Deck"].fillna("N")
    df["Room"] = df["Room"].fillna(df["Room"].mean())
    df_deck = pd.get_dummies(df['Deck'], prefix = 'Deck')
    output = pd.concat([df,df_Embarked,df_Sex,df_deck], axis = 1)
    output.drop(columns = ['Embarked','Sex','Pclass','Deck','Cabin'], inplace = True)
    output['Title'] =output.Name.apply(lambda x:x.split(',')[1].split('.')[0])
    output.drop(columns = ['Title','Name'], inplace = True)
    return output

In [5]:
train = transform_dataset(train)

In [6]:
train.groupby('Ticket').agg('count')

Unnamed: 0_level_0,PassengerId,Survived,Age,SibSp,Parch,Fare,Room,Embarked_C,Embarked_Q,Embarked_S,...,Sex_male,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_N,Deck_T
Ticket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
110152,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
110413,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
110465,2,2,1,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
110564,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
110813,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
111240,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
111320,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
111361,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
111369,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
111426,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [7]:
train[train['Ticket']=='110465']

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Ticket,Fare,Room,Embarked_C,Embarked_Q,...,Sex_male,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_N,Deck_T
110,111,0,47.0,0,0,110465,52.0,110.0,0,0,...,1,0,0,1,0,0,0,0,0,0
475,476,0,,0,0,110465,52.0,14.0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [8]:
train.drop(columns = 'Ticket',inplace= True)
train.dropna(inplace=True)
train_X = train.loc[:,"Age":]
train_Y = train.loc[:,"Survived"]

In [9]:
# random forest model creation
rfc = RandomForestClassifier()
rfc.fit(train_X,train_Y)
# predictions
rfc_predict = rfc.predict(train_X)



In [11]:
rfc_cv_score = cross_val_score(rfc, train_X,train_Y, cv=10)

In [12]:
print("=== Confusion Matrix ===")
print(confusion_matrix(train_Y, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(train_Y, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[418   6]
 [ 11 279]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       424
           1       0.98      0.96      0.97       290

    accuracy                           0.98       714
   macro avg       0.98      0.97      0.98       714
weighted avg       0.98      0.98      0.98       714



=== All AUC Scores ===
[0.73611111 0.72222222 0.73611111 0.90277778 0.74647887 0.81690141
 0.78873239 0.77464789 0.77464789 0.83098592]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7829616588419406


In [24]:
n_estimators = [1, 2, 3, 4, 5, 6, 7, 8, 9 ,10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 25, 30]
min_samples_split = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 100]
min_samples_leaf = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(rfc, hyperF, cv = 3, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(train_X, train_Y)

Fitting 3 folds for each of 28600 candidates, totalling 85800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 1680 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 5680 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 11280 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 18480 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 27280 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 37680 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 49680 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 63280 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 78480 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 85800 out of 85800 | elapsed:  6.2min finished


In [25]:
bestF_predict  = bestF.predict(train_X)

In [26]:
print("=== Confusion Matrix ===")
print(confusion_matrix(train_Y, bestF_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(train_Y, bestF_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[383  41]
 [ 57 233]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.87      0.90      0.89       424
           1       0.85      0.80      0.83       290

    accuracy                           0.86       714
   macro avg       0.86      0.85      0.86       714
weighted avg       0.86      0.86      0.86       714



=== All AUC Scores ===
[0.73611111 0.72222222 0.73611111 0.90277778 0.74647887 0.81690141
 0.78873239 0.77464789 0.77464789 0.83098592]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7829616588419406
