In [843]:
# Load libs
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [844]:
df=pd.read_csv('train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [845]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [846]:
# special treatment of nans for Age due to dependence upon SibSp
df['Age'].replace(np.nan, -1,inplace=True)
ssm_0=df.Age[(df.Age!=-1)&(df.SibSp==0)].mean()
ssm_1=df.Age[(df.Age!=-1)&(df.SibSp==1)].mean()
ssm_2=df.Age[(df.Age!=-1)&(df.SibSp==2)].mean()
ssm_3=df.Age[(df.Age!=-1)&(df.SibSp==3)].mean()
ssm_4=df.Age[(df.Age!=-1)&(df.SibSp==4)].mean()
ssm_5=df.Age[(df.Age!=-1)&(df.SibSp==5)].mean()
ssm_0,ssm_1,ssm_2,ssm_3,ssm_4,ssm_5

(31.39755838641189,
 30.089726775956283,
 22.62,
 13.916666666666666,
 7.055555555555555,
 10.2)

In [847]:
df.Age[(df.Age==-1)&(df.SibSp==0)]=ssm_0
df.Age[(df.Age==-1)&(df.SibSp==1)]=ssm_1
df.Age[(df.Age==-1)&(df.SibSp==2)]=ssm_2
df.Age[(df.Age==-1)&(df.SibSp==3)]=ssm_3
df.Age[(df.Age==-1)&(df.SibSp==4)]=ssm_4
df.Age[(df.Age==-1)&(df.SibSp==5)]=ssm_5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Age[(df.Age==-1)&(df.SibSp==0)]=ssm_0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Age[(df.Age==-1)&(df.SibSp==1)]=ssm_1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Age[(df.Age==-1)&(df.SibSp==2)]=ssm_2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Age[(df.Age==-1)&(df.SibSp==3)]=ssm_3
A va

In [848]:
# deleting non-informative columns, coding categorical columns, fillna
df=df.drop(columns=['PassengerId','Name','Cabin','Ticket'],axis=1)
df.Sex.replace(to_replace=['male','female'],value=[0,1], inplace=True)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().index[0])
df.Embarked.replace(to_replace=['S','C','Q'],value=[0,1,2], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


In [849]:
# perturbation of the dataset before modelling
dfr=pd.DataFrame(np.random.permutation(df.values),columns=df.columns)

In [850]:
# defining x and y
x=dfr.drop(columns=['Survived'],axis=1)
y=dfr[['Survived']]

In [851]:
# splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)

In [852]:
# scaling data for x, train and test (keeping logics of fitting and transforming)
scaler=StandardScaler()
x_all=scaler.fit(x)
xx=x_all.transform(x)
X_tr=scaler.fit(X_train)
X_tr_sc=X_tr.transform(X_train)
X_t_sc=X_tr.transform(X_test)

In [853]:
# since we work with the classification problem, the classifiers were selected for modelling
# set of different models to choose the best ones for further experiments
# I've chosen 4 models (SVM, GBC, XGB and the simplest one - LR)
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
models.append(('CART', DecisionTreeClassifier()))
models.append(('GBC', GradientBoostingClassifier()))
models.append(('XGB', xgb.XGBClassifier()))

results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, xx, np.ravel(y), cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %.2f (%.2f)' % (name, cv_results.mean(), cv_results.std()))

LR: 0.80 (0.01)
LDA: 0.79 (0.01)
KNN: 0.80 (0.03)
NB: 0.79 (0.02)
SVM: 0.82 (0.02)
CART: 0.78 (0.01)
GBC: 0.82 (0.02)
XGB: 0.81 (0.02)


In [854]:
# XGB Hyperparater tuning by utilizing RandomizedSearchCV (a faster option)
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
gbm = xgb.XGBClassifier()
gbm_param_grid_RG = {
    'colsample_bytree': [0.3, 0.7, 1.0],
    'n_estimators': [150,200,250,300],
    'max_depth': range(2, 11)}
randomized_gbm = RandomizedSearchCV(param_distributions=gbm_param_grid_RG, estimator=gbm, scoring="accuracy", n_iter=10, cv=5, verbose=1)
randomized_gbm.fit(x, y)
print("Best parameters found: ", randomized_gbm.best_params_)
print("Best accuracy: ", randomized_gbm.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'n_estimators': 250, 'max_depth': 2, 'colsample_bytree': 0.3}
Best accuracy:  0.8170547988199107


In [855]:
# XGB - print accuracy, confusion matrix & classification report for test data
xgb_params=randomized_gbm.best_params_
xgb_opt=xgb.XGBClassifier(**xgb_params)
xgb_opt.fit(X_train,y_train)
predictions0=xgb_opt.predict(X_test)

print("accuracy_score")
print('%.2f' % accuracy_score(np.ravel(y_test), predictions0))
print("________________________________________________________")
print("confusion_matrix")
print(confusion_matrix(np.ravel(y_test), predictions0))
print("________________________________________________________")
print("classification_report")
print(classification_report(np.ravel(y_test), predictions0))

accuracy_score
0.81
________________________________________________________
confusion_matrix
[[105  13]
 [ 21  40]]
________________________________________________________
classification_report
              precision    recall  f1-score   support

         0.0       0.83      0.89      0.86       118
         1.0       0.75      0.66      0.70        61

    accuracy                           0.81       179
   macro avg       0.79      0.77      0.78       179
weighted avg       0.81      0.81      0.81       179



In [856]:
# CVM Hyperparater tuning by utilizing RandomizedSearchCV 
svm=SVC(gamma='auto')
svm_param_grid = {'C': [50, 10, 1.0, 0.1, 0.01], 
              'gamma': ['scale'],
              'kernel': ['linear','poly', 'rbf', 'sigmoid']} 
randomized_svm = RandomizedSearchCV(param_distributions=svm_param_grid, estimator=svm, scoring="accuracy", n_iter=10, cv=5, verbose=1)
randomized_svm.fit(xx,np.ravel(y))
print("Best parameters found: ", randomized_svm.best_params_)
print("Lowest score: ", randomized_svm.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'kernel': 'rbf', 'gamma': 'scale', 'C': 1.0}
Lowest score:  0.8248948590797817


In [857]:
# the best parameters after tuning
svm_pars=grid1.best_params_
svm_pars

{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}

In [858]:
# fitting the model with the optimal parameters
svm_opt=SVC(**svm_pars)
svm_opt.fit(X_tr_sc, np.ravel(y_train))
predictions1=svm_opt.predict(X_t_sc)

In [859]:
# print accuracy, confusion matrix & classification report for test data
print("accuracy_score")
print('%.2f' % accuracy_score(np.ravel(y_test), predictions1))
print("________________________________________________________")
print("confusion_matrix")
print(confusion_matrix(np.ravel(y_test), predictions1))
print("________________________________________________________")
print("classification_report")
print(classification_report(np.ravel(y_test), predictions1))


accuracy_score
0.84
________________________________________________________
confusion_matrix
[[112   6]
 [ 22  39]]
________________________________________________________
classification_report
              precision    recall  f1-score   support

         0.0       0.84      0.95      0.89       118
         1.0       0.87      0.64      0.74        61

    accuracy                           0.84       179
   macro avg       0.85      0.79      0.81       179
weighted avg       0.85      0.84      0.84       179



In [860]:
# LGR Hyperparater tuning by utilizing RandomizedSearchCV 
lgr=LogisticRegression(solver='liblinear', multi_class='ovr')
lgr_param_grid = {'solver' : ['newton-cg', 'lbfgs', 'liblinear'], 
              'penalty' : ['l2'],
              'C' : [100, 10, 1.0, 0.1, 0.01]} 
randomized_lgr = RandomizedSearchCV(param_distributions=lgr_param_grid, estimator=lgr, scoring="accuracy", n_iter=10, cv=5, verbose=1)
randomized_lgr.fit(xx,np.ravel(y))
print("Best parameters found: ", randomized_lgr.best_params_)
print("Best score: ", randomized_lgr.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.1}
Best score:  0.8035465444730401


In [868]:
# fitting the model with the optimal parameters
lgr_opt=LogisticRegression(C=0.1, solver='newton-cg',penalty='l2')
lgr_opt.fit(X_tr_sc, np.ravel(y_train))
predictions2=lgr_opt.predict(X_t_sc)

In [869]:
# print accuracy, confusion matrix & classification report for test data
print("accuracy_score")
print('%.2f' % accuracy_score(np.ravel(y_test), predictions2))
print("________________________________________________________")
print("confusion_matrix")
print(confusion_matrix(np.ravel(y_test), predictions2))
print("________________________________________________________")
print("classification_report")
print(classification_report(np.ravel(y_test), predictions2))

accuracy_score
0.83
________________________________________________________
confusion_matrix
[[102  16]
 [ 15  46]]
________________________________________________________
classification_report
              precision    recall  f1-score   support

         0.0       0.87      0.86      0.87       118
         1.0       0.74      0.75      0.75        61

    accuracy                           0.83       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.83      0.83      0.83       179



In [863]:
# SGBC Hyperparater tuning by utilizing RandomizedSearchCV 
from scipy.stats import loguniform
n_estimators = [5,20,50,100,150,200,300] # number of trees in the random forest
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
max_leaf_nodes=[2, 5, 10, 20, 50, 100]

gbc_random_grid = {'n_estimators': n_estimators,
'max_depth': max_depth,
'max_leaf_nodes':max_leaf_nodes,
'learning_rate': loguniform(0.01, 1)}

print ('Random grid: ', gbc_random_grid, '\n')

Random grid:  {'n_estimators': [5, 20, 50, 100, 150, 200, 300], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'max_leaf_nodes': [2, 5, 10, 20, 50, 100], 'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001B5DD506DA0>} 



In [864]:
# retrieving the best score and the optimal parameters
gbc=GradientBoostingClassifier()
gbc_random = RandomizedSearchCV(estimator = gbc,param_distributions = gbc_random_grid,
               n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1, scoring="accuracy")
gbc_random.fit(x, np.ravel(y))
print("Best parameters found: ", gbc_random.best_params_)
print("Best score: ", gbc_random.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'learning_rate': 0.19687245414460147, 'max_depth': 50, 'max_leaf_nodes': 20, 'n_estimators': 5}
Best score:  0.8383717280773334


In [865]:
# fitting the model with the optimal parameters
gbc_opt_params=gbc_random.best_params_
gbc_opt = GradientBoostingClassifier(**gbc_opt_params)
gbc_opt.fit(X_train,np.ravel(y_train))
predictions3=gbc_opt.predict(X_test)

In [866]:
# print accuracy, confusion matrix & classification report for test data
print("accuracy_score")
print('%.2f' % accuracy_score(np.ravel(y_test), predictions3))
print("________________________________________________________")
print("confusion_matrix")
print(confusion_matrix(np.ravel(y_test), predictions3))
print("________________________________________________________")
print("classification_report")
print(classification_report(np.ravel(y_test), predictions3))

accuracy_score
0.81
________________________________________________________
confusion_matrix
[[103  15]
 [ 19  42]]
________________________________________________________
classification_report
              precision    recall  f1-score   support

         0.0       0.84      0.87      0.86       118
         1.0       0.74      0.69      0.71        61

    accuracy                           0.81       179
   macro avg       0.79      0.78      0.79       179
weighted avg       0.81      0.81      0.81       179



In [867]:
# SVM, GBC and XGB - the best set of models to achieve the highest score for the testing dataset