In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [2]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
gender = pd.read_csv('gender_submission.csv')

### The sum of the null values in each column.

In [3]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [5]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')    

In [6]:
train.iloc[:,5:6] = imputer.fit_transform(train.iloc[:,5:6])
test.iloc[:,4:5] = imputer.fit_transform(test.iloc[:,4:5])
test[['Fare']] = imputer.fit_transform(test[['Fare']])

### After imputer

In [7]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [9]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
sexTrain = train.iloc[:,4:5]
sexTrain = ohe.fit_transform(sexTrain).toarray()
sexTrain = pd.DataFrame(data = sexTrain, index = range(len(sexTrain)), columns = ['Female', 'Male'])

In [10]:
sexTest = test.iloc[:,3:4]
sexTest = ohe.fit_transform(sexTest).toarray()
sexTest = pd.DataFrame(data = sexTest, index = range(len(sexTest)), columns = ['Female', 'Male'])

In [11]:
x_train = pd.concat([train[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']], sexTrain['Female']], axis = 1)
x_test = pd.concat([test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']], sexTest['Female']], axis = 1)


In [12]:
y_train = pd.concat([train['Survived']])
y_test = gender.iloc[:,1:]


### Model selection

In [13]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
y_pred_xgb = xgb.predict(x_test)

cm = confusion_matrix(y_test, y_pred_xgb)
print('XGB CM\n', cm)

crossVal= cross_val_score(estimator = xgb, X = x_train, y = y_train, cv = 4)
print('XGB Accuracy: ', crossVal.mean())
print('XGB Std: ', crossVal.std())

XGB CM
 [[234  32]
 [ 40 112]]
XGB Accuracy:  0.8125681735547207
XGB Std:  0.015019315488846188


In [14]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state = 0)
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print('LogisticRegression CM\n', cm)

crossVal = cross_val_score(estimator = log_reg, X = x_train, y = y_train, cv = 4)
print('Logistic Regression Accuracy: ', crossVal.mean())
print('Logistic Regression Std: ', crossVal.std())

LogisticRegression CM
 [[253  13]
 [ 10 142]]
Logistic Regression Accuracy:  0.7901415989981012
Logistic Regression Std:  0.016677369878150876


In [15]:
from sklearn.svm import SVC
svc = SVC(kernel = 'linear')
#svc = SVC(kernel = 'poly')
#svc = SVC(kernel = 'rbf')
#svc = SVC(kernel = 'precomputed')
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print('SVC CM\n', cm)

crossVal = cross_val_score(estimator = svc, X = x_train, y = y_train, cv = 4)
print('SVC Accuracy: ', crossVal.mean())
print('SVC Std: ', crossVal.std())


SVC CM
 [[266   0]
 [  0 152]]
SVC Accuracy:  0.786748070940896
SVC Std:  0.017323976251949758


In [16]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print('GaussianNB CM\n', cm)

crossVal = cross_val_score(estimator = gnb, X = x_train, y = y_train, cv = 4)
print('GaussianNB Accuracy: ', crossVal.mean())
print('GaussianNB Std: ', crossVal.std())

GaussianNB CM
 [[243  23]
 [  6 146]]
GaussianNB Accuracy:  0.7822789156869874
GaussianNB Std:  0.011431935998370984


In [17]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_pred = mnb.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print('MultinomialNB CM\n', cm)

crossVal = cross_val_score(estimator = mnb, X = x_train, y = y_train, cv = 4)
print('MultinomialNB Accuracy: ', crossVal.mean())
print('MultinomialNB Std: ', crossVal.std())

MultinomialNB CM
 [[212  54]
 [ 89  63]]
MultinomialNB Accuracy:  0.6947592211045125
MultinomialNB Std:  0.05699698776274929


In [18]:
from sklearn.naive_bayes import ComplementNB
cnb = ComplementNB()
cnb.fit(x_train, y_train)
y_pred = cnb.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print('ComplementNB CM\n', cm)

crossVal = cross_val_score(estimator = cnb, X = x_train, y = y_train, cv = 4)
print('ComplementNB Accuracy: ', crossVal.mean())
print('ComplementNB Std: ', crossVal.std())


ComplementNB CM
 [[210  56]
 [ 88  64]]
ComplementNB Accuracy:  0.6958802973376964
ComplementNB Std:  0.05782394130892117


In [19]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(x_train, y_train)
y_pred = bnb.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print('BernoulliNB CM\n', cm)

crossVal = cross_val_score(estimator = bnb, X = x_train, y = y_train, cv = 4)
print('BernoulliNB Accuracy: ', crossVal.mean())
print('BernoulliNB Std: ', crossVal.std())


BernoulliNB CM
 [[266   0]
 [  0 152]]
BernoulliNB Accuracy:  0.786748070940896
BernoulliNB Std:  0.017323976251949758


In [20]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 2, metric = 'minkowski')
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print('KNeighborsClassifier CM\n', cm)

crossVal = cross_val_score(estimator = knn, X = x_train, y = y_train, cv = 4)
print('KNeighborsClassifier Accuracy: ', crossVal.mean())
print('KNeighborsClassifier Std: ', crossVal.std())

KNeighborsClassifier CM
 [[229  37]
 [102  50]]
KNeighborsClassifier Accuracy:  0.6812659071627681
KNeighborsClassifier Std:  0.041514905128271086


In [21]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print('RandomForestClassifier CM\n', cm)

crossVal = cross_val_score(estimator = rfc, X = x_train, y = y_train, cv = 4)
print('RandomForestClassifier Accuracy: ', crossVal.mean())
print('RandomForestClassifier Std: ', crossVal.std())

RandomForestClassifier CM
 [[230  36]
 [ 52 100]]
RandomForestClassifier Accuracy:  0.8058568658344443
RandomForestClassifier Std:  0.03852849866142551


### Model optimization

In [22]:
params = [{'learning_rate':[0.1,0.01],
           'colsample_bytree':[1,3],
           'gamma':[0,1],
           'reg_alpha':[3,4,8],
           'reg_lambda':[1,13,15],
           'n_estimators':[200,500],
           'missing':[False, True],
           'subsample':[1,2],
           'base_score':[0.2,0.8]
           }
]

from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(estimator = xgb,
                  param_grid = params,
                  scoring = 'accuracy',
                  cv = 10,
                  n_jobs = -1)

grid_search = gs.fit(x_train, y_train)
best_result = grid_search.best_score_
best_params = grid_search.best_params_
print('Best_Result', best_result)
print('Best_Params', best_params)

Best_Result 0.8361922596754058
Best_Params {'base_score': 0.8, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'missing': False, 'n_estimators': 500, 'reg_alpha': 3, 'reg_lambda': 13, 'subsample': 1}


In [23]:
from xgboost import XGBClassifier
xgb = XGBClassifier(base_score=0.8, colsample_bylevel = 1, learning_rate=0.1, missing=False, n_estimators=500,
       objective='binary:logistic', reg_alpha=3, reg_lambda=13, subsample = 1, gamma = 0)

xgb.fit(x_train, y_train)
y_pred_xgb = xgb.predict(x_test)

print('y_test', y_test.shape, '\ny_pred_xgb', y_pred_xgb.shape)
cm = confusion_matrix(y_test, y_pred_xgb)
print('XGB CM\n', cm)

y_test (418, 1) 
y_pred_xgb (418,)
XGB CM
 [[253  13]
 [ 24 128]]


In [24]:
passengerId = gender.iloc[:,0:1]
y_pred_xgb = pd.DataFrame(data = y_pred_xgb, index = range(len(y_pred_xgb)))
submit = pd.concat([passengerId, y_pred_xgb], axis = 1)

submit.to_csv(r'submission.csv', index=False)