In [1]:
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

In [2]:
# == Read raw data
titanic_train = pd.read_csv("../Data/train.csv")
titanic_test = pd.read_csv("../Data/test.csv")

# Now we only consider the train data set of titanic data set
X = titanic_train.drop('Survived', axis = 1)
y = titanic_train.Survived

In [3]:
# == Preprocessing of the modeling data
# drop columns don't go into model
X_model = X.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1)

# transfer object variables to dummies
X_dummy = pd.get_dummies(X_model, drop_first=True)
print(X.shape)
print(X_dummy.shape)

# NA manipulation
X_dummy_clean = X_dummy.dropna()
print(X_dummy.columns)
print(X_dummy_clean.shape)

# split the data to train and test
X_train, X_test, y_train, y_test = train_test_split(X_dummy_clean, y[X_dummy_clean.index], test_size = 0.2, random_state = 2333)

(891, 11)
(891, 8)
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')
(714, 8)


In [4]:
# == Linear Regression
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
linear_pred = linear_reg.predict(X_test)

print(linear_reg.score(X_test, y_test))
# plt.hist(linear_pred)

0.48834415378484247


In [5]:
# == Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_pred = log_reg.predict(X_test)

print(log_reg.score(X_test, y_test))
print(confusion_matrix(y_test, log_pred))
# plt.hist(log_pred)

0.8181818181818182
[[70 11]
 [15 47]]


In [6]:
# -- Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

steps = [('scaler', StandardScaler()), 
         ('logistic', LogisticRegression())]
pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)
pip_pred = pipeline.predict(X_test)

print(pipeline.score(X_test, y_test))
print(confusion_matrix(y_test, pip_pred))

0.8251748251748252
[[70 11]
 [14 48]]


In [10]:
# -- Tunning parameters
from sklearn.model_selection import GridSearchCV

log_param = {'logistic__C' : np.logspace(-2, 3, 50),
             'logistic__penalty' : ['l1', 'l2']}

cv = GridSearchCV(estimator = pipeline, param_grid = log_param, cv = 5, scoring = 'accuracy')

cv.fit(X_train, y_train)
cv_pred = cv.predict(X_test)

print(cv.best_params_)
print(cv.score(X_test, y_test))
print(confusion_matrix(y_test, cv_pred))

{'logistic__C': 0.8685113737513529, 'logistic__penalty': 'l2'}
0.8251748251748252
[[70 11]
 [14 48]]


In [7]:
# == KNN
from sklearn.neighbors import KNeighborsClassifier

steps = [('scaler', StandardScaler()),
         ('knn', KNeighborsClassifier())]

pipeline = Pipeline(steps)
knn_param = {'knn__n_neighbors' : np.arange(1, 11)}

cv = GridSearchCV(estimator = pipeline, param_grid = knn_param, cv = 5, scoring = 'accuracy')

cv.fit(X_train, y_train)
cv_pred = cv.predict(X_test)

print(cv.best_params_)
print(cv.score(X_test, y_test))
print(confusion_matrix(y_test, cv_pred))

{'knn__n_neighbors': 7}
0.8391608391608392
[[74  7]
 [16 46]]


In [18]:
# == SVM
# -- Linear SVM
from sklearn.svm import SVC

linear_svm = SVC(kernel = 'linear')
linear_svm.fit(X_train, y_train)
linear_svm_pred = linear_svm.predict(X_test)

print(linear_svm.score(X_test, y_test))
print(confusion_matrix(y_test, linear_svm_pred))

0.7972027972027972
[[69 12]
 [17 45]]


In [44]:
# -- Properties of SVM, only support vectors would affect the model train
svm_small = SVC(kernel = 'linear')
X_small = X_train.reindex(linear_svm.support_).dropna()
y_small = y_train.reindex(linear_svm.support_).dropna()
print('Original data size is ', X_train.shape, " and support data size is ", X_small.shape)
svm_small.fit(X_small, y_small)
svm_small_pred = svm_small.predict(X_test)

print(svm_small.score(X_test, y_test))
print(confusion_matrix(y_test, svm_small_pred))

Original data size is  (571, 8)  and support data size is  (161, 8)
0.7972027972027972
[[69 12]
 [17 45]]


In [45]:
# -- SVM with RBF kernel
steps = [('scaler', StandardScaler()),
         ('svm', SVC(kernel = 'rbf'))]

pipeline = Pipeline(steps)
svm_para = {'svm__C':[0.1, 1, 10],
            'svm__gamma' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]}

cv = GridSearchCV(estimator = pipeline, param_grid = svm_para, cv = 5)

cv.fit(X_train, y_train)
cv_pred = cv.predict(X_test)

print(cv.best_params_)
print(cv.score(X_test, y_test))
print(confusion_matrix(y_test, cv_pred))

{'svm__C': 10, 'svm__gamma': 0.1}
0.8251748251748252
[[76  5]
 [20 42]]
