In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [2]:
# Load the passenger table and discard the free-text Name column.
# The result is rebound instead of mutated in place so the cell is a single
# expression chain.
dataset = pd.read_csv('titanic.csv').drop(['Name'], axis=1)

# Pairwise correlations between the numeric columns only. The frame still
# contains the string-valued Sex column, and calling corr() directly on mixed
# dtypes raises in pandas >= 2.0; selecting the numeric columns first gives the
# same table on old and new pandas alike.
dataset.select_dtypes(include=[np.number]).corr()

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
Survived,1.0,-0.336528,-0.059665,-0.037082,0.080097,0.256179
Pclass,-0.336528,1.0,-0.391492,0.085026,0.020252,-0.548919
Age,-0.059665,-0.391492,1.0,-0.297669,-0.193741,0.112329
Siblings/Spouses Aboard,-0.037082,0.085026,-0.297669,1.0,0.414244,0.158839
Parents/Children Aboard,0.080097,0.020252,-0.193741,0.414244,1.0,0.21547
Fare,0.256179,-0.548919,0.112329,0.158839,0.21547,1.0


In [3]:
# Keep only the modelling columns: six predictors followed by the target.
feature_columns = [
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
    'Age',
    'Fare',
    'Pclass',
    'Sex',
]
dataset = dataset[feature_columns + ['Survived']]

# X: the six predictor columns as one array; y: the Survived column as a flat
# 1-D array.
X = dataset[feature_columns].values
y = dataset['Survived'].values
dataset.head()

Unnamed: 0,Siblings/Spouses Aboard,Parents/Children Aboard,Age,Fare,Pclass,Sex,Survived
0,1,0,22.0,7.25,3,male,0
1,1,0,38.0,71.2833,1,female,1
2,0,0,26.0,7.925,3,female,1
3,1,0,35.0,53.1,1,female,1
4,0,0,35.0,8.05,3,male,0


In [4]:
# Sanity check: (rows, columns) of the raw feature matrix before encoding.
X.shape

(887L, 6L)

In [5]:
# Standardise the four numeric columns (0-3: siblings/spouses,
# parents/children, age, fare) in place.
# NOTE(review): the scaler is fitted on every row before the train/test split
# that happens later in the notebook, so test-set statistics influence the
# training features. Fitting on the training rows only would avoid that.
standard_scaler = StandardScaler()
X[:, 0:4] = standard_scaler.fit_transform(X[:, 0:4])

# Column 5 (Sex) holds strings; map it to integer codes.
label_encoder = LabelEncoder()
X[:, 5] = label_encoder.fit_transform(X[:, 5])

# One-hot encode Pclass (column 4) and the encoded Sex (column 5).
# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22, so the two columns are sliced out and
# encoded explicitly, then stacked in front of the scaled numeric columns.
# This reproduces the layout the old argument gave: encoded columns first,
# untouched columns after.
one_hot_encoder = OneHotEncoder()
encoded = one_hot_encoder.fit_transform(X[:, 4:6].astype(int)).toarray()
X = np.hstack([encoded, X[:, 0:4].astype(np.float64)])

X.shape



(887L, 9L)

In [6]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0
)

In [7]:
from sklearn.linear_model import LogisticRegression

# Logistic regression: fit on the training split (fit returns the estimator
# itself), then predict the held-out rows.
log_clf = LogisticRegression(random_state=0).fit(X_train, y_train)
log_pred = log_clf.predict(X_test)

In [8]:
# Display the held-out labels produced by the train/test split.
y_test

array([0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0], dtype=int64)

In [9]:
# Confusion matrix for logistic regression on the test split
# (rows: true class, columns: predicted class).
cf = confusion_matrix(y_true=y_test, y_pred=log_pred)
cf

array([[123,  19],
       [ 25,  55]], dtype=int64)

In [10]:
from sklearn.svm import SVC

# Support-vector classifier with library-default hyper-parameters: fit on the
# training split, then predict the held-out rows.
svc_clf = SVC().fit(X_train, y_train)
svc_pred = svc_clf.predict(X_test)

In [11]:
# Confusion matrix for the default SVC on the test split
# (rows: true class, columns: predicted class).
cf = confusion_matrix(y_true=y_test, y_pred=svc_pred)
cf

array([[122,  20],
       [ 23,  57]], dtype=int64)

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree: fit on the training split, then predict the held-out rows.
# The tree is seeded because tree construction is randomised (features are
# permuted at each split); without a seed the predictions, and every score
# derived from them, can change between runs. random_state=0 matches the seed
# used by the other seeded estimators in this notebook.
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

In [13]:
# Confusion matrix for the decision tree on the test split
# (rows: true class, columns: predicted class).
cf = confusion_matrix(y_true=y_test, y_pred=dt_pred)
cf

array([[118,  24],
       [ 23,  57]], dtype=int64)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 40 trees: fit on the training split, then predict the
# held-out rows.
# The forest is seeded because bootstrapping and feature sampling are random;
# without a seed the confusion matrix and cross-validation scores are not
# reproducible. random_state=0 matches the seed used by the other seeded
# estimators in this notebook.
rf_clf = RandomForestClassifier(n_estimators=40, random_state=0)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)

In [15]:
# Confusion matrix for the random forest on the test split
# (rows: true class, columns: predicted class).
cf = confusion_matrix(y_true=y_test, y_pred=rf_pred)
cf

array([[127,  15],
       [ 24,  56]], dtype=int64)

In [16]:
# Recomputes the random-forest confusion matrix; this repeats the evaluation
# done in the preceding cell on the same predictions.
cf = confusion_matrix(y_true=y_test, y_pred=rf_pred)
cf

array([[127,  15],
       [ 24,  56]], dtype=int64)

In [17]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with a fixed seed: fit on the training split (fit returns the
# estimator itself), then predict the held-out rows.
ada_clf = AdaBoostClassifier(random_state=0).fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)

In [18]:
# Confusion matrix for AdaBoost on the test split
# (rows: true class, columns: predicted class).
cf = confusion_matrix(y_true=y_test, y_pred=y_pred)
cf

array([[118,  24],
       [ 22,  58]], dtype=int64)

In [19]:
# 10-fold cross-validated scores for logistic regression on the training split.
log_acc = cross_val_score(estimator=log_clf, X=X_train, y=y_train, cv=10)
# Parenthesised print works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(log_acc.mean())
log_acc

0.793786415516


array([ 0.77941176,  0.83823529,  0.86567164,  0.8030303 ,  0.78787879,
        0.68181818,  0.77272727,  0.72727273,  0.86363636,  0.81818182])

In [20]:
# 10-fold cross-validated scores for the SVC on the training split.
svc_acc = cross_val_score(estimator=svc_clf, X=X_train, y=y_train, cv=10)
# Parenthesised print works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(svc_acc.mean())
svc_acc

0.83001636204


array([ 0.79411765,  0.86764706,  0.86567164,  0.8030303 ,  0.81818182,
        0.78787879,  0.86363636,  0.78787879,  0.87878788,  0.83333333])

In [21]:
# 10-fold cross-validated scores for the decision tree on the training split.
dec_acc = cross_val_score(estimator=dt_clf, X=X_train, y=y_train, cv=10)
# Parenthesised print works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(dec_acc.mean())
dec_acc

0.780374198526


array([ 0.76470588,  0.80882353,  0.80597015,  0.78787879,  0.75757576,
        0.68181818,  0.81818182,  0.75757576,  0.81818182,  0.8030303 ])

In [22]:
# 10-fold cross-validated scores for the random forest on the training split.
rf_acc = cross_val_score(estimator=rf_clf, X=X_train, y=y_train, cv=10)
# Parenthesised print works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(rf_acc.mean())
rf_acc

0.80916141219


array([ 0.75      ,  0.80882353,  0.8358209 ,  0.83333333,  0.77272727,
        0.75757576,  0.84848485,  0.75757576,  0.87878788,  0.84848485])

In [23]:
# 10-fold cross-validated scores for AdaBoost on the training split.
ada_acc = cross_val_score(estimator=ada_clf, X=X_train, y=y_train, cv=10)
# Parenthesised print works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(ada_acc.mean())
ada_acc

0.807355601671


array([ 0.80882353,  0.82352941,  0.88059701,  0.83333333,  0.72727273,
        0.74242424,  0.8030303 ,  0.71212121,  0.90909091,  0.83333333])

In [24]:
# Search grid for the RBF-kernel SVC: every combination of the C and gamma
# values below is evaluated.
parameters = [{
    'C': [1.2, 1.25, 1.3, 1.35, 1.4],
    'kernel': ['rbf'],
    'gamma': [0.16, 0.18, 0.185, 0.19],
}]

# Exhaustive search scored by 10-fold cross-validated accuracy on the training
# split, using all available cores.
grid_search = GridSearchCV(
    estimator=svc_clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1
)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

# Parenthesised print works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(best_accuracy)
print(best_parameters)

0.836090225564
{'kernel': 'rbf', 'C': 1.3, 'gamma': 0.18}


In [25]:
# Rebuild the SVC from the hyper-parameters selected by the grid search rather
# than hand-copying the numbers, so this cell cannot drift out of sync with the
# search result. For the recorded run best_parameters is
# {'kernel': 'rbf', 'C': 1.3, 'gamma': 0.18}; 'rbf' is SVC's default kernel, so
# this builds the same model as SVC(C=1.3, gamma=0.18).
svc_clf = SVC(**best_parameters)
svc_clf.fit(X_train, y_train)
svc_pred = svc_clf.predict(X_test)
svc_pred

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0], dtype=int64)

In [26]:
# Confusion matrix for the tuned SVC on the test split
# (rows: true class, columns: predicted class).
cf = confusion_matrix(y_true=y_test, y_pred=svc_pred)
cf

array([[122,  20],
       [ 22,  58]], dtype=int64)

In [27]:
# 10-fold cross-validated accuracy of the tuned SVC on the training split,
# computed on all available cores.
accuracy = cross_val_score(
    estimator=svc_clf,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
    n_jobs=-1
)
accuracy

array([ 0.80882353,  0.85294118,  0.88059701,  0.81818182,  0.81818182,
        0.81818182,  0.86363636,  0.77272727,  0.87878788,  0.84848485])

In [28]:
# Mean of the per-fold accuracies computed for the tuned SVC.
accuracy.mean()

0.83605435389895444

In [29]:
# Conclusion of the comparison; the bare string is echoed as the cell's output.
'SVC is the best classifier for this problem from the analyzed results'

'SVC is the best classifier for this problem'