In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn import model_selection
from sklearn.model_selection import KFold

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import precision_score, recall_score, f1_score,classification_report

In [2]:
import sklearn
from sklearn.svm import SVC

In [3]:
data = pd.read_csv('titanic.csv')

In [4]:
data.head(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest
0,3,0,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,75500,,S,
1,3,0,"Abbott, Master. Eugene Joseph",male,13,0,2,C.A. 2673,202500,,S,"East Providence, RI"
2,3,0,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,202500,,S,"East Providence, RI"
3,3,1,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,202500,,S,"East Providence, RI"
4,3,1,"Abelseth, Miss. Karen Marie",female,16,0,0,348125,76500,,S,"Norway Los Angeles, CA"
5,3,1,"Abelseth, Mr. Olaus Jorgensen",male,25,0,0,348122,76500,F G63,S,"Perkins County, SD"
6,2,0,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,240000,,C,"Russia New York, NY"
7,2,1,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,240000,,C,"Russia New York, NY"
8,3,1,"Abrahamsson, Mr. Abraham August Johannes",male,20,0,0,SOTON/O2 3101284,79250,,S,"Taalintehdas, Finland Hoboken, NJ"
9,3,1,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,72292,,C,"Greensburg, PA"


## Dropping irrelevant columns and nan values

In [5]:
data.drop('cabin',axis=1,inplace=True)
data.drop('home.dest',axis=1,inplace=True)

In [6]:
data.drop('name',axis=1,inplace=True)

In [7]:
data.drop('ticket',axis=1,inplace=True)

In [8]:
data.dropna(inplace=True)

In [9]:
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,3,0,male,42,0,0,75500,S
1,3,0,male,13,0,2,202500,S
2,3,0,male,16,1,1,202500,S
3,3,1,female,35,1,1,202500,S
4,3,1,female,16,0,0,76500,S


In [10]:
data.describe()

Unnamed: 0,pclass,survived,sibsp,parch
count,1043.0,1043.0,1043.0,1043.0
mean,2.209012,0.407478,0.504314,0.42186
std,0.840685,0.491601,0.91308,0.840655
min,1.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0
75%,3.0,1.0,1.0,1.0
max,3.0,1.0,8.0,6.0


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1045
Data columns (total 8 columns):
pclass      1043 non-null int64
survived    1043 non-null int64
sex         1043 non-null object
age         1043 non-null object
sibsp       1043 non-null int64
parch       1043 non-null int64
fare        1043 non-null object
embarked    1043 non-null object
dtypes: int64(4), object(4)
memory usage: 73.3+ KB


In [12]:
data.isnull().sum()/data.shape[0]

pclass      0.0
survived    0.0
sex         0.0
age         0.0
sibsp       0.0
parch       0.0
fare        0.0
embarked    0.0
dtype: float64

In [13]:
data.shape

(1043, 8)

## Make pclass category

In [14]:
class_dummy = pd.get_dummies(data.pclass)
class_dummy.columns = ["1st","2nd","3rd"]
class_dummy.head()

Unnamed: 0,1st,2nd,3rd
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


### Check 

In [15]:
class_dummy["2nd"].sum()

261

In [16]:
data[data.pclass == 2].shape

(261, 8)

## Make sex category

In [17]:
data.sex = data.sex.str.replace('female','0')

In [18]:
data.sex = data.sex.str.replace('male','1')

In [19]:
data.sex = data.sex.astype('uint8')

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1045
Data columns (total 8 columns):
pclass      1043 non-null int64
survived    1043 non-null int64
sex         1043 non-null uint8
age         1043 non-null object
sibsp       1043 non-null int64
parch       1043 non-null int64
fare        1043 non-null object
embarked    1043 non-null object
dtypes: int64(4), object(3), uint8(1)
memory usage: 66.2+ KB


## Make embarked category

In [21]:
data.embarked.unique()

array(['S', 'C', 'Q'], dtype=object)

In [22]:
embarked_dummy = pd.get_dummies(data.embarked)
embarked_dummy.head()

Unnamed: 0,C,Q,S
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


### Check

In [23]:
embarked_dummy.S.sum()

781

In [24]:
data[data["embarked"]=="S"].shape

(781, 8)

## Convert age to numerical values

In [25]:
data.age.unique()

array(['42', '13', '16', '35', '25', '30', '28', '20', '18', '26', '40',
       '0,8333', '24', '29', '0,9167', '2', '19', '32', '48', '4', '6',
       '17', '38', '9', '11', '39', '27', '63', '34', '36', '53', '71',
       '57', '5', '3', '23', '45', '21', '47', '33', '0,75', '80', '22',
       '51', '50', '1', '12', '37', '58', '41', '15', '60', '44', '59',
       '18,5', '14', '54', '49', '76', '46', '52', '8', '31', '64', '70,5',
       '43', '55', '70', '22,5', '0,3333', '36,5', '0,1667', '65', '40,5',
       '10', '0,6667', '23,5', '62', '7', '32,5', '34,5', '61', '20,5',
       '30,5', '55,5', '28,5', '45,5', '56', '38,5', '14,5', '24,5', '67',
       '74', '0,4167', '11,5', '66', '26,5'], dtype=object)

In [26]:
data.age = data.age.str.replace(',',repl='.')

In [27]:
data.age = data.age.astype('float')

In [28]:
data.age.describe()

count    1043.000000
mean       29.813199
std        14.366261
min         0.166700
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: age, dtype: float64

## Convert age to categories

In [29]:
data.age = pd.cut(data.age,[0,10,25,40,80],labels=["age_0_10","age_10_25","age_25_40","age_40_80"])

In [30]:
data.age.unique()

[age_40_80, age_10_25, age_25_40, age_0_10]
Categories (4, object): [age_0_10 < age_10_25 < age_25_40 < age_40_80]

In [31]:
age_dummy = pd.get_dummies(data.age)
age_dummy.head()

Unnamed: 0,age_0_10,age_10_25,age_25_40,age_40_80
0,0,0,0,1
1,0,1,0,0
2,0,1,0,0
3,0,0,1,0
4,0,1,0,0


In [32]:
age_dummy.isnull().sum()

age_0_10     0
age_10_25    0
age_25_40    0
age_40_80    0
dtype: int64

## Sibsp and parch columns to uint

In [33]:
data.sibsp = data.sibsp.astype('uint8')

In [34]:
data.sibsp.unique()

array([0, 1, 4, 2, 3, 5, 8])

In [35]:
data.parch = data.parch.astype('uint8')

In [36]:
data.parch.unique()

array([0, 2, 1, 5, 3, 4, 6])

## Fare to float

In [37]:
data.fare = data.fare.str.replace(',','')

In [38]:
data.fare = data.fare.astype('float')

In [39]:
data.fare.describe()

count    1.043000e+03
mean     3.660302e+05
std      5.575365e+05
min      0.000000e+00
25%      8.050000e+04
50%      1.575000e+05
75%      3.507710e+05
max      5.123292e+06
Name: fare, dtype: float64

## Check data

In [40]:
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,3,0,1,age_40_80,0,0,75500.0,S
1,3,0,1,age_10_25,0,2,202500.0,S
2,3,0,1,age_10_25,1,1,202500.0,S
3,3,1,0,age_25_40,1,1,202500.0,S
4,3,1,0,age_10_25,0,0,76500.0,S


In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1045
Data columns (total 8 columns):
pclass      1043 non-null int64
survived    1043 non-null int64
sex         1043 non-null uint8
age         1043 non-null category
sibsp       1043 non-null uint8
parch       1043 non-null uint8
fare        1043 non-null float64
embarked    1043 non-null object
dtypes: category(1), float64(1), int64(2), object(1), uint8(3)
memory usage: 44.8+ KB


In [42]:
data.shape

(1043, 8)

## Create X and y 

In [43]:
X = pd.concat([class_dummy,data["sex"],age_dummy,data["sibsp"],data["parch"],data["fare"],embarked_dummy],axis=1)

In [44]:
X.shape

(1043, 14)

In [45]:
X.head()

Unnamed: 0,1st,2nd,3rd,sex,age_0_10,age_10_25,age_25_40,age_40_80,sibsp,parch,fare,C,Q,S
0,0,0,1,1,0,0,0,1,0,0,75500.0,0,0,1
1,0,0,1,1,0,1,0,0,0,2,202500.0,0,0,1
2,0,0,1,1,0,1,0,0,1,1,202500.0,0,0,1
3,0,0,1,0,0,0,1,0,1,1,202500.0,0,0,1
4,0,0,1,0,0,1,0,0,0,0,76500.0,0,0,1


In [46]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1045
Data columns (total 14 columns):
1st          1043 non-null uint8
2nd          1043 non-null uint8
3rd          1043 non-null uint8
sex          1043 non-null uint8
age_0_10     1043 non-null uint8
age_10_25    1043 non-null uint8
age_25_40    1043 non-null uint8
age_40_80    1043 non-null uint8
sibsp        1043 non-null uint8
parch        1043 non-null uint8
fare         1043 non-null float64
C            1043 non-null uint8
Q            1043 non-null uint8
S            1043 non-null uint8
dtypes: float64(1), uint8(13)
memory usage: 29.5 KB


In [47]:
y = data['survived']

In [48]:
y.shape

(1043,)

## Split into train and test datasets

In [49]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 123)

In [50]:
X_train.shape

(834, 14)

In [51]:
X_test.shape

(209, 14)

## Train data analysis

## Assume all people died

In [52]:
y[y==0].count()/y.shape[0]

0.59252157238734415

In [53]:
data2 = pd.concat([X_train,y_train],axis=1)

In [54]:
data2.shape

(834, 15)

In [55]:
data2.head()

Unnamed: 0,1st,2nd,3rd,sex,age_0_10,age_10_25,age_25_40,age_40_80,sibsp,parch,fare,C,Q,S,survived
483,0,1,0,0,0,1,0,0,1,2,650000.0,0,0,1,1
899,1,0,0,0,0,1,0,0,1,0,600000.0,0,0,1,1
770,0,0,1,1,0,0,1,0,0,0,78958.0,0,0,1,0
582,0,0,1,1,0,0,1,0,2,0,86625.0,0,0,1,0
740,0,0,1,1,0,0,1,0,0,0,77750.0,0,0,1,1


## Assume all women survived

In [56]:
data2.groupby("sex")["survived"].count()/data2.shape[0]

sex
0    0.36211
1    0.63789
Name: survived, dtype: float64

In [57]:
data2.groupby("1st")["survived"].count()

1st
0    611
1    223
Name: survived, dtype: int64

## ML accuracy must be bigger than 60% (assuming that all people died) and 64%(assuming that all women survived)

## Decision Tree

In [58]:
tree = DecisionTreeClassifier()
param_grid = [{'criterion':['gini','entropy'],
                              'max_depth':[10,50,100]},
                               {'criterion':['gini','entropy'],
                               'min_samples_split':[5,10]}]
grid = GridSearchCV(tree,
                    param_grid,
                   cv=10,
                   scoring='f1')
grid.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'criterion': ['gini', 'entropy'], 'max_depth': [10, 50, 100]}, {'criterion': ['gini', 'entropy'], 'min_samples_split': [5, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [59]:
grid.best_score_

0.72183577801535104

In [60]:
grid.best_params_

{'criterion': 'entropy', 'min_samples_split': 5}

In [61]:
sorted_features = sorted(zip(grid.best_estimator_.feature_importances_,X_train.columns),reverse = True)
sorted_features

[(0.36458918543039509, 'fare'),
 (0.28558464830549102, 'sex'),
 (0.082208905763260884, '3rd'),
 (0.078229752032703243, 'sibsp'),
 (0.046251856746198472, 'age_0_10'),
 (0.038309210842581765, 'parch'),
 (0.03290165668407069, 'age_10_25'),
 (0.029024472181021582, '1st'),
 (0.01356804641483038, 'age_40_80'),
 (0.012669437665485561, 'age_25_40'),
 (0.010148820966904769, 'C'),
 (0.0052587302240823617, 'Q'),
 (0.00080650652968592567, 'S'),
 (0.00044877021328822299, '2nd')]

In [62]:
sklearn.tree.export_graphviz(grid.best_estimator_,feature_names=X_train.columns,out_file="titanic.dot")

## Random forest

In [63]:
pipeline_forest = Pipeline([('forest',RandomForestClassifier())])
param_grid_forest = {"forest__n_estimators":[10,50,100],
                     "forest__min_samples_leaf":[5,10,20],
                     "forest__bootstrap":[True,False],
                     "forest__criterion":["gini","entropy"]}

In [64]:
pipeline_bag_tree = Pipeline([('bag_tree',BaggingClassifier(base_estimator=DecisionTreeClassifier()))])
param_grid_bag_tree = {"bag_tree__n_estimators":[10,50,100],
                       "bag_tree__bootstrap":[True,False],
                       "bag_tree__max_samples":[0.2,0.5,0.8],
                       "bag_tree__base_estimator__min_samples_leaf":[5,10,20],
                       "bag_tree__base_estimator__max_features":['auto']}

## Logistic regression

In [65]:
pipeline_log = Pipeline([('scaler',StandardScaler()),
                        ('logreg',LogisticRegression())])
param_grid_log = {"logreg__penalty":["l1","l2"],
                  'logreg__C': [0.1,0.5,1.0,10.0]}

## SVM

In [66]:
pipeline_svm = Pipeline([('scaler',StandardScaler(with_mean=False)),
                        ('svm',SVC())])
param_grid_svm = [{'svm__kernel':['rbf'],
                  'svm__gamma':[0.0,0.5],
                  'svm__C':[0.5,1.0,10.0]},
                 {'svm__kernel':['poly'],
                  'svm__degree':[1,2],
                 'svm__C':[0.5,1.0,10.0]},
                 {'svm__kernel':['sigmoid'],
                 'svm__C':[0.5,1.0,10.0]}]

In [67]:
names = np.array(['LogisticRegression','RandomForest','BaggingTree','SVM'])
params = np.array([param_grid_log,param_grid_forest,param_grid_bag_tree,param_grid_svm])
pipelines = np.array([pipeline_log,pipeline_forest,pipeline_bag_tree,pipeline_svm])

In [68]:
best_score = grid.best_score_
best_model = grid.best_estimator_

for pipe,param_grid,name in zip(pipelines,params,names):
    print(name)
    grid = GridSearchCV(pipe,
                       param_grid,
                       cv=10,
                        refit=True,
                       n_jobs=3,
                       scoring = 'accuracy')
    grid.fit(X_train,y_train)
    
    print(grid.best_params_)
    print(grid.best_score_)
    
    if grid.best_score_ > best_score:
        best_score = grid.best_score_
        best_model = grid.best_estimator_

LogisticRegression
{'logreg__C': 0.1, 'logreg__penalty': 'l2'}
0.798561151079
RandomForest
{'forest__bootstrap': True, 'forest__criterion': 'gini', 'forest__min_samples_leaf': 5, 'forest__n_estimators': 10}
0.804556354916
BaggingTree
{'bag_tree__base_estimator__max_features': 'auto', 'bag_tree__base_estimator__min_samples_leaf': 5, 'bag_tree__bootstrap': False, 'bag_tree__max_samples': 0.5, 'bag_tree__n_estimators': 50}
0.803357314149
SVM
{'svm__C': 0.5, 'svm__degree': 2, 'svm__kernel': 'poly'}
0.802158273381


In [69]:
best_score

0.80455635491606714

In [70]:
y_pred = best_model.predict(X_train)

## Wrongly classified people

In [71]:
X_train2 = pd.concat([X_train,y_train],axis=1)

In [72]:
X_train2[X_train2["survived"] != y_pred]

Unnamed: 0,1st,2nd,3rd,sex,age_0_10,age_10_25,age_25_40,age_40_80,sibsp,parch,fare,C,Q,S,survived
740,0,0,1,1,0,0,1,0,0,0,77750.0,0,0,1,1
275,0,0,1,0,0,0,1,0,0,0,69500.0,0,1,0,1
100,0,1,0,1,0,0,1,0,1,0,260000.0,0,0,1,1
724,0,0,1,1,0,0,1,0,0,0,79250.0,0,0,1,1
905,1,0,0,1,1,0,0,0,0,2,1345000.0,1,0,0,1
839,1,0,0,1,0,0,1,0,0,0,401250.0,1,0,0,0
177,0,1,0,1,0,0,1,0,1,1,290000.0,0,0,1,1
735,0,0,1,1,1,0,0,0,0,1,31708.0,0,0,1,1
313,1,0,0,1,0,0,1,0,1,0,570000.0,0,0,1,1
931,0,0,1,1,0,0,0,1,0,0,79250.0,0,0,1,1


In [73]:
best_model.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])>

## Check result on test data

In [74]:
accuracy_score(best_model.predict(X_test),y_test)

0.84210526315789469

In [76]:
print(classification_report(y_pred=best_model.predict(X_test),y_true=y_test))

             precision    recall  f1-score   support

          0       0.82      0.94      0.88       124
          1       0.88      0.71      0.78        85

avg / total       0.85      0.84      0.84       209

