In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('train.csv', index_col=0)
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [4]:
y = data.Survived

In [5]:
cat_data = data[['Sex', 'Embarked']]
cat_data.head(3)

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,S
2,female,C
3,female,S


In [6]:
num_data = data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
num_data.head(3)

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3,22.0,1,0,7.25
2,1,38.0,1,0,71.2833
3,3,26.0,0,0,7.925


In [7]:
num_data[num_data.Age.isnull()].head(10)

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6,3,,0,0,8.4583
18,2,,0,0,13.0
20,3,,0,0,7.225
27,3,,0,0,7.225
29,3,,0,0,7.8792
30,3,,0,0,7.8958
32,1,,1,0,146.5208
33,3,,0,0,7.75
37,3,,0,0,7.2292
43,3,,0,0,7.8958


In [8]:
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

In [9]:
imp_num = pd.DataFrame(num_imputer.fit_transform(num_data))
imp_num.columns = num_data.columns
imp_num.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3.0,22.0,1.0,0.0,7.25
1,1.0,38.0,1.0,0.0,71.2833
2,3.0,26.0,0.0,0.0,7.925


In [10]:
imp_cat = pd.DataFrame(cat_imputer.fit_transform(cat_data))
imp_cat.columns = cat_data.columns
imp_cat.head(3)

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S


In [11]:
X = imp_num.join(imp_cat)
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
0,3.0,22.0,1.0,0.0,7.25,male,S
1,1.0,38.0,1.0,0.0,71.2833,female,C
2,3.0,26.0,0.0,0.0,7.925,female,S
3,1.0,35.0,1.0,0.0,53.1,female,S
4,3.0,35.0,0.0,0.0,8.05,male,S


In [12]:
X.isnull().sum()

Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex         0
Embarked    0
dtype: int64

In [13]:
encoder = LabelEncoder()

In [14]:
cats = [col for col in X.columns if X[col].dtype =='object']

In [15]:
enc_X = X.join(X[cats].apply(encoder.fit_transform).add_suffix('_label'))
enc_X.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked,Sex_label,Embarked_label
0,3.0,22.0,1.0,0.0,7.25,male,S,1,2
1,1.0,38.0,1.0,0.0,71.2833,female,C,0,0
2,3.0,26.0,0.0,0.0,7.925,female,S,0,2


In [16]:
X = enc_X.drop(cats, axis=1)
X.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_label,Embarked_label
0,3.0,22.0,1.0,0.0,7.25,1,2
1,1.0,38.0,1.0,0.0,71.2833,0,0
2,3.0,26.0,0.0,0.0,7.925,0,2


In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=11)

# DecisionTreeClassifier

In [18]:
model_dt = DecisionTreeClassifier(random_state=0)
model_dt.fit(X_train, y_train)

DecisionTreeClassifier(random_state=0)

In [19]:
preds_dt = model_dt.predict(X_valid)
score = roc_auc_score(preds_dt, y_valid)
print('DecisionTreeClassifier')
print('ROC Score >> ', score)

DecisionTreeClassifier
ROC Score >>  0.7894067796610168


# RandomForestClassifier

In [20]:
model_rf = RandomForestClassifier(random_state=0)
model_rf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [21]:
preds_rf = model_rf.predict(X_valid)
score= roc_auc_score(preds_rf, y_valid)
print('RandomForestClassifier')
print('ROC Score >> ', score)

RandomForestClassifier
ROC Score >>  0.8345682530635509


# KNeighborsClassifier

In [22]:
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)

KNeighborsClassifier()

In [23]:
preds_knn = model_knn.predict(X_valid)
score = roc_auc_score(preds_knn, y_valid)
print('KNeighborsClassifier')
print('ROC Score >> ', score)

KNeighborsClassifier
ROC Score >>  0.6664142943670502


# SupportVectorClassifier

In [24]:
model_svc  = SVC()
model_svc.fit(X_train, y_train)

SVC()

In [25]:
preds_svc = model_svc.predict(X_valid)
score = roc_auc_score(preds_svc, y_valid)
print('SupportVectorsClassifier')
print('ROC Score >> ', score)

SupportVectorsClassifier
ROC Score >>  0.7136939571150097
