### Initial modeling

In [2]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Impute Age with MICE

In [3]:
train_mat = train.select_dtypes(include=['number']).as_matrix() #selecting quant vars from train and test and converting to mat
test_mat = test.select_dtypes(include=['number']).as_matrix()

In [4]:
from fancyimpute import MICE

mice = MICE(n_imputations=100,impute_type='col')

train_imputed = mice.complete(train_mat)
test_imputed = mice.complete(test_mat)



[MICE] Completing matrix with shape (891, 7)
[MICE] Starting imputation round 1/110, elapsed time 0.000
[MICE] Starting imputation round 2/110, elapsed time 0.006
[MICE] Starting imputation round 3/110, elapsed time 0.006
[MICE] Starting imputation round 4/110, elapsed time 0.006
[MICE] Starting imputation round 5/110, elapsed time 0.007
[MICE] Starting imputation round 6/110, elapsed time 0.008
[MICE] Starting imputation round 7/110, elapsed time 0.008
[MICE] Starting imputation round 8/110, elapsed time 0.009
[MICE] Starting imputation round 9/110, elapsed time 0.010
[MICE] Starting imputation round 10/110, elapsed time 0.011
[MICE] Starting imputation round 11/110, elapsed time 0.012
[MICE] Starting imputation round 12/110, elapsed time 0.013
[MICE] Starting imputation round 13/110, elapsed time 0.014
[MICE] Starting imputation round 14/110, elapsed time 0.015
[MICE] Starting imputation round 15/110, elapsed time 0.016
[MICE] Starting imputation round 16/110, elapsed time 0.016
[MIC

In [5]:
train_cols = train[['PassengerId','Survived','Pclass','Age','SibSp','Parch','Fare']].columns
test_cols = test[['PassengerId','Pclass','Age','SibSp','Parch','Fare']].columns

train_imputed = pd.DataFrame(train_imputed, columns=train_cols) #converting mice output back to pd dataframe
test_imputed = pd.DataFrame(test_imputed, columns=test_cols)

In [6]:
train_imputed['Age'].isnull().value_counts() #all missing values successfully imputed!

False    891
Name: Age, dtype: int64

In [7]:
print('Pre-Imputation Age Mean:',train['Age'].mean())
print('Post-Imputation Age Mean:',train_imputed['Age'].mean())
print('Pre-Imputation SD:',train['Age'].std())
print('Post-Imputation SD:',train_imputed['Age'].std())

Pre-Imputation Age Mean: 29.69911764705882
Post-Imputation Age Mean: 29.687388793380368
Pre-Imputation SD: 14.526497332334044
Post-Imputation SD: 13.026100222892355


Add categoricals back to dataframes with imputed age - * except for cabin *

In [8]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [9]:
train_imputed['Name'] = train['Name']
train_imputed['Sex'] = train['Sex']
train_imputed['Ticket'] = train['Ticket']
train_imputed['Embarked'] = train['Embarked']

In [10]:
train_imputed.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Name,Sex,Ticket,Embarked
0,1.0,0.0,3.0,22.0,1.0,0.0,7.25,"Braund, Mr. Owen Harris",male,A/5 21171,S
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C
2,3.0,1.0,3.0,26.0,0.0,0.0,7.925,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,S
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,S
4,5.0,0.0,3.0,35.0,0.0,0.0,8.05,"Allen, Mr. William Henry",male,373450,S


In [11]:
test_imputed['Name'] = test['Name']
test_imputed['Sex'] = test['Sex']
test_imputed['Ticket'] = test['Ticket']
test_imputed['Embarked'] = test['Embarked']

In [12]:
test_imputed.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Name,Sex,Ticket,Embarked
0,892.0,3.0,34.5,0.0,0.0,7.8292,"Kelly, Mr. James",male,330911,Q
1,893.0,3.0,47.0,1.0,0.0,7.0,"Wilkes, Mrs. James (Ellen Needs)",female,363272,S
2,894.0,2.0,62.0,0.0,0.0,9.6875,"Myles, Mr. Thomas Francis",male,240276,Q
3,895.0,3.0,27.0,0.0,0.0,8.6625,"Wirz, Mr. Albert",male,315154,S
4,896.0,3.0,22.0,1.0,1.0,12.2875,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,3101298,S


Label encoding categorical variables: Name, Sex, Ticket, Embarked

In [14]:
train_x = train_imputed.copy()
train_x = train_x.drop(['Survived'], axis=1)
train_x = train_x.drop(['PassengerId'], axis=1)

In [15]:
test = test_imputed.copy()
test = test.drop(['PassengerId'],axis=1)

In [16]:
print(train_x.shape)
print(test.shape)

(891, 9)
(418, 9)


In [17]:
ntrain = train_x.shape[0]
ntest = test.shape[0]

In [18]:
all_data = pd.concat([train_x,test]).reset_index(drop=True)
all_data.shape

(1309, 9)

In [19]:
cols = ('Name','Sex','Ticket','Embarked')

In [20]:
from sklearn.preprocessing import LabelEncoder

label_df = all_data.copy()

for c in cols:
    if label_df[c].dtype == 'object':
        le = LabelEncoder()
        label_df[c] = le.fit_transform(label_df[c].astype(str))

In [21]:
label_df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name,Sex,Ticket,Embarked
0,3.0,22.0,1.0,0.0,7.25,155,1,720,2
1,1.0,38.0,1.0,0.0,71.2833,286,0,816,0
2,3.0,26.0,0.0,0.0,7.925,523,0,914,2
3,1.0,35.0,1.0,0.0,53.1,422,0,65,2
4,3.0,35.0,0.0,0.0,8.05,22,1,649,2
