In [1]:
# Transform the test set into the same format as the train set so it can be used in step 3

import pandas as pd
import numpy as np
from numpy.random import randn
from pandas import Series,DataFrame

from scipy import stats

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load the raw test set; the relative path is resolved against the current working directory
test = pd.read_csv('test.csv')

In [3]:
# Preview the first rows of the raw test set
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Count missing values per column of the test data and print only the affected columns
missing_val_count_by_column = test.isna().sum()
print(missing_val_count_by_column[missing_val_count_by_column.gt(0)])

Age       86
Fare       1
Cabin    327
dtype: int64


In [5]:
# Remove the columns that are not kept as features (rebinding instead of mutating in place)
test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'])

test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,34.5,0,0,Q
1,3,female,47.0,1,0,S
2,2,male,62.0,0,0,Q
3,3,male,27.0,0,0,S
4,3,female,22.0,1,1,S


In [6]:
# Re-count missing values per column and print only the columns that still have any
missing_val_count_by_column = test.isna().sum()
print(missing_val_count_by_column[missing_val_count_by_column.gt(0)])

Age    86
dtype: int64


In [7]:
# One-hot encode the port of embarkation.
# The dtype is pinned explicitly: pandas >= 2.0 returns booleans by default, whereas the
# 0/1 indicators displayed below come from the older uint8 default.
emb_dummies = pd.get_dummies(test['Embarked'], dtype='uint8')

emb_dummies.head()

Unnamed: 0,C,Q,S
0,0,1,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,0,1


In [8]:
# Append the embarkation indicators; 'C' is left out as the reference level
# (a row with Q == 0 and S == 0 embarked at C)
test = pd.concat([test, emb_dummies], axis=1).drop(columns='C')

test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Q,S
0,3,male,34.5,0,0,Q,1,0
1,3,female,47.0,1,0,S,0,1
2,2,male,62.0,0,0,Q,1,0
3,3,male,27.0,0,0,S,0,1
4,3,female,22.0,1,1,S,0,1


In [9]:
# The original categorical column is no longer needed once the Q/S indicators exist
test = test.drop(columns='Embarked')

In [10]:
def alone(ones):
    """Flag a passenger who travels without family.

    Parameters
    ----------
    ones : sequence of length 2
        Two counts, unpacked in order (the notebook passes the SibSp and
        Parch values of one row).

    Returns
    -------
    int
        1 when both counts are below 1, otherwise 0.
    """
    first_count, second_count = ones
    has_company = not (first_count < 1 and second_count < 1)
    return 0 if has_company else 1

# Derive the 'alone' flag row by row from the two family-count columns
test = test.assign(alone=test[['SibSp', 'Parch']].apply(alone, axis=1))

test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Q,S,alone
0,3,male,34.5,0,0,1,0,1
1,3,female,47.0,1,0,0,1,0
2,2,male,62.0,0,0,1,0,1
3,3,male,27.0,0,0,0,1,1
4,3,female,22.0,1,1,0,1,0


In [11]:
# The raw family counts are dropped now that 'alone' has been derived from them
test = test.drop(columns=['SibSp', 'Parch'])

In [12]:
# Record which rows have no Age before any value is filled in
test = test.assign(Age_Missing=test['Age'].isna())

test.head()

Unnamed: 0,Pclass,Sex,Age,Q,S,alone,Age_Missing
0,3,male,34.5,1,0,1,False
1,3,female,47.0,0,1,0,False
2,2,male,62.0,1,0,1,False
3,3,male,27.0,0,1,1,False
4,3,female,22.0,0,1,0,False


In [13]:
def child(paa):
    """Label a passenger as 'child' or by sex.

    Parameters
    ----------
    paa : sequence of length 2
        ``(age, sex)``, unpacked in that order.

    Returns
    -------
    'child' when the age is below 16; otherwise the sex value unchanged.
    A NaN age never compares below 16, so such a passenger keeps the sex label.
    """
    years, gender = paa
    return 'child' if years < 16 else gender

# Classify every row as child / female / male from its Age and Sex
test = test.assign(person=test[['Age', 'Sex']].apply(child, axis=1))

test.head(10)

Unnamed: 0,Pclass,Sex,Age,Q,S,alone,Age_Missing,person
0,3,male,34.5,1,0,1,False,male
1,3,female,47.0,0,1,0,False,female
2,2,male,62.0,1,0,1,False,male
3,3,male,27.0,0,1,1,False,male
4,3,female,22.0,0,1,0,False,female
5,3,male,14.0,0,1,1,False,child
6,3,female,30.0,1,0,1,False,female
7,2,male,26.0,0,1,0,False,male
8,3,female,18.0,0,0,1,False,female
9,3,male,21.0,0,1,0,False,male


In [14]:
# Compare the per-group means; the mean of the boolean Age_Missing column is the
# share of rows in each group whose age is missing.
person_group = test.groupby('person')

# numeric_only=True is passed explicitly: pandas >= 2.0 no longer silently skips
# non-numeric columns (such as Sex) and would raise a TypeError instead.
person_group.mean(numeric_only=True)

Unnamed: 0_level_0,Pclass,Age,Q,S,alone,Age_Missing
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
child,2.59375,7.15625,0.03125,0.78125,0.09375,0.0
female,2.107914,33.065789,0.172662,0.553957,0.482014,0.179856
male,2.311741,32.537634,0.08502,0.680162,0.740891,0.246964


In [49]:
# Now replace the missing ages with 33, roughly the mean adult age in the table above (no child has a missing age)

def age_replace(rep, fill_value=33):
    """Return the age to use for one passenger, imputing it when flagged as missing.

    Parameters
    ----------
    rep : sequence of length 2
        ``(age, age_miss)``, unpacked in that order; ``age_miss`` is truthy
        when the age is missing.
    fill_value : number, default 33
        Value returned in place of a missing age. The default keeps the
        constant this notebook has always used.

    Returns
    -------
    ``fill_value`` when ``age_miss`` is truthy, otherwise ``age`` unchanged.
    """
    age, age_miss = rep
    return fill_value if age_miss else age

# Build new_age row by row from Age and its missing-value flag
test = test.assign(new_age=test[['Age', 'Age_Missing']].apply(age_replace, axis=1))

In [52]:
# Show Age and new_age side by side for the first ten rows
test.head(10)

Unnamed: 0,Pclass,Sex,Age,Q,S,alone,Age_Missing,person,new_age
0,3,male,34.5,1,0,1,False,male,34.5
1,3,female,47.0,0,1,0,False,female,47.0
2,2,male,62.0,1,0,1,False,male,62.0
3,3,male,27.0,0,1,1,False,male,27.0
4,3,female,22.0,0,1,0,False,female,22.0
5,3,male,14.0,0,1,1,False,child,14.0
6,3,female,30.0,1,0,1,False,female,30.0
7,2,male,26.0,0,1,0,False,male,26.0
8,3,female,18.0,0,0,1,False,female,18.0
9,3,male,21.0,0,1,0,False,male,21.0


In [53]:
# new_age supersedes the original Age column
test = test.drop(columns='Age')

In [54]:
# Preview the frame after removing Age
test.head()

Unnamed: 0,Pclass,Sex,Q,S,alone,Age_Missing,person,new_age
0,3,male,1,0,1,False,male,34.5
1,3,female,0,1,0,False,female,47.0
2,2,male,1,0,1,False,male,62.0
3,3,male,0,1,1,False,male,27.0
4,3,female,0,1,0,False,female,22.0


In [55]:
# Sex is dropped; the 'person' column still carries male/female for non-children
test = test.drop(columns='Sex')

In [58]:
# One-hot encode the person category.
# The dtype is pinned explicitly: pandas >= 2.0 returns booleans by default, whereas the
# 0/1 indicators displayed below come from the older uint8 default.
per_dummies = pd.get_dummies(test['person'], dtype='uint8')

# Append the indicators and remove the categorical column they replace
test = pd.concat([test, per_dummies], axis=1).drop(columns='person')

test.head()

Unnamed: 0,Pclass,Q,S,alone,Age_Missing,new_age,child,female,male
0,3,1,0,1,False,34.5,0,0,1
1,3,0,1,0,False,47.0,0,1,0
2,2,1,0,1,False,62.0,0,0,1
3,3,0,1,1,False,27.0,0,0,1
4,3,0,1,0,False,22.0,0,1,0


In [59]:
# 'child' is left out as the reference level (female == 0 and male == 0 means child)
test = test.drop(columns='child')

In [64]:


# Take an independent copy so that later changes to test_final do not affect test
test_final = test.copy()

In [66]:
# Preview the final feature set before it is written to disk
test_final.head()

Unnamed: 0,Pclass,Q,S,alone,Age_Missing,new_age,female,male
0,3,1,0,1,False,34.5,0,1
1,3,0,1,0,False,47.0,1,0
2,2,1,0,1,False,62.0,0,1
3,3,0,1,1,False,27.0,0,1
4,3,0,1,0,False,22.0,1,0


In [67]:
# index=True is the default, spelled out here: the row index is written as an unnamed
# first column, so a reader needs index_col=0 to get the same frame back.
# NOTE(review): confirm this matches how step 3 loads the file.
test_final.to_csv('test_final.csv', index=True)