In [1]:
import numpy as np
import pandas as pd

In [72]:
# Load the training set; the recorded output below shows its shape as (891, 12).
train = pd.read_csv('train.csv')
train.shape

(891, 12)

In [73]:
# Preview the first rows to see which columns are available.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [74]:
# Summary statistics of the numeric columns. In the recorded output Age has a
# count of 714 against 891 rows, i.e. the column contains missing values.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [75]:
# Stack the train and test features into one frame so that every
# feature-engineering step below is applied to both sets in the same way.
test = pd.read_csv('test.csv')
target = train['Survived']
# Build the combined frame without mutating `train`, so this cell can be re-run safely.
# `pd.concat` replaces `DataFrame.append` (removed in pandas 2.0), and
# `ignore_index=True` yields a fresh 0..n-1 index, so there is no leftover
# 'index' column to drop afterwards.
data = pd.concat([train.drop('Survived', axis=1), test], ignore_index=True)
# PassengerId is only a row identifier and is of no use in prediction.
data = data.drop('PassengerId', axis=1)
data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [76]:
# Collect the distinct honorifics found in the Name column.
# The title is the text between the first comma and the following period,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
titles = {
    full_name.split(',')[1].split('.')[0].strip()
    for full_name in data['Name']
}
titles

{'Capt',
 'Col',
 'Don',
 'Dona',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'the Countess'}

In [77]:
# Group the raw honorifics into a small set of title categories.
# Every honorific found in the data must have an entry here: a missing key maps
# to NaN, which later produces an all-zero row in the one-hot encoded Title columns.
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",  # present in the extracted titles but previously unmapped
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

# Extract the honorific from each name and translate it to its category.
data['Title'] = (
    data['Name']
    .map(lambda name: name.split(',')[1].split('.')[0].strip())
    .map(Title_Dictionary)
)
# Fail fast if any honorific is still unmapped instead of silently carrying NaN titles forward.
unmapped_names = data.loc[data['Title'].isnull(), 'Name']
assert unmapped_names.empty, "Unmapped titles for: %s" % unmapped_names.tolist()
# `axis` is passed by keyword: the positional form drop(labels, 1) was removed in pandas 2.0.
data = data.drop('Name', axis=1)
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,female,35.0,1,0,113803,53.1,C123,S,Mrs
4,3,male,35.0,0,0,373450,8.05,,S,Mr


In [78]:
# Processing Age
# Filling every missing age with a single mean/median of the whole data could introduce noise.
# Instead, missing ages are filled with the median age of the passengers sharing
# the same Sex, Pclass and Title.
# Medians are computed on the first 891 rows only (the training rows of the combined frame).

group_train = data.iloc[:891].groupby(['Sex','Pclass','Title'])
# Aggregate only the Age column: calling .median() on the whole grouped frame also
# tries to aggregate the text columns (Ticket, Cabin, Embarked), which raises a
# TypeError on pandas >= 2.0 where numeric_only no longer defaults to dropping them.
group_train_median = group_train['Age'].median().reset_index()
group_train_median.head()

Unnamed: 0,Sex,Pclass,Title,Age
0,female,1,Miss,30.0
1,female,1,Mrs,40.0
2,female,1,Officer,49.0
3,female,1,Royalty,40.5
4,female,2,Miss,24.0


In [79]:
def fill_age(row):
    """Return the training-set median age for the row's (Sex, Pclass, Title) group.

    Parameters
    ----------
    row : pandas.Series
        A row of `data`; only its 'Sex', 'Pclass' and 'Title' entries are read.

    Returns
    -------
    float
        The matching group's median age from `group_train_median`, or
        `fallback_age` when the group is absent from that table or its
        median is NaN.
    """
    condition = (
        (group_train_median['Sex'] == row['Sex'])
        & (group_train_median['Pclass'] == row['Pclass'])
        & (group_train_median['Title'] == row['Title'])
    )
    group_age = group_train_median.loc[condition, 'Age'].dropna()
    # Taking element [0] of an empty selection raises IndexError, which happens for
    # any group that occurs only outside the training rows (or has a NaN Title).
    return group_age.iloc[0] if len(group_age) else fallback_age


# Overall median age of the training rows, used when no group median is available.
fallback_age = data.iloc[:891]['Age'].median()

# Only rows with a missing Age need a lookup; known ages are left untouched.
missing_age = data['Age'].isnull()
if missing_age.any():
    data.loc[missing_age, 'Age'] = data.loc[missing_age].apply(fill_age, axis=1)

In [80]:
data.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1309.0,1309.0,1308.0
mean,2.294882,29.21327,0.498854,0.385027,33.295479
std,0.837836,13.400994,1.041658,0.86556,51.758668
min,1.0,0.17,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.8958
50%,3.0,26.0,0.0,0.0,14.4542
75%,3.0,36.5,1.0,0.0,31.275
max,3.0,80.0,8.0,9.0,512.3292


In [81]:
# One-hot encode Title so the classifier receives numeric indicator columns.

title_dummies = pd.get_dummies(data['Title'], prefix='Title')
# Replace the Title column with its indicator columns.
# `axis` is passed by keyword: the positional form drop(labels, 1) was removed in pandas 2.0.
data = pd.concat([data.drop('Title', axis=1), title_dummies], axis=1)
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,3,male,22.0,1,0,A/5 21171,7.25,,S,0,0,1,0,0,0
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,0,0,0,1,0,0
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1,0,0,0,0
3,1,female,35.0,1,0,113803,53.1,C123,S,0,0,0,1,0,0
4,3,male,35.0,0,0,373450,8.05,,S,0,0,1,0,0,0


In [82]:
# Processing missing Fare value: fill it with the mean fare of the first 891 (training) rows.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on `data.Fare`: that chained in-place call works on an intermediate Series and
# does not update `data` when pandas Copy-on-Write is active.
data['Fare'] = data['Fare'].fillna(data.iloc[:891]['Fare'].mean())

In [83]:
# Processing the Embarked column and converting it into classifier-readable form.

# Missing ports are filled with 'S' (presumably the most frequent port; confirm against the data).
# Assigned back rather than using a chained fillna(inplace=True), which does not
# update `data` when pandas Copy-on-Write is active.
data['Embarked'] = data['Embarked'].fillna('S')
embarked_dummies = pd.get_dummies(data['Embarked'], prefix='Embarked')
# `axis` is passed by keyword: the positional form drop(labels, 1) was removed in pandas 2.0.
data = pd.concat([data.drop('Embarked', axis=1), embarked_dummies], axis=1)
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S
0,3,male,22.0,1,0,A/5 21171,7.25,,0,0,1,0,0,0,0,0,1
1,1,female,38.0,1,0,PC 17599,71.2833,C85,0,0,0,1,0,0,1,0,0
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,0,0,0,0,1
3,1,female,35.0,1,0,113803,53.1,C123,0,0,0,1,0,0,0,0,1
4,3,male,35.0,0,0,373450,8.05,,0,0,1,0,0,0,0,0,1


In [84]:
# Encode Sex as a number the classifier can consume: male -> 1, female -> 0.
sex_codes = {'male' : 1, 'female' : 0}
data['Sex'] = data['Sex'].map(sex_codes)

In [85]:
# Processing Pclass: one-hot encode the passenger class into indicator columns.
pclass_dummies = pd.get_dummies(data['Pclass'], prefix='Pclass')
# `axis` is passed by keyword: the positional form drop(labels, 1) was removed in pandas 2.0.
data = pd.concat([data.drop('Pclass', axis=1), pclass_dummies], axis=1)
data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,1,22.0,1,0,A/5 21171,7.25,,0,0,1,0,0,0,0,0,1,0,0,1
1,0,38.0,1,0,PC 17599,71.2833,C85,0,0,0,1,0,0,1,0,0,1,0,0
2,0,26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,0,0,0,0,1,0,0,1
3,0,35.0,1,0,113803,53.1,C123,0,0,0,1,0,0,0,0,1,1,0,0
4,1,35.0,0,0,373450,8.05,,0,0,1,0,0,0,0,0,1,0,0,1


In [86]:
# Ticket and Cabin are not turned into features in this notebook, so both are removed.
# A single keyword-axis drop replaces two positional-axis calls; the positional
# form drop(labels, 1) was removed in pandas 2.0.
data = data.drop(['Ticket', 'Cabin'], axis=1)
data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,1,22.0,1,0,7.25,0,0,1,0,0,0,0,0,1,0,0,1
1,0,38.0,1,0,71.2833,0,0,0,1,0,0,1,0,0,1,0,0
2,0,26.0,0,0,7.925,0,1,0,0,0,0,0,0,1,0,0,1
3,0,35.0,1,0,53.1,0,0,0,1,0,0,0,0,1,1,0,0
4,1,35.0,0,0,8.05,0,0,1,0,0,0,0,0,1,0,0,1


In [87]:
# Adding new features.

# family_size is the passenger (+1) plus the SibSp and Parch counts.
family_size = data['SibSp'] + data['Parch'] + 1
data['family_size'] = family_size
# Three 0/1 indicator columns that bucket family_size: exactly 1, 2 to 4, and 5 or more.
data['Single_family'] = (family_size == 1).astype('int64')
data['Small_family'] = family_size.between(2, 4).astype('int64')
data['Large_family'] = (family_size >= 5).astype('int64')
data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,...,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,family_size,Single_family,Small_family,Large_family
0,1,22.0,1,0,7.25,0,0,1,0,0,...,0,0,1,0,0,1,2,0,1,0
1,0,38.0,1,0,71.2833,0,0,0,1,0,...,1,0,0,1,0,0,2,0,1,0
2,0,26.0,0,0,7.925,0,1,0,0,0,...,0,0,1,0,0,1,1,1,0,0
3,0,35.0,1,0,53.1,0,0,0,1,0,...,0,0,1,1,0,0,2,0,1,0
4,1,35.0,0,0,8.05,0,0,1,0,0,...,0,0,1,0,0,1,1,1,0,0


In [103]:
# Split the combined frame back into its two parts: the first 891 rows are the
# training rows, everything after them is the test set.
n_train_rows = 891
train, test = data.iloc[:n_train_rows], data.iloc[n_train_rows:]
# Labels are re-read from the CSV as a plain NumPy array.
survived_column = pd.read_csv('train.csv', usecols=['Survived'])
target = survived_column['Survived'].values
train.shape

(891, 21)

In [95]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Fit a random forest on all engineered features.
# random_state is fixed so the fitted forest, and everything derived from it
# (predictions, feature importances), is reproducible across runs.
model = RandomForestClassifier(n_estimators=50, max_features='sqrt', random_state=42)
model = model.fit(train, target)

In [99]:
# Predict on the same rows the model was fitted on, so these are in-sample
# predictions, not an estimate of performance on unseen passengers.
y_pred = model.predict(train)
y_pred

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1,

In [100]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the in-sample predictions (true labels first, predictions
# second). Because it is computed on the training rows, it only shows how well
# the forest fits the data it was trained on.
cm = confusion_matrix(target, y_pred)
cm

array([[544,   5],
       [ 10, 332]], dtype=int64)

In [102]:
# Optional
# Use the fitted forest's feature importances to keep only the important features of the train data.
# In the recorded run, 5 features remained (see the printed shape).
# The selector gets its own name instead of rebinding `model`: overwriting `model`
# made this cell fail when re-run (it would wrap a selector in a selector) and
# left `model` unable to predict afterwards.
selector = SelectFromModel(model, prefit=True)
train_reduced = selector.transform(train)
print(train_reduced.shape)

(891, 5)


In [104]:
# Running the classifier on the reduced feature set.

# random_state is fixed so the confusion matrix below is reproducible across runs.
model1 = RandomForestClassifier(n_estimators=50, max_features='sqrt', random_state=42)
model1 = model1.fit(train_reduced, target)
# In-sample predictions: the matrix is computed on the rows the model was fitted on.
y_pred1 = model1.predict(train_reduced)
cm1 = confusion_matrix(target, y_pred1)
cm1

array([[541,   8],
       [  9, 333]], dtype=int64)

In [110]:
# Preparing the CSV to upload on Kaggle.

# Fit on the full feature set; random_state is fixed so the submission file is reproducible.
model_rf = RandomForestClassifier(n_estimators=50, max_features='sqrt', random_state=42)
model_rf = model_rf.fit(train, target)
output = model_rf.predict(test).astype(int)

# PassengerId was dropped from the feature frame, so it is re-read from the raw
# test file; its rows are in the same order as `test`.
aux_data = pd.read_csv('test.csv')
df_output = pd.DataFrame({
    'PassengerId': aux_data['PassengerId'],
    'Survived': output,
})
df_output.to_csv('result.csv', index=False)
