In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.debugger import set_trace
import re
from sklearn import tree

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


PassengerId : int     : Id

Survived    : int     : Survival (0=No; 1=Yes)

Pclass      : int     : Passenger Class

Name        : object  : Name

Sex         : object  : Sex

Age         : float   : Age

SibSp       : int     : Number of Siblings/Spouses Aboard

Parch       : int     : Number of Parents/Children Aboard

Ticket      : object  : Ticket Number

Fare        : float   : Passenger Fare

Cabin       : object  : Cabin

Embarked    : object  : Port of Embarkation

                        (C=Cherbourg; Q=Queenstown; S=Southampton)
                        

In [5]:
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## Feature Engineering

### 1. Passenger Class
It is obvious that the class of passenger is directly proportional to survival rate. If the importance of a person is more than others, they’ll get out of the disaster first

In [6]:
train[['Pclass','Survived']].groupby(['Pclass']).count()

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,216
2,184
3,491


In [7]:
train[['Pclass','Survived']].groupby(['Pclass']).mean()

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


### 2. Sex

In [8]:
train[['Sex','Survived']].groupby(['Sex']).count()

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,314
male,577


In [9]:
train[['Sex','Survived']].groupby(['Sex']).mean()

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


We can see that 74% of all females were saved and only 18% of all males were saved

### 3. Family Size

In [10]:
# z = pd.DataFrame({
#     'c1': ['a','b','a','b','a','b'],
#     'c2': [1,1,1,1,0,1]
# })
# z[['c1','c2']].groupby(['c1']).count()
# z[['c1','c2']].groupby(['c1']).mean()

In [11]:
all_data = [train, test]
all_data[0]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Next two columns are SibSp(Number of Siblings/Spouses Aboard) and Parch(Number of Parents/Children Aboard), which are not directly related to whether a person has survived or not. That is where the idea of creating a new feature came in. For each row/passenger, we will determine his/her family size by adding SibSp + Parch + 1(him/her self). Family size differs from a minimum of 1 to a maximum of 11, where the family size of 4 having the highest survival rate of 72%.

In [12]:
for data in all_data:
    data['family_size'] = data['SibSp'] + data['Parch'] + 1

In [13]:
train[['family_size','Survived']].groupby(['family_size']).mean()

Unnamed: 0_level_0,Survived
family_size,Unnamed: 1_level_1
1,0.303538
2,0.552795
3,0.578431
4,0.724138
5,0.2
6,0.136364
7,0.333333
8,0.0
11,0.0


It seems to have a good effect on our prediction but let’s go further and categorize people to check whether they are alone in this ship or not. And after looking through it too, it seems to have a considerable impact on our output.

In [14]:
for data in all_data:
    data['is_alone']=0
    data.loc[data['family_size']==1,'is_alone'] = 1

In [15]:
train[['is_alone','Survived']].groupby(['is_alone']).count()

Unnamed: 0_level_0,Survived
is_alone,Unnamed: 1_level_1
0,354
1,537


In [16]:
train[['is_alone','Survived']].groupby(['is_alone']).mean()

Unnamed: 0_level_0,Survived
is_alone,Unnamed: 1_level_1
0,0.50565
1,0.303538


## 4. Embarked

From which place a passenger embarked has something to do with survival (not always). So, let’s take a look. In this column, there are plenty of NAs. To deal with it, we are going to replace NAs with ‘S’ because it is the most occurred value.

In [17]:
test['Embarked'].isnull().sum()

0

In [18]:
train['Embarked'].isnull().sum()

2

In [19]:
print(test['Embarked'].count())
print(test['Embarked'].isnull().count())
train.groupby('Embarked').count()


418
418


Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,family_size,is_alone
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C,168,168,168,168,168,130,168,168,168,168,69,168,168
Q,77,77,77,77,77,28,77,77,77,77,4,77,77
S,644,644,644,644,644,554,644,644,644,644,129,644,644


In [20]:
for data in all_data:
    data['Embarked'] = data['Embarked'].fillna('S')
train[['Embarked','Survived']].groupby(['Embarked']).count()

Unnamed: 0_level_0,Survived
Embarked,Unnamed: 1_level_1
C,168
Q,77
S,646


## 5. Fare
We'll create bins of 4 groups and check who survived the accident, with how much fare group and how many.

In [21]:
train['Fare'].isnull().sum()
for data in all_data:
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

In [22]:
train['fare_category'] = pd.qcut(train['Fare'], 4)

In [23]:
train[['fare_category','Survived']].groupby(['fare_category']).mean()

Unnamed: 0_level_0,Survived
fare_category,Unnamed: 1_level_1
"(-0.001, 7.91]",0.197309
"(7.91, 14.454]",0.303571
"(14.454, 31.0]",0.454955
"(31.0, 512.329]",0.581081


## 6. Age

Age has missing values, we'll fill up with random integers between avg-stdDeviation and avg+stdDeviation

In [24]:
train['Age'].isnull().sum()
for data in all_data:
#     set_trace()
    avg = data['Age'].mean()
    std = data['Age'].std()
    size = data['Age'].isnull().sum()
    
    randomNumbertoReplace = np.random.randint(avg-std,avg+std,size=size)
    
    data['Age'][np.isnan(data['Age'])] = randomNumbertoReplace
    
    data['Age'] = data['Age'].astype(int)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [25]:
train['Age_category'] = pd.qcut(train['Age'],5)

In [26]:
train[['Age_category','Survived']].groupby(['Age_category']).mean()

Unnamed: 0_level_0,Survived
Age_category,Unnamed: 1_level_1
"(-0.001, 19.0]",0.451282
"(19.0, 25.0]",0.321839
"(25.0, 31.0]",0.365269
"(31.0, 40.0]",0.384211
"(40.0, 80.0]",0.387879


## 7. Name and Titles

In [27]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\. ', name)
    if title_search:
        return title_search.group(1)
    return ""

for data in all_data:
    data['title'] = data['Name'].apply(get_title)

for data in all_data:
    data['title'] = data['title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],'Rare')
    data['title'] = data['title'].replace('Mlle','Miss')
    data['title'] = data['title'].replace('Ms','Miss')
    data['title'] = data['title'].replace('Mme','Mrs')
    
print(pd.crosstab(train['title'], train['Sex']))
print("----------------------")
print(train[['title','Survived']].groupby(['title'], as_index = False).mean())

Sex     female  male
title               
Master       0    40
Miss       185     0
Mr           0   517
Mrs        126     0
Rare         3    20
----------------------
    title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826


In [28]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family_size,is_alone,fare_category,Age_category,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,2,0,"(-0.001, 7.91]","(19.0, 25.0]",Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,2,0,"(31.0, 512.329]","(31.0, 40.0]",Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,1,1,"(7.91, 14.454]","(25.0, 31.0]",Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,2,0,"(31.0, 512.329]","(31.0, 40.0]",Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,1,1,"(7.91, 14.454]","(31.0, 40.0]",Mr


# Mapping Data
Mapping data is easy. By looking through the code you’ll have the idea how it works. Once done, now we have to select which features to use. Feature selection is as important as feature creation. We will drop unnecessary columns so that it doesn’t affect our final outcome.

In [29]:
#Map Data
for data in all_data:

    #Mapping Sex
    sex_map = { 'female':0 , 'male':1 }
    data['Sex'] = data['Sex'].map(sex_map).astype(int)

    #Mapping Title
    title_map = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Rare':5}
    data['title'] = data['title'].map(title_map)
    data['title'] = data['title'].fillna(0)

    #Mapping Embarked
    embark_map = {'S':0, 'C':1, 'Q':2}
    data['Embarked'] = data['Embarked'].map(embark_map).astype(int)

    #Mapping Fare
    data.loc[ data['Fare'] <= 7.91, 'Fare']                            = 0
    data.loc[(data['Fare'] > 7.91) & (data['Fare'] <= 14.454), 'Fare'] = 1
    data.loc[(data['Fare'] > 14.454) & (data['Fare'] <= 31), 'Fare']   = 2
    data.loc[ data['Fare'] > 31, 'Fare']                               = 3
    data['Fare'] = data['Fare'].astype(int)

    #Mapping Age
    data.loc[ data['Age'] <= 16, 'Age']                       = 0
    data.loc[(data['Age'] > 16) & (data['Age'] <= 32), 'Age'] = 1
    data.loc[(data['Age'] > 32) & (data['Age'] <= 48), 'Age'] = 2
    data.loc[(data['Age'] > 48) & (data['Age'] <= 64), 'Age'] = 3
    data.loc[ data['Age'] > 64, 'Age']                        = 4

#Feature Selection
#Create list of columns to drop
drop_elements = ["Name", "Ticket", "Cabin", "SibSp", "Parch", "family_size"]

#Drop columns from both data sets
train = train.drop(drop_elements, axis = 1)
train = train.drop(['PassengerId','fare_category', 'Age_category'], axis = 1)
test = test.drop(drop_elements, axis = 1)

#Print ready to use data
print(train.head(10))

   Survived  Pclass  Sex  Age  Fare  Embarked  is_alone  title
0         0       3    1    1     0         0         0      1
1         1       1    0    2     3         1         0      3
2         1       3    0    1     1         0         1      2
3         1       1    0    2     3         0         0      3
4         0       3    1    2     1         0         1      1
5         0       3    1    1     1         2         1      1
6         0       1    1    3     3         0         1      1
7         0       3    1    0     2         0         0      4
8         1       3    0    1     1         0         0      3
9         1       2    0    0     2         1         0      3


## Prediction


In [30]:
clf = tree.DecisionTreeClassifier(random_state=7, criterion='entropy')

In [31]:
X_train = train.drop('Survived', axis=1)
Y_train = train['Survived']
X_test = test.drop('PassengerId', axis=1).copy()

In [32]:
clf.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=7,
            splitter='best')

In [33]:
Y_pred = clf.predict(X_test)

In [36]:
accuracy = round(clf.score(X_train, Y_train)*100,2)

In [37]:
accuracy

87.319999999999993

In [40]:
submission = pd.DataFrame({
    "PassengerId": test['PassengerId'],
    "Survived": Y_pred
})
submission.to_csv('submission.csv', index=False)