In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


###### To extract the titles from the name feature as name alone is not an important feature. 
We will write a regular expression which will find a pattern that has 1 whitespace chacater followed by several non whitespace characters followed by a dot.

In [4]:
import re
patt = re.compile('\s(\S+\.)')

In [5]:
# ary = []
# for i in train['Name'].values:
#     print(i ,"-->", re.search(patt,i)[1])
#     ary.append(re.search(patt,i)[1])
    
# print(type(ary))

# npary = np.array(ary)
# print(type(npary))

##### Here we created a numpy array which contains the titles of all the passengers. 
This code can be executed in one line as well. Hence I am commenting the above code and using an efficient way of code. 

In [6]:
Titles = np.array([re.search(patt,i)[1] for i in train['Name'].values])

In [7]:
np.unique(Titles)

array(['Capt.', 'Col.', 'Countess.', 'Don.', 'Dr.', 'Jonkheer.', 'Lady.',
       'Major.', 'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.',
       'Ms.', 'Rev.', 'Sir.'], dtype='<U9')

##### Now we will include Titles as a new feature and will drop the Name feature.

In [8]:
train = train.assign(Title = Titles)
train = train.drop(['Name'],axis=1)

In [9]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr.
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,Mrs.
4,5,0,3,male,35.0,0,0,373450,8.05,,S,Mr.


In [10]:
train.groupby(train['Title']).size()

Title
Capt.          1
Col.           2
Countess.      1
Don.           1
Dr.            7
Jonkheer.      1
Lady.          1
Major.         2
Master.       40
Miss.        182
Mlle.          2
Mme.           1
Mr.          517
Mrs.         125
Ms.            1
Rev.           6
Sir.           1
dtype: int64

##### Here we see that Mlle. or Miss. or Ms are basically the same, so we will reassign them for better results.
##### Similarly we will make Capt. or Col. or Major. as Army

In [11]:
train['Title'] = train['Title'].replace(['Mlle.'],'Miss.')
train['Title'] = train['Title'].replace(['Ms.'],'Miss.')
train['Title'] = train['Title'].replace(['Mme.'],'Mr.')
train['Title'] = train['Title'].replace(['Capt.','Col.','Major.'],'Army.')
train['Title'] = train['Title'].replace(['Countess.','Don.','Jonkheer.','Lady.','Sir.'],'Noble.')


In [12]:
train.groupby(train['Title']).size()

Title
Army.        5
Dr.          7
Master.     40
Miss.      185
Mr.        518
Mrs.       125
Noble.       5
Rev.         6
dtype: int64

In [13]:
train.groupby(train['Title']).mean()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Army.,615.2,0.4,1.0,56.6,0.2,0.2,38.02
Dr.,545.857143,0.428571,1.285714,42.0,0.571429,0.0,49.168457
Master.,414.975,0.575,2.625,4.574167,2.3,1.375,34.703125
Miss.,411.967568,0.702703,2.291892,21.845638,0.702703,0.540541,43.800092
Mr.,454.335907,0.158301,2.407336,32.347118,0.287645,0.15251,24.528159
Mrs.,453.16,0.792,2.0,35.898148,0.696,0.832,45.138533
Noble.,554.2,0.6,1.0,41.6,0.4,0.0,42.15
Rev.,485.666667,0.0,2.0,43.166667,0.166667,0.166667,18.3125


### Now lets modify the cabin feature as well
Here we only need the first alphabet of the cabin i.e. cabin type. Passengers that are not having any cabin will be assigned None

In [14]:
cabin = np.array([i[0] if not pd.isnull(i) else 'None' for i in train['Cabin']])

In [15]:
cabin

array(['None', 'C', 'None', 'C', 'None', 'None', 'E', 'None', 'None',
       'None', 'G', 'C', 'None', 'None', 'None', 'None', 'None', 'None',
       'None', 'None', 'None', 'D', 'None', 'A', 'None', 'None', 'None',
       'C', 'None', 'None', 'None', 'B', 'None', 'None', 'None', 'None',
       'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
       'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
       'D', 'None', 'B', 'C', 'None', 'None', 'None', 'None', 'None', 'B',
       'C', 'None', 'None', 'None', 'F', 'None', 'None', 'None', 'None',
       'None', 'None', 'None', 'None', 'F', 'None', 'None', 'None',
       'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
       'None', 'C', 'None', 'None', 'None', 'E', 'None', 'None', 'None',
       'A', 'D', 'None', 'None', 'None', 'None', 'D', 'None', 'None',
       'None', 'None', 'None', 'None', 'None', 'C', 'None', 'None',
       'None', 'None', 'None', 'None', 'None', 'B', 'None', 'None',
   

In [16]:
train = train.assign(Cabins = cabin)
train = train.drop(['Cabin'],axis=1)

In [17]:
train.groupby(train["Cabins"]).mean()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
Cabins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,439.6,0.466667,1.0,44.833333,0.133333,0.133333,39.623887
B,521.808511,0.744681,1.0,34.955556,0.361702,0.574468,113.505764
C,406.440678,0.59322,1.0,36.086667,0.644068,0.474576,100.151341
D,475.939394,0.757576,1.121212,39.032258,0.424242,0.30303,57.244576
E,502.4375,0.75,1.3125,38.116667,0.3125,0.3125,46.026694
F,370.384615,0.615385,2.384615,19.954545,0.538462,0.538462,18.696792
G,216.0,0.5,3.0,14.75,0.5,1.25,13.58125
,443.208151,0.299854,2.63901,27.555293,0.547307,0.365357,19.157325
T,340.0,0.0,1.0,45.0,0.0,0.0,35.5


In [18]:
train.groupby(train["Cabins"]).size()

Cabins
A        15
B        47
C        59
D        33
E        32
F        13
G         4
None    687
T         1
dtype: int64

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       889 non-null object
Title          891 non-null object
Cabins         891 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


###### Here we see that 'Embarked' field has 2 missing value.Hence we will assign them as the character that is occuring the most which is 'S'

In [20]:
train['Embarked'] = train['Embarked'].isnull().fillna("S")

In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       891 non-null bool
Title          891 non-null object
Cabins         891 non-null object
dtypes: bool(1), float64(2), int64(5), object(4)
memory usage: 77.5+ KB


###### Now the only feature left that has missing value is Age.
One way to fill the empty value would be to take the mean of the column and fill all the missing or NAN value with it.
But we can do even better. 
Since, we know that there are people belonging to different class of society, so will assign age of the missing person with the average mean of the class and age and title he belongs to.

In [22]:
train.groupby(['Pclass','Title','Sex']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PassengerId,Survived,Age,SibSp,Parch,Fare,Embarked
Pclass,Title,Sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Army.,male,615.2,0.4,56.6,0.2,0.2,38.02,0.0
1,Dr.,female,797.0,1.0,49.0,0.0,0.0,25.9292,0.0
1,Dr.,male,576.75,0.5,42.0,1.0,0.0,73.4375,0.0
1,Master.,male,518.333333,1.0,5.306667,0.666667,2.0,117.802767,0.0
1,Miss.,female,436.8125,0.958333,29.744681,0.375,0.5625,121.694356,0.020833
1,Mr.,female,370.0,1.0,24.0,0.0,0.0,69.3,0.0
1,Mr.,male,441.186916,0.345794,41.58046,0.280374,0.252336,68.034385,0.0
1,Mrs.,female,491.785714,0.97619,40.882353,0.785714,0.380952,93.170636,0.02381
1,Noble.,female,658.5,1.0,40.5,0.5,0.0,63.05,0.0
1,Noble.,male,484.666667,0.333333,42.333333,0.333333,0.0,28.216667,0.0


In [23]:
train_medain = train.groupby(['Pclass','Title','Sex']).median()
train_medain = train_medain.reset_index()[['Sex', 'Pclass', 'Title', 'Age']]
train_medain.head()

Unnamed: 0,Sex,Pclass,Title,Age
0,male,1,Army.,56.0
1,female,1,Dr.,49.0
2,male,1,Dr.,44.0
3,male,1,Master.,4.0
4,female,1,Miss.,30.0


###### Here we see that a male belonging to Pclass 1 with title as Army will have an average of 56.0
###### Similarly an unmarried female belonging to Pclass 1  will have an average of 30.0

In [24]:
train_medain = train_medain.reset_index()[['Sex', 'Pclass', 'Title', 'Age']]

In [25]:
train_medain.head()

Unnamed: 0,Sex,Pclass,Title,Age
0,male,1,Army.,56.0
1,female,1,Dr.,49.0
2,male,1,Dr.,44.0
3,male,1,Master.,4.0
4,female,1,Miss.,30.0


In [26]:
train[train['Age'].isnull()][['Age',"Sex","Pclass",'Title']]

Unnamed: 0,Age,Sex,Pclass,Title
5,,male,3,Mr.
17,,male,2,Mr.
19,,female,3,Mrs.
26,,male,3,Mr.
28,,female,3,Miss.
29,,male,3,Mr.
31,,female,1,Mrs.
32,,female,3,Miss.
36,,male,3,Mr.
42,,male,3,Mr.


In [27]:
def getAge(i):
    return train_medain[(train_medain['Sex'] == i['Sex']) & (train_medain['Title'] == i['Title']) & (train_medain['Pclass'] == i['Pclass'])]['Age'].values[0]


###### A helper function is defined here which will return the average age.

In [28]:
train['Age'] = train.apply(lambda i : i['Age'] if not pd.isnull(i['Age']) else getAge(i) , axis=1)

In [29]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       891 non-null bool
Title          891 non-null object
Cabins         891 non-null object
dtypes: bool(1), float64(2), int64(5), object(4)
memory usage: 77.5+ KB


In [30]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Cabins
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,False,Mr.,
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,False,Mrs.,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,False,Miss.,
3,4,1,1,female,35.0,1,0,113803,53.1,False,Mrs.,C
4,5,0,3,male,35.0,0,0,373450,8.05,False,Mr.,


###### From here we will take care of categorical variables.
For that we will be using LabelEncoder and then OneHotEncoder provided by sklearn in preprocessing

In [31]:
from sklearn.preprocessing import LabelEncoder
lblEncoder = LabelEncoder()
train['Sex'] = lblEncoder.fit_transform(train['Sex'])

In [32]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.647587,29.14385,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,0.47799,13.496074,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,21.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,1.0,26.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,36.75,1.0,0.0,31.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [33]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Cabins
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,False,Mr.,
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,False,Mrs.,C
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,False,Miss.,
3,4,1,1,0,35.0,1,0,113803,53.1,False,Mrs.,C
4,5,0,3,1,35.0,0,0,373450,8.05,False,Mr.,


In [34]:
train.count()

PassengerId    891
Survived       891
Pclass         891
Sex            891
Age            891
SibSp          891
Parch          891
Ticket         891
Fare           891
Embarked       891
Title          891
Cabins         891
dtype: int64

In [35]:
train['Embarked'] = lblEncoder.fit_transform(train['Embarked'])

In [36]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Cabins
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,0,Mr.,
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,0,Mrs.,C
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,0,Miss.,
3,4,1,1,0,35.0,1,0,113803,53.1,0,Mrs.,C
4,5,0,3,1,35.0,0,0,373450,8.05,0,Mr.,


In [37]:
lblEncoder2 = LabelEncoder()
train['Title'] = lblEncoder2.fit_transform(train['Title'])

In [38]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Cabins
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,0,4,
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,0,5,C
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,0,3,
3,4,1,1,0,35.0,1,0,113803,53.1,0,5,C
4,5,0,3,1,35.0,0,0,373450,8.05,0,4,


In [39]:
train['Cabins'] = lblEncoder2.fit_transform(train['Cabins'])

In [40]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Cabins
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,0,4,7
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,0,5,2
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,0,3,7
3,4,1,1,0,35.0,1,0,113803,53.1,0,5,2
4,5,0,3,1,35.0,0,0,373450,8.05,0,4,7


###### We don't need ticket feature as we are already having Fare and Pclass.
Thus we are dropping the ticket feature 

In [41]:
train = train.drop(['Ticket'],axis=1)

In [42]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabins
0,1,0,3,1,22.0,1,0,7.25,0,4,7
1,2,1,1,0,38.0,1,0,71.2833,0,5,2
2,3,1,3,0,26.0,0,0,7.925,0,3,7
3,4,1,1,0,35.0,1,0,53.1,0,5,2
4,5,0,3,1,35.0,0,0,8.05,0,4,7


In [43]:
X = train.iloc[:,2:].values
X[0]

array([ 3.  ,  1.  , 22.  ,  1.  ,  0.  ,  7.25,  0.  ,  4.  ,  7.  ])

In [44]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [7,8])
X = onehotencoder.fit_transform(X)

X = X.toarray()

X[0]

array([ 0.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  3.  ,
        1.  , 22.  ,  1.  ,  0.  ,  7.25,  0.  ])

In [45]:
X = X[:, 1:]
X[0]

array([ 0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  3.  ,  1.  ,
       22.  ,  1.  ,  0.  ,  7.25,  0.  ])

In [46]:
y = train.iloc[:, 1:2].values

###### Our Data Preprocessing is completed here, we have removed all the categorical variables and taken care of missing values

In [47]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =0, test_size = 0.2)

In [48]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

0.8212290502793296
[[93 17]
 [15 54]]


  y = column_or_1d(y, warn=True)


###### Here we got an accuracy of over 82% which is very impressive 
Lets see if we can improve it using other classifiers

In [49]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 10, metric = 'minkowski', p = 2)

knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

0.7318435754189944
[[99 11]
 [37 32]]


  after removing the cwd from sys.path.


In [50]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 265)

rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

  after removing the cwd from sys.path.


0.8379888268156425
[[100  10]
 [ 19  50]]


In [51]:
from sklearn.svm import SVC
svc = SVC(kernel = 'linear', random_state = 0)

svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

  y = column_or_1d(y, warn=True)


0.7988826815642458
[[92 18]
 [18 51]]
