In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import seaborn as sns

In [2]:
# Load the training data and keep only the columns used by the baseline model.
# 'Survived' is listed last, so it ends up as the final column of df.
selected_columns = ['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']
df = pd.read_csv('train.csv')[selected_columns]
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [3]:
# Percentage of missing values per column (mean of the null mask, scaled to 0-100).
df.isna().mean().mul(100)

Unnamed: 0,0
Age,19.86532
Pclass,0.0
SibSp,0.0
Parch,0.0
Survived,0.0


In [4]:
# Drop every row that has a missing value in any column of df.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
df = df.dropna()

In [5]:
# Preview the first rows of df after the rows with missing values were dropped.
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [6]:
# Features are every column except the last; the target is the last column.
# .copy() makes X an independent frame, so adding or dropping columns on X
# later does not operate on a slice of df (avoids SettingWithCopyWarning).
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1]

In [7]:
# Preview the feature matrix X (every column of df except the last).
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch
0,22.0,3,1,0
1,38.0,1,1,0
2,26.0,3,0,0
3,35.0,1,1,0
4,35.0,3,0,0


In [8]:
# Baseline: mean accuracy of a default LogisticRegression across 20 CV folds.
cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    cv=20,
    scoring='accuracy',
).mean()

np.float64(0.6933333333333332)

**Applying Feature Construction**

In [9]:
# Family size counts the passenger plus their SibSp and Parch values.
X['Family_size'] = X['SibSp'].add(X['Parch']).add(1)
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [10]:
def myfunc(num):
  """Map a family size onto an ordinal family-type code.

  Returns 0 when ``num`` equals 1 (alone), 1 when ``num`` is greater than 1
  and at most 4 (small family), and 2 for every other value (large family).
  """
  if num == 1:
    return 0  # alone
  if 1 < num <= 4:
    return 1  # small family
  return 2  # large family

In [12]:
# Sanity check: a family size above 4 falls into the large-family bucket (2).
myfunc(10)

2

In [13]:
# Bucket each family size into its family-type code via myfunc.
X['Family_type'] = X['Family_size'].map(myfunc)
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [14]:
# Remove the raw count columns now that Family_type has been derived from them.
# Reassign rather than mutate in place: inplace=True brings no benefit and
# makes state changes harder to follow across cells.
X = X.drop(columns=['SibSp', 'Parch', 'Family_size'])

In [15]:
# Confirm that only Age, Pclass and Family_type remain in X.
X.head()

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [16]:
# Re-score with the constructed Family_type feature: mean accuracy of a
# default LogisticRegression across 20 CV folds.
cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    cv=20,
    scoring='accuracy',
).mean()

np.float64(0.7003174603174602)

**Feature Splitting**

In [17]:
# Reload the full CSV with all columns. This rebinds df, replacing the
# column-filtered frame that was loaded from the same file earlier.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
# Inspect the raw Name column; the displayed rows follow a
# "Surname, Title. Given names" pattern.
df['Name']

Unnamed: 0,Name
0,"Braund, Mr. Owen Harris"
1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,"Heikkinen, Miss. Laina"
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry"
...,...
886,"Montvila, Rev. Juozas"
887,"Graham, Miss. Margaret Edith"
888,"Johnston, Miss. Catherine Helen ""Carrie"""
889,"Behr, Mr. Karl Howell"


In [19]:
# Step 1: split each name at commas into separate columns
# (column 0: text before the first comma, column 1: text after it).
df['Name'].str.split(',', expand=True)

Unnamed: 0,0,1
0,Braund,Mr. Owen Harris
1,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss. Laina
3,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,Allen,Mr. William Henry
...,...,...
886,Montvila,Rev. Juozas
887,Graham,Miss. Margaret Edith
888,Johnston,"Miss. Catherine Helen ""Carrie"""
889,Behr,Mr. Karl Howell


In [20]:
# Step 2: keep only column 1, the text that follows the first comma.
(
    df['Name']
    .str.split(',', expand=True)[1]
)

Unnamed: 0,1
0,Mr. Owen Harris
1,Mrs. John Bradley (Florence Briggs Thayer)
2,Miss. Laina
3,Mrs. Jacques Heath (Lily May Peel)
4,Mr. William Henry
...,...
886,Rev. Juozas
887,Miss. Margaret Edith
888,"Miss. Catherine Helen ""Carrie"""
889,Mr. Karl Howell


In [21]:
# Step 3: split the text after the comma at periods; column 0 then holds
# the salutation.
(
    df['Name']
    .str.split(',', expand=True)[1]
    .str.split('.', expand=True)
)

Unnamed: 0,0,1,2
0,Mr,Owen Harris,
1,Mrs,John Bradley (Florence Briggs Thayer),
2,Miss,Laina,
3,Mrs,Jacques Heath (Lily May Peel),
4,Mr,William Henry,
...,...,...,...
886,Rev,Juozas,
887,Miss,Margaret Edith,
888,Miss,"Catherine Helen ""Carrie""",
889,Mr,Karl Howell,


In [22]:
# Step 4: keep only column 0 of the period split, i.e. the salutation.
(
    df['Name']
    .str.split(',', expand=True)[1]
    .str.split('.', expand=True)[0]
)

Unnamed: 0,0
0,Mr
1,Mrs
2,Miss
3,Mrs
4,Mr
...,...
886,Rev
887,Miss
888,Miss
889,Mr


In [23]:
# Extract the salutation: the text between the first comma and the period
# that follows it (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# Splitting on ',' leaves a leading space on the second piece (" Mr"), so the
# result is stripped; otherwise comparisons such as Title == 'Mr' never match.
df['Title'] = (
    df['Name']
    .str.split(',', expand=True)[1]
    .str.split('.', expand=True)[0]
    .str.strip()
)
df['Title']

Unnamed: 0,Title
0,Mr
1,Mrs
2,Miss
3,Mrs
4,Mr
...,...
886,Rev
887,Miss
888,Miss
889,Mr


In [24]:
# Show Title next to the Name it was extracted from, to eyeball the result.
df[['Title','Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"


In [28]:
# Show one random row, now including the Title column.
# NOTE(review): no random_state is passed, so the row shown varies between runs.
df.sample(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S,Mr


In [33]:
# Count passengers for each (Title, Survived) combination.
df.groupby('Title')['Survived'].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Title,Survived,Unnamed: 2_level_1
Capt,0,1
Col,0,1
Col,1,1
Don,0,1
Dr,0,4
Dr,1,3
Jonkheer,0,1
Lady,1,1
Major,0,1
Major,1,1


In [41]:
# Dimensions of df as (n_rows, n_columns).
df.shape

(891, 13)

In [44]:
# Survival rate of passengers titled 'Mr', computed from the data instead of
# hand-copied counts (previously the literal 81 / (436 + 81)).
# .str.strip() tolerates a leading space left over from splitting Name on ','.
df.loc[df['Title'].str.strip() == 'Mr', 'Survived'].mean()

0.15667311411992263

In [50]:
# Number of non-null Survived values per Title (the size of each title group).
df.groupby('Title')['Survived'].count()

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Capt,1
Col,2
Don,1
Dr,7
Jonkheer,1
Lady,1
Major,2
Master,40
Miss,182
Mlle,2


In [51]:
# Share of each Survived value within every Title group.
# value_counts(normalize=True) computes this directly, replacing the manual
# division of the per-(Title, Survived) counts by the per-Title group sizes.
df.groupby('Title')['Survived'].value_counts(normalize=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Title,Survived,Unnamed: 2_level_1
Capt,0,1.0
Col,0,0.5
Col,1,0.5
Don,0,1.0
Dr,0,0.571429
Dr,1,0.428571
Jonkheer,0,1.0
Lady,1,1.0
Major,0,0.5
Major,1,0.5


In [62]:
# Survival rate per title: the group mean of Survived is the proportion of 1's.
# Sorted so that the titles with the highest rate come first.
df.groupby('Title')['Survived'].mean().sort_values(ascending=False)

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Lady,1.0
Ms,1.0
Sir,1.0
Mme,1.0
the Countess,1.0
Mlle,1.0
Mrs,0.792
Miss,0.697802
Master,0.575
Major,0.5


People with these salutations had a higher rate of survival:

Mrs	0.792000

Miss	0.697802

Master	0.575000

In [63]:
# Spot-check one random row (unseeded: the row shown changes on every run).
df.sample(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S,Miss


In [64]:
# Create the Is_Married column with a placeholder value of 0 for every row.
# NOTE(review): a later cell assigns Is_Married again from Title, which
# overwrites these zeros.
df['Is_Married'] = 0
# One random row to confirm the new column exists (unseeded, varies per run).
df.sample(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Is_Married
787,788,0,3,"Rice, Master. George Hugh",male,8.0,4,1,382652,29.125,,Q,Master,0


In [74]:
def myfunc(salutation):
  """Return 1 if the salutation is 'Mrs' (ignoring surrounding whitespace), else 0.

  Non-string inputs (e.g. NaN or None for a missing title) return 0 instead
  of raising AttributeError on ``.strip()``.
  """
  # NOTE(review): this rebinds `myfunc`, which an earlier cell defines as the
  # family-size bucketing helper; re-running that earlier `.apply(myfunc)`
  # after this cell would pick up this version instead.
  if not isinstance(salutation, str):
    return 0
  return 1 if salutation.strip() == 'Mrs' else 0

# Flag passengers whose title is exactly 'Mrs' (1) versus everything else (0).
# Vectorised string operations replace the per-row .apply(); a missing title
# compares unequal to 'Mrs' and therefore maps to 0 instead of raising.
df['Is_Married'] = df['Title'].str.strip().eq('Mrs').astype(int)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Is_Married
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr,0


In [70]:
# NOTE(review): this cell's execution count (70) is lower than that of the
# cell above it (74). Its recorded output still shows Is_Married = 0 for the
# 'Mrs' rows, i.e. it predates the Title-based assignment. Re-run the
# notebook in order to refresh it.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Is_Married
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,0


In [75]:
# Inspect the resulting Is_Married flags.
df['Is_Married']

Unnamed: 0,Is_Married
0,0
1,1
2,0
3,1
4,0
...,...
886,0
887,0
888,0
889,0
