In [82]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import seaborn as sns

In [83]:
df = pd.read_csv('titanic.csv')[['Age','Pclass','SibSp','Parch',"Survived"]]

In [84]:
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [85]:
df.dropna(inplace=True)

In [86]:
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [87]:
X = df.iloc[:,0:4]
Y = df.iloc[:,-1]

In [88]:
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch
0,22.0,3,1,0
1,38.0,1,1,0
2,26.0,3,0,0
3,35.0,1,1,0
4,35.0,3,0,0


In [89]:
Y.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [90]:
np.mean(cross_val_score(LogisticRegression(),X,Y,scoring='accuracy',cv=20))

0.6933333333333332

APPLYING FEATURE CONSTRUCTION

In [91]:
X['Family_size'] = X['SibSp'] + X['Parch'] + 1

In [92]:
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [93]:
def myfunc(num):
  if num==1:
    #alone
    return 0
  elif num>1 and num <=4:
    #small family
    return 1
  else:
    #large family
    return 2

In [94]:
myfunc(4)

1

In [95]:
X['Family_type'] = X['Family_size'].apply(myfunc)

In [96]:
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [97]:
X.drop(columns=['SibSp','Parch','Family_size'],inplace=True)

In [98]:
X.head()

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [99]:
#accuracy improved
np.mean(cross_val_score(LogisticRegression(),X,Y,scoring='accuracy',cv=20))

0.7003174603174602

FEATURE SPLITTING

In [100]:
data = pd.read_csv('titanic.csv')

In [101]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [102]:
data['Name']

Unnamed: 0,Name
0,"Braund, Mr. Owen Harris"
1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,"Heikkinen, Miss. Laina"
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry"
...,...
886,"Montvila, Rev. Juozas"
887,"Graham, Miss. Margaret Edith"
888,"Johnston, Miss. Catherine Helen ""Carrie"""
889,"Behr, Mr. Karl Howell"


In [103]:
#splitting the salutation
data['Title'] = data['Name'].str.split(',', expand=True)[1].str.split('.',expand=True)[0]
#extracting before and after , first
#then from 2nd part,extracting before and after , again
#then from all these,keeping only the salutation part
data['Title']

Unnamed: 0,Title
0,Mr
1,Mrs
2,Miss
3,Mrs
4,Mr
...,...
886,Rev
887,Miss
888,Miss
889,Mr


In [104]:
data[['Title','Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"


In [106]:
(data.groupby('Title')['Survived'].mean()).sort_values(ascending=False)


Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
the Countess,1.0
Mlle,1.0
Sir,1.0
Ms,1.0
Lady,1.0
Mme,1.0
Mrs,0.792
Miss,0.697802
Master,0.575
Col,0.5


In [114]:
data['is_married'] = 0
data['is_married'].loc[data['Title']=='Mrs'] =1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['is_married'].loc[data['Title']=='Mrs'] =1


In [115]:
data['is_married']

Unnamed: 0,is_married
0,0
1,0
2,0
3,0
4,0
...,...
886,0
887,0
888,0
889,0
