In [59]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import seaborn as sns

In [60]:
# Load only the target (Survived) and the four numeric columns used in this experiment.
df = pd.read_csv(
    'train.csv',
    usecols=['Survived', 'Pclass', 'Age', 'SibSp', 'Parch'],
)

In [61]:
df.head() 

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [62]:
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna()

In [63]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [64]:
# Target vector: the Survived column.
y = df['Survived']
# Feature matrix: every remaining column.
X = df.drop(columns='Survived')

In [65]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch
0,3,22.0,1,0
1,1,38.0,1,0
2,3,26.0,0,0
3,1,35.0,1,0
4,3,35.0,0,0


In [66]:
# Baseline: mean accuracy of a default logistic regression over 20-fold cross-validation.
cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20).mean()

0.6933333333333332

### **Applying Feature Construction**

Here we combine the `SibSp` and `Parch` columns into a new feature, `Family_size`, and then derive a second feature, `Family_type`, from it.

Dataset used - titanic Dataset

In [67]:
# Family size = SibSp + Parch + 1, so a passenger with SibSp = Parch = 0 gets size 1.
X['Family_size'] = 1 + X['SibSp'] + X['Parch']

In [68]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size
0,3,22.0,1,0,2
1,1,38.0,1,0,2
2,3,26.0,0,0,1
3,1,35.0,1,0,2
4,3,35.0,0,0,1


In [69]:
def myfam(size, small_max=4):
    """Bucket a family size into an ordinal family-type code.

    Parameters
    ----------
    size : number
        Family size; 1 is treated as "alone".
    small_max : number, default 4
        Largest size that still counts as a small family.

    Returns
    -------
    int
        0 for alone (size == 1), 1 for a small family
        (1 < size <= small_max), 2 for anything else.
    """
    if size == 1:
        # alone
        return 0
    if 1 < size <= small_max:
        # small family; the bound is strictly above 1 so it cannot overlap "alone"
        return 1
    # large family; sizes below 1 and NaN also fall through to this code
    return 2

In [70]:
# Run every Family_size value through myfam() (defined in the cell above) to get its family-type code.
X['Family_type'] = X['Family_size'].map(myfam)

In [71]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size,Family_type
0,3,22.0,1,0,2,1
1,1,38.0,1,0,2,1
2,3,26.0,0,0,1,0
3,1,35.0,1,0,2,1
4,3,35.0,0,0,1,0


In [72]:
# SibSp, Parch and Family_size are now summarised by Family_type, so remove them from the features.

X = X.drop(columns=['SibSp', 'Parch', 'Family_size'])

In [73]:
X.head()

Unnamed: 0,Pclass,Age,Family_type
0,3,22.0,1
1,1,38.0,1
2,3,26.0,0
3,1,35.0,1
4,3,35.0,0


In [74]:
# Same 20-fold cross-validation as before, now on Pclass, Age and Family_type.
# Mean accuracy rises slightly, from about 0.693 to about 0.700.

cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20).mean()

0.7003174603174602

### **Feature Splitting**

Here we split the `Name` column of the Titanic dataset to separate the salutation (Mr, Mrs, etc.) from the rest of the name.

Dataset used - titanic Dataset

In [75]:
# Reload the complete dataset (all columns) for the feature-splitting example.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [76]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [77]:
df['Name']    # Each value reads "Surname, Salutation. Given names": the surname comes before the comma and the salutation (Mr, Mrs, etc.) right after it

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [79]:
# Splitting on the comma yields two columns: column 0 holds the surname (the text before the comma)
# and column 1 holds the rest, which begins with the salutation. Column 1 is the one we extract from next.

df['Name'].str.split(pat=',', expand=True)

Unnamed: 0,0,1
0,Braund,Mr. Owen Harris
1,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss. Laina
3,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,Allen,Mr. William Henry
...,...,...
886,Montvila,Rev. Juozas
887,Graham,Miss. Margaret Edith
888,Johnston,"Miss. Catherine Helen ""Carrie"""
889,Behr,Mr. Karl Howell


In [82]:
# Take column 1 of the comma split (the text after the surname), split it on the full stop,
# and keep column 0 of that result, which is the salutation.
# The text after the comma starts with a space (", Mr."), so strip the surrounding whitespace;
# otherwise the stored titles would be ' Mr', ' Mrs', ... and comparisons such as == 'Mr' would fail.
df['Title'] = (
    df['Name']
    .str.split(',', expand=True)[1]
    .str.split('.', expand=True)[0]
    .str.strip()
)

In [85]:
df[['Title', 'Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"
