In [318]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import seaborn as sns

In [319]:
df = pd.read_csv('train.csv')[['Age','Pclass','SibSp','Parch','Survived']]

In [320]:
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [321]:
df.shape

(891, 5)

In [322]:
df.dropna(inplace=True)

In [323]:
df.shape

(714, 5)

In [324]:
X = df.iloc[:,0:4]
y = df.iloc[:,-1]

In [325]:
X.sample(3)

Unnamed: 0,Age,Pclass,SibSp,Parch
626,57.0,2,0,0
498,25.0,1,1,2
654,18.0,3,0,0


In [326]:
y.sample(3)

631    0
521    0
127    1
Name: Survived, dtype: int64

In [327]:
np.mean(cross_val_score(LogisticRegression(),X,y,scoring='accuracy',cv=20))

np.float64(0.6933333333333332)

## Applying Feature Construction

In [328]:
X['Family_size'] = X['SibSp'] + X['Parch'] + 1

In [329]:
X

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1
...,...,...,...,...,...
885,39.0,3,0,5,6
886,27.0,2,0,0,1
887,19.0,1,0,0,1
889,26.0,1,0,0,1


In [330]:
def myfunc(num):
    if num == 1:
        # alone
        return 0
    elif num > 1 and num <= 4:
        # small family
        return 1
    else:
        # large family
        return 2

In [331]:
X['Family_type'] = X['Family_size'].apply(myfunc)

In [332]:
X

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0
...,...,...,...,...,...,...
885,39.0,3,0,5,6,2
886,27.0,2,0,0,1,0
887,19.0,1,0,0,1,0
889,26.0,1,0,0,1,0


In [333]:
X.drop(columns=['SibSp','Parch','Family_size'],inplace=True)

In [334]:
X

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0
...,...,...,...
885,39.0,3,2
886,27.0,2,0
887,19.0,1,0
889,26.0,1,0


In [335]:
np.mean(cross_val_score(LogisticRegression(),X,y,scoring='accuracy',cv=20))

np.float64(0.7003174603174602)

## Feature Splitting

In [336]:
df = pd.read_csv('train.csv')

In [337]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [338]:
df.dropna(inplace=True)

In [339]:
df['Name']

1      Cumings, Mrs. John Bradley (Florence Briggs Th...
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
6                                McCarthy, Mr. Timothy J
10                       Sandstrom, Miss. Marguerite Rut
11                              Bonnell, Miss. Elizabeth
                             ...                        
871     Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
872                             Carlsson, Mr. Frans Olof
879        Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
887                         Graham, Miss. Margaret Edith
889                                Behr, Mr. Karl Howell
Name: Name, Length: 183, dtype: object

In [340]:
df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0].str.strip()

In [341]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S,Miss
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S,Miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S,Mrs
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S,Mr
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,Mrs
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss


In [342]:
df[['Title','Name']]

Unnamed: 0,Title,Name
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
6,Mr,"McCarthy, Mr. Timothy J"
10,Miss,"Sandstrom, Miss. Marguerite Rut"
11,Miss,"Bonnell, Miss. Elizabeth"
...,...,...
871,Mrs,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)"
872,Mr,"Carlsson, Mr. Frans Olof"
879,Mrs,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)"
887,Miss,"Graham, Miss. Margaret Edith"


In [343]:
df.groupby('Title')['Survived'].mean().sort_values(ascending=False)

Title
Col             1.000000
Lady            1.000000
Master          1.000000
Mlle            1.000000
Mme             1.000000
Sir             1.000000
the Countess    1.000000
Miss            0.931818
Mrs             0.921053
Dr              0.666667
Major           0.500000
Mr              0.370370
Capt            0.000000
Name: Survived, dtype: float64

In [344]:
df['is_married'] = 0
# df['is_married'].loc[df['Title']=='Mrs']=1        FutureWarning

    # syntax ->  .loc[rows_condition, column_name] = value

df.loc[df['Title'] == 'Mrs','is_married']=1       # safe

In [345]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,is_married
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr,0
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S,Miss,0
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S,Miss,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S,Mrs,1
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S,Mr,0
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,Mrs,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss,0


In [346]:
df[['Title', 'is_married']].value_counts()

Title         is_married
Mr            0             81
Miss          0             44
Mrs           1             38
Master        0              7
Dr            0              3
Major         0              2
Mlle          0              2
Capt          0              1
Col           0              1
Lady          0              1
Mme           0              1
Sir           0              1
the Countess  0              1
Name: count, dtype: int64

In [347]:
X = df[['Age', 'Fare', 'is_married']]
y = df['Survived']

In [348]:
np.mean(cross_val_score(LogisticRegression(),X,y,scoring='accuracy',cv=20))

np.float64(0.7149999999999999)

In [349]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


X = df[['Age', 'Fare', 'is_married']]  # use relevant features
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))


Accuracy: 0.7297297297297297
