In [136]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer

In [4]:
df = pd.read_csv('train-2.csv', usecols=['Age','Fare','Survived'])

In [5]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [135]:
df.shape

(891, 3)

In [137]:
X = df.drop(['Survived'], axis=1)
y = df['Survived']

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [139]:
X_train.head()

Unnamed: 0,Age,Fare
462,47.0,38.5
37,21.0,8.05
586,47.0,15.0
308,30.0,24.0
240,,14.4542


In [140]:
X_train.isnull().sum()

Age     139
Fare      0
dtype: int64

In [21]:
ct = ColumnTransformer([('tf1', SimpleImputer(), ['Age'])], remainder='passthrough')

In [22]:
ct.fit(X_train)

In [29]:
X_train_transformed = ct.fit_transform(X_train)

#### Let's also make sure the target column is not having any missing data | Not Necessary

In [25]:
y_train.isnull().sum()

0

In [32]:
X_test.isnull().sum()

Age     29
Fare     0
dtype: int64

In [33]:
ct2 = ColumnTransformer(transformers=[('tf1',SimpleImputer(),['Age'])], remainder='passthrough')

In [34]:
ct2.fit(X_test)

In [35]:
X_test_transformed = ct2.fit_transform(X_test)

#### Now Train a Model with Logistic Regression Algorithm

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
log_reg = LogisticRegression()

In [30]:
log_reg.fit(X_train_transformed, y_train)

In [36]:
y_predicted_model = log_reg.predict(X_test_transformed)

In [37]:
from sklearn.metrics import accuracy_score

In [38]:
# predicted data = y_predicted_model
# Actual data = y_test

In [39]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_predicted_model)

In [41]:
accuracy

0.6480446927374302

### Applying Missing Indicator method to handle the missing data in the dataset.

+ Let's see the accuracy when we use the Missing Indicator

In [44]:
df_2 = pd.read_csv('train-2.csv', usecols=['Age','Fare','Survived'])
df_2.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [45]:
X = df_2.drop(columns=['Survived'])
y = df_2.drop(columns=['Age','Fare'])

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [55]:
# type(X_train)

In [50]:
X_train.isnull().sum()

Age     134
Fare      0
dtype: int64

In [54]:
type(X_train['Age'].isnull())

pandas.core.series.Series

#### Adding a column which will store True or False based on the missing data in the existing column

In [57]:
X_train['Age_Indicator'] = X_train['Age'].isnull()

In [59]:
# X_train

In [63]:
# y_train.isnull().sum()

In [67]:
X_test.isnull().sum()

Age     43
Fare     0
dtype: int64

In [68]:
X_test['Age_Indicator'] = X_test['Age'].isnull()

In [69]:
X_test

Unnamed: 0,Age,Fare,Age_Indicator
762,20.0,7.2292,False
105,28.0,7.8958,False
861,21.0,11.5000,False
39,14.0,11.2417,False
621,42.0,52.5542,False
...,...,...,...
470,,7.2500,True
225,22.0,9.3500,False
653,,7.8292,True
340,2.0,26.0000,False


In [73]:
X_train.isnull().sum()

Age              134
Fare               0
Age_Indicator      0
dtype: int64

+ <b>Since the Age column/feature is having NaN values, hence to avoid the error in training we will Impute the missing value with the mean & we will focus on the new column which we added to improve the accuracy</b>

In [75]:
simp_Imp = SimpleImputer()

In [76]:
simp_Imp.fit(X_train)

In [77]:
X_train_trf = simp_Imp.fit_transform(X_train)

In [78]:
X_test_trf = simp_Imp.fit_transform(X_test)

In [86]:
(pd.DataFrame(X_test_trf)).isnull().sum()

0    0
1    0
2    0
dtype: int64

In [88]:
# X_train_trf

In [81]:
(pd.DataFrame(X_train_trf)).isnull().sum()

0    0
1    0
2    0
dtype: int64

In [70]:
from sklearn.linear_model import LogisticRegression

In [71]:
log_reg_indicator = LogisticRegression()

In [83]:
log_reg_indicator.fit(X_train_trf, y_train)

  y = column_or_1d(y, warn=True)


In [84]:
y_pred = log_reg_indicator.predict(X_test_trf)

In [85]:
accuracy_score(y_test,y_pred)

0.6480446927374302

In [94]:
# X_train

In [141]:
simp_Imp2 = SimpleImputer(add_indicator=True)

In [142]:
simp_Imp2.fit(X_train)

In [143]:
X_train_tf2 = simp_Imp2.fit_transform(X_train)
X_test_tf2 = simp_Imp2.fit_transform(X_test)

In [144]:
log_reg_indi = LogisticRegression()
log_reg_indi.fit(X_train_tf2, y_train)

In [145]:
y_pred2 = log_reg_indi.predict(X_test_tf2)

In [146]:
accuracy_score(y_test, y_pred2)

0.6424581005586593

In [113]:
X_train = X_train.drop(columns = ['Age_Indicator'])

In [114]:
X_train.head()

Unnamed: 0,Age,Fare
604,35.0,26.55
443,28.0,13.0
612,,15.5
330,,23.25
290,26.0,78.85


In [115]:
X_test = X_test.drop(columns=['Age_Indicator'])

In [116]:
X_test.head()

Unnamed: 0,Age,Fare
762,20.0,7.2292
105,28.0,7.8958
861,21.0,11.5
39,14.0,11.2417
621,42.0,52.5542


In [117]:
mi = MissingIndicator()

mi.fit(X_train)

In [118]:
X_train_missing_indicator = mi.fit_transform(X_train)
X_test_missing_indicator = mi.fit_transform(X_test)

In [121]:
# X_test_missing_indicator

In [122]:
X_train['Age_NA'] = X_train_missing_indicator
X_test['Age_NA'] = X_test_missing_indicator

In [123]:
X_train.head()

Unnamed: 0,Age,Fare,Age_NA
604,35.0,26.55,False
443,28.0,13.0,False
612,,15.5,True
330,,23.25,True
290,26.0,78.85,False


In [124]:
X_test.head()

Unnamed: 0,Age,Fare,Age_NA
762,20.0,7.2292,False
105,28.0,7.8958,False
861,21.0,11.5,False
39,14.0,11.2417,False
621,42.0,52.5542,False


In [126]:
simple = SimpleImputer()

In [127]:
X_train_trfd = simple.fit_transform(X_train)
X_test_trfd = simple.fit_transform(X_test)

In [130]:
pd.DataFrame(X_train_trfd).isnull().sum(0)

0    0
1    0
2    0
dtype: int64

In [131]:
log_reg_3 = LogisticRegression()

In [132]:
log_reg_3.fit(X_train_trfd, y_train)

  y = column_or_1d(y, warn=True)


In [133]:
y_pred_trfd = log_reg_3.predict(X_test_trfd)

In [134]:
accuracy_score(y_test, y_pred_trfd)

0.6480446927374302