# Imports

In [1]:
!pip install yellowbrick --quiet

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import (
    metrics,
    model_selection,
)
from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC,
)
from yellowbrick.model_selection import (
    LearningCurve,
)


# Gather Data

In [6]:
# Read the passenger table from the Excel workbook in the working directory.
df = pd.read_excel('titanic3.xls')
# Keep an independent snapshot of the raw data. A plain `df_orig = df` would
# only bind a second name to the same object, so any in-place change made to
# `df` later would also change the "original".
df_orig = df.copy()
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


# Clean Data

In [4]:
# Dimensions of the frame as (rows, columns).
df.shape

(1309, 14)

In [5]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [8]:
# Count of missing values in each column.
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

## Create Features

In [9]:
# Remove columns that are either free text or mostly empty:
#   - name, ticket: object columns with no missing values
#   - cabin, boat, body, home.dest: 1014, 823, 1188 and 564 of 1309 rows missing
# The frame is derived from `df_orig` rather than from `df` itself so the cell
# is idempotent: re-running it does not raise KeyError for columns that an
# earlier run already removed.
df = df_orig.drop(columns=[
    'name',
    'ticket',
    'cabin',
    'boat',
    'body',
    'home.dest',
])

In [10]:
# Only columns were removed, so the row count should be unchanged.
df.shape

(1309, 8)

In [12]:
# One-hot encode the remaining object columns (sex, embarked). drop_first
# removes one level per encoded column so the dummies are not redundant.
# The dtype is pinned to uint8 because pandas >= 2.0 produces bool dummies by
# default; bool is not matched by the np.number column selector used for
# imputation, so the dummies would silently drop out of the imputer's inputs.
# NOTE: get_dummies ignores NaN by default, so the 2 rows with a missing
# `embarked` get all-zero dummies and are indistinguishable from the dropped
# baseline level.
df = pd.get_dummies(df, drop_first=True, dtype=np.uint8)

In [13]:
# One column more than before encoding: `sex` becomes one dummy, `embarked` two.
df.shape

(1309, 9)

In [14]:
# Column labels after one-hot encoding.
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_male',
       'embarked_Q', 'embarked_S'],
      dtype='object')

In [16]:
# `survived` is the target; every other column is a predictor.
y = df['survived']
X = df.drop(columns='survived')

# Sample Data

In [17]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [18]:
# Shapes of the training features/target followed by the test features/target.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((916, 8), (916,), (393, 8), (393,))

# Impute Data

In [19]:
# IterativeImputer is experimental in scikit-learn: the enabling import below
# has to run before the class is accessed through sklearn.impute.
from sklearn.experimental import (
    enable_iterative_imputer
)
from sklearn import impute
from sklearn.compose import make_column_selector as selector

# Column names grouped by dtype. Only the numeric group is imputed here.
num_cols = selector(dtype_include=np.number)(X)
cat_cols = selector(dtype_include=object)(X)

imputer = impute.IterativeImputer()

# Work on explicit copies so the assignments below never write into a frame
# that pandas may treat as a slice of X (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training rows only and reuse the fitted imputer for the test
# rows, so the imputation model never sees test data.
# Whole-column assignment replaces the columns with the imputer's float
# output instead of casting values back into the existing integer columns.
X_train[num_cols] = imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = imputer.transform(X_test[num_cols])



In [20]:
# Preview the first rows of the training features after imputation.
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,embarked_Q,embarked_S
1214,3.0,26.946753,0.0,0.0,8.6625,1.0,0.0,1.0
677,3.0,26.0,0.0,0.0,7.8958,1.0,0.0,1.0
534,2.0,19.0,0.0,0.0,26.0,0.0,0.0,1.0
1174,3.0,0.225946,8.0,2.0,69.55,0.0,0.0,1.0
864,3.0,28.0,0.0,0.0,7.775,0.0,0.0,1.0


In [21]:
# Total number of missing values left in the training features (0 means none).
X_train.isna().sum().sum()

0

In [22]:
# Total number of missing values left in the test features (0 means none).
X_test.isna().sum().sum()

0