In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('titanic.csv')
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
298,299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S
283,284,1,3,"Dorking, Mr. Edward Arthur",male,19.0,0,0,A/5. 10482,8.05,,S
862,863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Ba...",female,48.0,0,0,17466,25.9292,D17,S
721,722,0,3,"Jensen, Mr. Svend Lauritz",male,17.0,1,0,350048,7.0542,,S
656,657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S


In [3]:
df = df.drop(['PassengerId', 'Name', 'Ticket','Cabin'],axis=1)
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
415,0,3,female,,0,0,8.05,S
322,1,2,female,30.0,0,0,12.35,Q
6,0,1,male,54.0,0,0,51.8625,S
616,0,3,male,34.0,1,1,14.4,S
485,0,3,female,,3,1,25.4667,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [5]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=0)

In [7]:
print(X_train.shape)
print(X_test.shape)

(712, 7)
(179, 7)


In [8]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
140,3,female,,0,2,15.2458,C
439,2,male,31.0,0,0,10.5,S


In [9]:
y_train.head(5)

140    0
439    0
817    0
378    0
491    0
Name: Survived, dtype: int64

## Applying Imputation 

In [10]:
si_age = SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')

X_train_age = si_age.fit_transform(X_train[['Age']])
X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])

X_test_age = si_age.transform(X_test[['Age']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])

In [11]:
X_train_embarked

array([['C'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
      

In [12]:
X_train_age

array([[29.74518389],
       [31.        ],
       [31.        ],
       [20.        ],
       [21.        ],
       [45.5       ],
       [22.        ],
       [29.74518389],
       [29.74518389],
       [26.        ],
       [25.        ],
       [21.        ],
       [31.        ],
       [15.        ],
       [29.74518389],
       [29.74518389],
       [65.        ],
       [29.74518389],
       [ 1.        ],
       [34.        ],
       [49.        ],
       [18.        ],
       [29.74518389],
       [70.        ],
       [14.        ],
       [19.        ],
       [30.        ],
       [31.        ],
       [32.        ],
       [16.        ],
       [50.        ],
       [24.        ],
       [56.        ],
       [ 7.        ],
       [ 9.        ],
       [33.        ],
       [19.        ],
       [32.5       ],
       [ 1.        ],
       [45.        ],
       [29.74518389],
       [19.        ],
       [21.        ],
       [ 4.        ],
       [28.        ],
       [17

## one hot encoding on Sex and Embarked 
### Both are nominal categorical variable

In [13]:
ohe_sex = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
ohe_Embarked = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarked = ohe_Embarked.fit_transform(X_train[['Embarked']])

X_test_sex = ohe_sex.transform(X_test[['Sex']])
X_test_embarked =ohe_Embarked.transform(X_test[['Embarked']])

In [14]:
X_train_embarked

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [15]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
140,3,female,,0,2,15.2458,C
439,2,male,31.0,0,0,10.5,S


In [16]:
##remove the 3 columns form the X_train which are transformed
X_train_remove = X_train.drop(columns=['Sex','Age','Embarked']) 
X_test_remove = X_test.drop(columns=['Sex','Age','Embarked'])

In [17]:
##concatenate all the columns
X_train_transformed = np.concatenate((X_train_remove,X_train_age,X_train_sex,X_train_embarked), axis=1)
X_test_transformed = np.concatenate((X_test_remove,X_test_age,X_test_sex,X_test_embarked), axis=1)

In [18]:
X_train_transformed.shape

(712, 11)

In [19]:
X_test_transformed.shape

(179, 11)

In [37]:
#train the  model with the transformed dataset
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()

In [38]:
dt_clf.fit(X_train_transformed,y_train)

In [40]:
y_pred = dt_clf.predict(X_test_transformed)
y_pred

array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1], dtype=int64)

In [41]:
#check the model accuracy
from sklearn.metrics import accuracy_score

In [42]:
accuracy_score(y_test,y_pred)

0.7821229050279329

In [43]:
##dump the model for dislocation 
import pickle

In [46]:
pickle.dump(ohe_sex,open('models/ohe_sex.pkl','wb'))
pickle.dump(ohe_Embarked,open('models/ohe_Embarked.pkl','wb'))
pickle.dump(dt_clf,open('models/dt_clf.pkl','wb'))