In [None]:
# %pip install scikit-learn jupyter pandas numpy matplotlib seaborn category-encoders

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
FOLDER_NAME = "titanic"

# Load both CSVs from the data folder. The test frame must be bound to
# `test_X`, because every later cell refers to it by that name.
test_X = pd.read_csv(os.path.join(FOLDER_NAME, "test.csv"))
train = pd.read_csv(os.path.join(FOLDER_NAME, "train.csv"))

# Drop columns that are not used as features
drop_cols = ["Name", "PassengerId"]
test_X = test_X.drop(columns=drop_cols)
train = train.drop(columns=drop_cols)

train

In [None]:
# Report the dimensions (rows, columns) of both frames
print('Train dataset has {} rows and {} columns.'.format(*train.shape))
print('Test dataset has {} rows and {} columns.'.format(*test_X.shape))

In [None]:
# Fraction of missing values per column, train first and then test
print(train.isna().sum().div(len(train)))
print("-" * 25)
print(test_X.isna().sum().div(len(test_X)))

Most columns are complete, and the train and test datasets show a similar distribution of NaN values.

In [None]:
# Summary statistics for the training frame
train.describe()

In [None]:
# Explore the data: age histograms, one panel per (Sex, Survived) combination
sns.set_theme(style="darkgrid")
sns.displot(data=train, x="Age", row="Sex", col="Survived")

As you can see, females were more likely than males to survive the Titanic.

In [None]:
# Fare histograms (bins of width 10), one panel per (Pclass, Survived) combination
sns.displot(data=train, x="Fare", row="Pclass", col="Survived", binwidth=10)

Looks like the kids didn't survive the Titanic either.

The dataset is not badly imbalanced.

In [None]:
# Separate the features from the "Survived" target, then release the combined frame
train_X = train.drop(columns=["Survived"])
train_Y = train["Survived"]
del train

In [None]:
# Number of rows per target value, to see how balanced the classes are
print(train_Y.value_counts())

In [None]:
# Encode "Sex" as an integer column. The encoder learns its classes from the
# training data only and is then reused unchanged on the test data.

sex_encoder = LabelEncoder()
train_X["SexEnc"] = sex_encoder.fit_transform(train_X["Sex"])
test_X["SexEnc"] = sex_encoder.transform(test_X["Sex"])

# The raw text column is no longer needed once the encoded one exists
train_X = train_X.drop(columns="Sex")
test_X = test_X.drop(columns="Sex")

del sex_encoder
train_X

In [None]:
# Count the missing "Embarked" entries in the training features
print("There are {} NaN values in train_X['Embarked']".format(train_X['Embarked'].isna().sum()))


Let us replace NaN with the most frequent value for "Embarked", since only 2 of ~890 rows are missing, which is insignificant. Then we can one-hot encode the "Embarked" column.

In [None]:

def onehot_encode_embarked(train_X, test_X):
    """Impute and one-hot encode the "Embarked" column of both frames.

    Missing values are replaced with the most frequent training value. The
    imputer and the encoder are fitted on the training column only and then
    applied to both frames, so both are encoded identically.

    Args:
        train_X: Training features; must contain an "Embarked" column.
        test_X: Test features; must contain an "Embarked" column.

    Returns:
        A ``(train_X, test_X)`` tuple of new frames in which "Embarked" is
        replaced by one ``Embarked_<category>`` indicator column per category
        seen in the training data.

    Raises:
        ValueError: Raised by the encoder if the test column contains a
            category that was not present in the training column.
    """
    train_embarked = train_X["Embarked"].to_numpy().reshape(-1, 1)
    test_embarked = test_X["Embarked"].to_numpy().reshape(-1, 1)

    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    train_embarked = imputer.fit_transform(train_embarked)
    test_embarked = imputer.transform(test_embarked)

    # Use the encoder's default (sparse) output and densify explicitly: the
    # keyword that requests dense output was renamed between scikit-learn
    # releases, whereas `.toarray()` works on all of them.
    onehot_encoder = OneHotEncoder().fit(train_embarked)
    train_embarked_enc = onehot_encoder.transform(train_embarked).toarray()
    test_embarked_enc = onehot_encoder.transform(test_embarked).toarray()

    # Sanity check to see that data is encoded the same way in test and train
    print(test_embarked_enc[0], test_embarked[0])
    print(train_embarked_enc[0], train_embarked[0])
    categories = [f"Embarked_{c}" for c in onehot_encoder.categories_[0]]

    # Reuse the source frames' indices so the column-wise concat below lines
    # rows up correctly even when the inputs are not indexed 0..n-1.
    train_emb_df = pd.DataFrame(
        data=train_embarked_enc, columns=categories, index=train_X.index
    )
    test_emb_df = pd.DataFrame(
        data=test_embarked_enc, columns=categories, index=test_X.index
    )

    train_X = pd.concat([train_X, train_emb_df], axis=1).drop(columns="Embarked")
    test_X = pd.concat([test_X, test_emb_df], axis=1).drop(columns="Embarked")
    return train_X, test_X

train_X, test_X = onehot_encode_embarked(train_X=train_X, test_X=test_X)

# Embarked_S is implied by the other indicator columns (all zeros means "S"),
# so it carries no extra information and is removed from both frames.
train_X, test_X = (
    frame.drop(columns="Embarked_S") for frame in (train_X, test_X)
)

train_X

In [None]:
# Missing-value counts for Age and Fare, plus the mean age for each SexEnc value
print("There are {} NaN values in train_X[Age]".format(train_X['Age'].isna().sum()))
print("There are {} NaN values in train_X[Fare]".format(train_X['Fare'].isna().sum()))
print("The mean for SexEnc == 1 is {}".format(train_X.loc[train_X['SexEnc'] == 1, 'Age'].mean()))
print("The mean for SexEnc == 0 is {}".format(train_X.loc[train_X['SexEnc'] == 0, 'Age'].mean()))

Let us replace the missing ages with sex-specific means, computed separately for males and females.

In [None]:
# Fill each missing age with the mean age of the passengers who share the same
# SexEnc value; ages that are already known are left untouched.
# A grouped transform is used instead of hand-built boolean masks because
# `a == 1 & b` parses as `a == (1 & b)` in Python, which selects the wrong rows
# unless every comparison is parenthesised.
sex_mean_age = train_X.groupby("SexEnc")["Age"].transform("mean")
train_X["Age"] = train_X["Age"].fillna(sex_mean_age)

print(f"There are {train_X['Age'].isna().sum()} NaN values in train_X[Age]")
train_X

In [None]:
# Let us do the same calc for test set

# Fill each missing age with the mean age of the test passengers who share the
# same SexEnc value; ages that are already known are left untouched.
# A grouped transform is used instead of hand-built boolean masks because
# `a == 1 & b` parses as `a == (1 & b)` in Python, which selects the wrong rows
# unless every comparison is parenthesised.
# NOTE(review): the means come from the test set itself, whereas the Embarked
# and Fare imputers are fitted on the training set - confirm this is intended.
sex_mean_age = test_X.groupby("SexEnc")["Age"].transform("mean")
test_X["Age"] = test_X["Age"].fillna(sex_mean_age)

print(f"There are {test_X['Age'].isna().sum()} NaN values in test_X[Age]")
test_X

In [None]:
# How many distinct ticket identifiers does each frame contain?
print("There are {} unique values in Ticket column in train".format(train_X['Ticket'].nunique()))
print("There are {} unique values in Ticket column in test".format(test_X['Ticket'].nunique()))

One-hot or base-k style encoding would make the number of features very high, which would hurt our model's performance. This makes the column a perfect candidate for target encoding.

Read this [blog](https://maxhalford.github.io/blog/target-encoding/) to get a better understanding.

In [None]:
def smooth_mean(train_X, train_Y, test_X, m):
    """Target-encode the "Ticket" column with an additively smoothed mean.

    Each ticket value is replaced by

        (count * mean + m * prior) / (count + m)

    where ``count`` and ``mean`` are the number of training rows with that
    ticket and their mean target, and ``prior`` is the overall mean target.
    The statistics come from the training data only. Test tickets that never
    occur in training are encoded as ``prior``.

    The target is taken directly from ``train_Y``, so the Series may carry any
    name (or none).

    Args:
        train_X: Training features with a "Ticket" column; modified in place.
        train_Y: Target Series, index-aligned with ``train_X``.
        test_X: Test features with a "Ticket" column; modified in place.
        m: Smoothing weight; larger values pull each encoding towards the prior.

    Returns:
        The same ``(train_X, test_X)`` frames with "Ticket" replaced by its
        smoothed target mean.
    """
    # Prior survival prob
    prior = train_Y.mean()

    # Number of training rows and mean target for each ticket value
    ticket_stats = train_Y.groupby(train_X["Ticket"]).agg(['count', 'mean'])
    counts = ticket_stats['count']
    means = ticket_stats['mean']

    # Blend each ticket's mean with the prior, weighted by its row count vs. m
    smooth = (counts * means + m * prior) / (counts + m)

    # Replace each ticket by its smoothed mean; tickets unseen in training
    # have no entry in `smooth`, so they fall back to the prior.
    train_X["Ticket"] = train_X["Ticket"].map(smooth)
    test_X["Ticket"] = test_X["Ticket"].map(smooth).fillna(prior)

    return train_X, test_X

# Target-encode the tickets with a smoothing weight of m=200
train_X, test_X = smooth_mean(train_X, train_Y, test_X, m=200)
train_X

In [None]:
# Cabin is NaN for more than 70% of the rows, so remove it from both frames
train_X, test_X = (frame.drop("Cabin", axis=1) for frame in (train_X, test_X))

# Lets also impute missing fares
def impute_fares(train_X, test_X):
    """Replace missing "Fare" values in both frames with the training mean.

    The mean is computed from the non-missing training fares only and is used
    to fill the gaps in both the training and the test column. Filling is done
    by position within each column, so it works for frames with any index.

    Args:
        train_X: Training features with a "Fare" column; modified in place.
        test_X: Test features with a "Fare" column; modified in place.

    Returns:
        The same ``(train_X, test_X)`` frames with "Fare" imputed.
    """
    # Series.mean() skips NaN, so this is the mean of the known training fares
    train_fare_mean = train_X["Fare"].mean()

    train_X["Fare"] = train_X["Fare"].fillna(train_fare_mean)
    test_X["Fare"] = test_X["Fare"].fillna(train_fare_mean)

    return train_X, test_X
    
train_X, test_X = impute_fares(train_X=train_X, test_X=test_X)

# Remaining missing values per column, train first and then test
print(train_X.isnull().sum())
print(test_X.isnull().sum())

In [None]:


clf = DecisionTreeClassifier(random_state=0)

# A confusion matrix needs true and predicted labels for the same rows, and
# the only labels available are train_Y. So evaluate with out-of-fold
# predictions on the training set: every row is predicted by a model that
# was fitted without it.
# NOTE(review): the Ticket target encoding was computed from all training
# labels before this split, so these scores are probably optimistic.
cv_pred = cross_val_predict(clf, train_X, train_Y, cv=5)
print(confusion_matrix(train_Y, cv_pred))

# Fit on the full training set and predict the test rows
clf.fit(train_X, train_Y)
Y_pred = clf.predict(test_X)
