# 0. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import altair as alt


from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder
)

from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    RandomizedSearchCV,
    GridSearchCV
)

chosen_seed = 2021

# 1. Reading Data

In [2]:
data_df = pd.read_csv("data/train.csv", index_col="PassengerId")
deploy_df = pd.read_csv("data/test.csv", index_col="PassengerId")

# 2. Data Splitting

In [3]:
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=chosen_seed)
print(train_df.shape)
print(test_df.shape)

(712, 11)
(179, 11)


In [4]:
X_train, y_train = train_df.drop(columns=["Survived"]), train_df["Survived"]
X_test, y_test = test_df.drop(columns=["Survived"]), test_df["Survived"]

# 3. Basic EDA

## 3.1 High level summary

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 91 to 117
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Name      712 non-null    object 
 3   Sex       712 non-null    object 
 4   Age       575 non-null    float64
 5   SibSp     712 non-null    int64  
 6   Parch     712 non-null    int64  
 7   Ticket    712 non-null    object 
 8   Fare      712 non-null    float64
 9   Cabin     170 non-null    object 
 10  Embarked  711 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 66.8+ KB


In [6]:
train_df.describe(include="all")

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,712.0,712.0,712,712,575.0,712.0,712.0,712.0,712.0,170,711
unique,,,712,2,,,,565.0,,131,3
top,,,"Saad, Mr. Amin",male,,,,347082.0,,C23 C25 C27,S
freq,,,1,461,,,,6.0,,4,514
mean,0.380618,2.285112,,,29.414783,0.533708,0.391854,,33.388155,,
std,0.48588,0.842875,,,14.589601,1.099284,0.802311,,50.807818,,
min,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,0.0,1.0,,,20.0,0.0,0.0,,7.925,,
50%,0.0,3.0,,,28.0,0.0,0.0,,15.0479,,
75%,1.0,3.0,,,37.0,1.0,0.0,,31.3875,,


## 3.2 Analyzing Nulls

In [7]:
na_summary = train_df.isna().sum().reset_index(name="na_count").query("na_count != 0")
na_summary["perc"] = na_summary["na_count"] / train_df.shape[0] * 100
na_summary

Unnamed: 0,index,na_count,perc
4,Age,137,19.241573
9,Cabin,542,76.123596
10,Embarked,1,0.140449


Observations:
- We have 77% nulls in Cabin. Hence we will not use this feature
- We only have 2 missing values in Embarked. Creating a new category called `missing` may not be useful as we only have 2 records to learn from. We will impute this with most frequent value
- We have around 20% nulls in age. We will look into it in details to find out an imputation strategy

## 3.3 EDA - Initial feature classification

- This classification is based on our initial understanding of the data
- It will be used for EDA only
- This is subjected to change based on our findings from EDA

In [8]:
numeric_features = ["Age", "SibSp" ,"Parch", "Fare"]
categorical_features = ["Sex", "Embarked"]
ordinal_features = ["Pclass"]
remainder_features = []
drop_features = ["Name", "Ticket", "Cabin"]

## 3.4 EDA - Numeric features

In [9]:
numeric_plots = alt.Chart(train_df).mark_line(interpolate="step").encode(
    x=alt.X(alt.repeat("repeat"), type="quantitative", bin=alt.Bin(maxbins=20)), y="count()", color="Survived:N"
).properties(width=200, height=100).repeat(repeat = numeric_features, columns=2)
numeric_plots

In [10]:
numeric_plots = (
    alt.Chart(train_df.reset_index())
    .mark_point(opacity=0.2, size=5)
    .encode(
        x=alt.X(alt.repeat("column"), type="quantitative"),
        y=alt.X(alt.repeat("row"), type="quantitative"),
        tooltip="PassengerId"
    )
    .properties(width=180, height=100)
    .repeat(column=numeric_features, row=numeric_features)
)
numeric_plots

In [11]:
# correlation chart all variable
corr_df = train_df[numeric_features].corr().stack().reset_index(name="corr")
corr_df["round_corr"] = np.round(corr_df["corr"], 2)
corr_plot = (
    alt.Chart(
        corr_df.query("level_0 != 'is_canceled' & level_1 != 'is_canceled'"),
        title="Feature Correlation",
    )
    .mark_rect()
    .encode(
        x="level_0",
        y="level_1",
        tooltip="corr",
        color=alt.Color(
            "corr", scale=alt.Scale(domain=(-1, 1), scheme="purpleorange")
        ),
    )
    .properties(width=300, height=300)
)
corr_text = (
    alt.Chart(corr_df.query("level_0 != 'is_canceled' & level_1 != 'is_canceled'"))
    .mark_text(size=12)
    .encode(
        x=alt.X("level_0", title="Features"),
        y=alt.Y("level_1", title="Features"),
        text="round_corr",
    )
)
corr_all = corr_plot + corr_text
corr_all

**Observations:**

- Passenger ID 738, 259 looks like an outlier in terms of Fare
- Fare is heavily skewed, we will try log transformation
- Parch and SibSp looks like ordinal feature. We will try to engineer new features from them

## 3.5 EDA - Categorical vs numerical features

In [12]:
numeric_features = ["Age", "Fare"]
categorical_features = ["Sex", "Embarked", "SibSp", "Parch"]
ordinal_features = ["Pclass"]
remainder_features = []
drop_features = ["Name", "Ticket", "Cabin"]

In [13]:
cat_vs_num_plots = (
    alt.Chart(train_df.reset_index())
    .mark_boxplot()
    .encode(
        x=alt.X(alt.repeat("column"), type="quantitative"),
        y=alt.X(alt.repeat("row"), type="nominal"),
        tooltip="PassengerId"
    )
    .properties(width=180, height=100)
    .repeat(column=numeric_features, row=ordinal_features + categorical_features + ["Survived"])
)
cat_vs_num_plots

**Observations:**

- Passenger ID 738, 259 looks like an outlier in terms of Fare. But they both survived. Hence, we will not remove them and try to scale the data to make them relevant
- Age has significant correlation with Sex and Pclass

## 3.6 EDA - Categorical vs categorical features

In [14]:
categorical_plots = (
    alt.Chart(train_df)
    .mark_circle()
    .encode(
        x=alt.X(alt.repeat("column"), type="nominal"),
        y=alt.X(alt.repeat("row"), type="nominal"),
        size="count()"
    )
    .properties(width=180, height=100)
    .repeat(
        column=ordinal_features + categorical_features + ["Survived"],
        row=ordinal_features + categorical_features + ["Survived"],
    )
)
categorical_plots

**Observations:**

- Pclass is strongly related with Survived and Embarked. It will suit more as ordinal feature
- Sex is significant factor for survival
- C Embarkment has more chance of survival

## 3.7 EDA - Class Imbalance

In [15]:
class_df = train_df.value_counts("Survived").reset_index(name = "n_observations")
class_df = class_df.assign(perc = class_df["n_observations"]/ train_df.shape[0]*100)
class_df

Unnamed: 0,Survived,n_observations,perc
0,0,441,61.938202
1,1,271,38.061798


**Observation:**
    
- We have class imbalance.
- 62% of training data indicate non defaulters
- 38% of training data indicate defaults
- We will use f1-score as our validation metric, as we would like to maximize both precision and recall
- We will also keep as eye on the accuracy

## 3.8 EDA - Summary

In [16]:
train_df.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

| Feature | Type | Comments |
|---------|------|----------|
| Survived | Response | Class Imbalance. Will use F1-score as metric |
| Pclass | Ordinal | Very significant |
| Name | Drop | Can be used for feature engineering in future |
| Sex | Binary | Very significant |
| Age | Numerical | Significant. Can be used for feature engineering in future |
| SibSp | Count | Not very significant but can be used for feature engineering |
| Parch | Count | Not very significant but can be used for feature engineering |
| Ticket | Drop | Can be used for feature engineering in future |
| Cabin | Drop | 76% values are NULL |
| Embarked | Categorical | Significant |

# 4. Feature engineering

## 4.1 Log of Fare

In [29]:
train_df = train_df.assign(fare_log = np.log(train_df.Fare + 1))
test_df = test_df.assign(fare_log = np.log(test_df.Fare + 1))
deploy_df = train_df.assign(fare_log = np.log(deploy_df.Fare + 1))
numeric_features.append("fare_log")

In [30]:
numeric_plots = alt.Chart(train_df).mark_line(interpolate="monotone").encode(
    x=alt.X("fare_log", bin=alt.Bin(maxbins=10)), 
    y="count()", 
    color="Survived:N"
).properties(width=200, height=100)
numeric_plots

In [31]:
numeric_plots = (
    alt.Chart(train_df.reset_index())
    .mark_point(opacity=0.2, size=5)
    .encode(
        x=alt.X("fare_log"),
        y=alt.X(alt.repeat("repeat"), type="quantitative"),
        tooltip="PassengerId"
    )
    .properties(width=180, height=100)
    .repeat(repeat=numeric_features)
)
numeric_plots

In [32]:
cat_vs_num_plots = (
    alt.Chart(train_df.reset_index())
    .mark_boxplot()
    .encode(
        x=alt.X("fare_log"),
        y=alt.X(alt.repeat("repeat"), type="nominal"),
        tooltip="PassengerId"
    )
    .properties(width=180, height=100)
    .repeat(repeat=ordinal_features + categorical_features + ["Survived"], columns=3)
)
cat_vs_num_plots

**Observations:**

- The distribution of log of fare looks close to normal
- The Categorical features vs log of fare shows that log of fare has more significance than fare
- We will use both fare and log of fare in our model initially

# Preprocessing

From the column description and quick look at the values, we distribute the features in below categories:

In [11]:
numeric_features = ["Age", "SibSp" ,"Parch", "Fare"]
categorical_features = ["Sex", "Embarked"]
ordinal_features = ["Pclass"]
remainder_features = []
drop_features = ["Name", "Ticket", "Cabin"]

We are dropping:
- `Name` as its a text field
- `Ticket` as its a text field
- `Cabin` because most of the values are `na`

In [12]:
numeric_pipeline_median = make_pipeline(
    SimpleImputer(strategy="median"), StandardScaler()
)
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)
ordinal_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"))

transformers = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline_median, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
        ("ord", ordinal_pipeline, ordinal_features)
    ],
    remainder="drop"
)

# Housekeeping

In [13]:
def store_cross_val_results(model_name, scores, results_dict):
    """
    Stores mean scores from cross_validate in results_dict for
    the given model model_name.

    Parameters
    ----------
    model_name :
        scikit-learn classification model
    scores : dict
        object return by `cross_validate`
    results_dict: dict
        dictionary to store results

    Returns
    ----------
        None

    """
    results_dict[model_name] = {
        "mean_train_accuracy": np.mean(scores["train_score"]),
        "mean_validation_accuracy": np.mean(scores["test_score"]),
        "mean_fit_time (s)": np.mean(scores["fit_time"]),
        "mean_score_time (s)": np.mean(scores["score_time"]),
        "std_train_score": scores["train_score"].std(),
        "std_test_score": scores["test_score"].std(),
    }

In [14]:
results_dict = {}
dummy_score = {
    "fit_time": np.array([0, 0, 0, 0, 0]),
    "score_time": np.array([0.5, 0.5, 1, 0, 0.5]),
    "test_score": np.array([0.1, 0.1, 0.1, 0.1, 0.1]),
    "train_score": np.array([0.99, 0.99, 0.99, 0.99, 0.99]),
}
expected_output = {
    "test model": {
        "mean_train_accuracy": 0.99,
        "mean_validation_accuracy": 0.1,
        "mean_fit_time (s)": 0.0,
        "mean_score_time (s)": 0.5,
        "std_train_score": 0.0,
        "std_test_score": 0.0,
    }
}

In [15]:
store_cross_val_results("test model", dummy_score, results_dict)
assert results_dict==expected_output, "Function test failed"
print("Success!")

Success!


# Building a baseline model

In [16]:
pipe_model = make_pipeline(transformers, DummyClassifier(strategy="most_frequent", random_state=2020))
scores = cross_validate(pipe_model, X_train, y_train, cv=5, return_train_score=True, n_jobs=-1)
results_dict={}
store_cross_val_results("Dummy model", scores, results_dict)
pd.DataFrame(results_dict).T

Unnamed: 0,mean_fit_time (s),mean_score_time (s),mean_train_accuracy,mean_validation_accuracy,std_test_score,std_train_score
Dummy model,0.021988,0.01148,0.623596,0.623599,0.002758,0.000688


# Trying other classifiers

In [17]:
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=2020),
    "kNN": KNeighborsClassifier(n_jobs=-1),
    "SVC RBF": SVC(random_state=2020),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(n_jobs=-1, max_iter=2000, random_state=2020),
    "Random Forest": RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=2020)
}

In [18]:
for model in models:
    print(f"Working on {model}")
    pipe_model = make_pipeline(transformers, models[model])
    scores = cross_validate(pipe_model, X_train, y_train, return_train_score=True, cv=5, n_jobs=-1)
    store_cross_val_results(model, scores, results_dict)
print("Training completed")

Working on Decision Tree
Working on kNN
Working on SVC RBF
Working on Naive Bayes
Working on Logistic Regression
Working on Random Forest
Training completed


In [19]:
mean_scores_df=pd.DataFrame(results_dict).T
mean_scores_df

Unnamed: 0,mean_train_accuracy,mean_validation_accuracy,mean_fit_time (s),mean_score_time (s),std_train_score,std_test_score
Dummy model,0.623596,0.623599,0.021988,0.01148,0.000688,0.002758
Decision Tree,0.987712,0.779533,0.023524,0.010533,0.001916,0.040052
kNN,0.8606,0.790742,0.024745,0.123176,0.0113,0.02978
SVC RBF,0.833568,0.821649,0.031507,0.011711,0.006411,0.026366
Naive Bayes,,,0.029947,0.0,,
Logistic Regression,0.817419,0.806205,0.138478,0.020516,0.006676,0.031036
Random Forest,0.987712,0.813257,3.129975,0.214048,0.001916,0.040923


# Hyperparameter Tuning for SVC

In [20]:
# param_grid = {
#     "svc__C": 10.0 ** np.arange(-6, 6),
#     "svc__gamma": 10.0 ** np.arange(-6, 6)
# }

In [21]:
# pipe_model = make_pipeline(transformers, SVC(random_state=2020))
# random_search = RandomizedSearchCV(
#     pipe_model,
#     param_grid,
#     n_iter=100,
#     n_jobs=-1,
#     cv=5,
#     random_state=2020,
#     return_train_score=True
# )
# random_search.fit(X_train, y_train)
# print(f"Best parameter: {random_search.best_params_}")
# print(f"Best validation score: {round(random_search.best_score_, 3)}")

In [22]:
param_grid = {
    "svc__C": np.arange(-5, 5.125, 0.125),
    "svc__gamma": np.arange(0, 1, 0.1)
}

In [23]:
pipe_model = make_pipeline(transformers, SVC(random_state=2020))
grid_search = GridSearchCV(
    pipe_model,
    param_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=True
)
grid_search.fit(X_train, y_train)
print(f"Best parameter: {grid_search.best_params_}")
print(f"Best validation score: {round(grid_search.best_score_, 3)}")

Best parameter: {'svc__C': 0.5, 'svc__gamma': 0.2}
Best validation score: 0.824


In [24]:
cv_scores = pd.DataFrame(grid_search.cv_results_)
cv_scores = cv_scores.loc[
    cv_scores["rank_test_score"] == 1,
    [
        "params",
        "mean_train_score",
        "mean_test_score",
        "mean_fit_time",
        "mean_score_time",
        "std_train_score",
        "std_test_score",
    ],
].set_index("params")
cv_scores.columns = [
    "mean_train_accuracy",
    "mean_validation_accuracy",
    "mean_fit_time (s)",
    "mean_score_time (s)",
    "std_train_score",
    "std_test_score",
]
mean_scores_df = pd.concat([mean_scores_df, cv_scores])
mean_scores_df

Unnamed: 0,mean_train_accuracy,mean_validation_accuracy,mean_fit_time (s),mean_score_time (s),std_train_score,std_test_score
Dummy model,0.623596,0.623599,0.021988,0.01148,0.000688,0.002758
Decision Tree,0.987712,0.779533,0.023524,0.010533,0.001916,0.040052
kNN,0.8606,0.790742,0.024745,0.123176,0.0113,0.02978
SVC RBF,0.833568,0.821649,0.031507,0.011711,0.006411,0.026366
Naive Bayes,,,0.029947,0.0,,
Logistic Regression,0.817419,0.806205,0.138478,0.020516,0.006676,0.031036
Random Forest,0.987712,0.813257,3.129975,0.214048,0.001916,0.040923
"{'svc__C': 0.5, 'svc__gamma': 0.2}",0.840941,0.824446,0.034434,0.013043,0.005729,0.029094


# Test results with SVC

In [25]:
final_estimator = grid_search.best_estimator_

final_estimator.fit(X_train, y_train)
scores = final_estimator.score(X_test, y_test)
print(f"Test accuracy for our best estimator is: {round(scores, 4)}")
print(f"Validation accuracy for our best estimator is: {round(mean_scores_df.mean_validation_accuracy[-1], 4)}")
print(f"Training accuracy for our best estimator is: {round(mean_scores_df.mean_train_accuracy[-1], 4)}")

Test accuracy for our best estimator is: 0.7989
Validation accuracy for our best estimator is: 0.8244
Training accuracy for our best estimator is: 0.8409


# Hyperparameter Tuning for Random Forest

In [26]:
param_grid = {
    "randomforestclassifier__n_estimators": np.arange(445, 451),
    "randomforestclassifier__min_samples_split": np.arange(5, 7)
}

In [27]:

pipe_model = make_pipeline(transformers, RandomForestClassifier(n_jobs=-1, random_state=2020))
grid_search = GridSearchCV(
    pipe_model,
    param_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=True
)
grid_search.fit(X_train, y_train)
print(f"Best parameter: {grid_search.best_params_}")
print(f"Best validation score: {round(grid_search.best_score_, 3)}")

Best parameter: {'randomforestclassifier__min_samples_split': 6, 'randomforestclassifier__n_estimators': 447}
Best validation score: 0.834


In [28]:
cv_scores = pd.DataFrame(grid_search.cv_results_)
cv_scores = cv_scores.loc[
    cv_scores["rank_test_score"] == 1,
    [
        "params",
        "mean_train_score",
        "mean_test_score",
        "mean_fit_time",
        "mean_score_time",
        "std_train_score",
        "std_test_score",
    ],
].set_index("params")
cv_scores.columns = [
    "mean_train_accuracy",
    "mean_validation_accuracy",
    "mean_fit_time (s)",
    "mean_score_time (s)",
    "std_train_score",
    "std_test_score",
]
mean_scores_df = pd.concat([mean_scores_df, cv_scores.head(1)])
mean_scores_df

Unnamed: 0,mean_train_accuracy,mean_validation_accuracy,mean_fit_time (s),mean_score_time (s),std_train_score,std_test_score
Dummy model,0.623596,0.623599,0.021988,0.01148,0.000688,0.002758
Decision Tree,0.987712,0.779533,0.023524,0.010533,0.001916,0.040052
kNN,0.8606,0.790742,0.024745,0.123176,0.0113,0.02978
SVC RBF,0.833568,0.821649,0.031507,0.011711,0.006411,0.026366
Naive Bayes,,,0.029947,0.0,,
Logistic Regression,0.817419,0.806205,0.138478,0.020516,0.006676,0.031036
Random Forest,0.987712,0.813257,3.129975,0.214048,0.001916,0.040923
"{'svc__C': 0.5, 'svc__gamma': 0.2}",0.840941,0.824446,0.034434,0.013043,0.005729,0.029094
"{'randomforestclassifier__min_samples_split': 6, 'randomforestclassifier__n_estimators': 447}",0.935045,0.834285,1.485351,0.198963,0.006455,0.025151


# Test results with Random Forest

In [29]:
final_estimator = grid_search.best_estimator_

final_estimator.fit(X_train, y_train)
scores = final_estimator.score(X_test, y_test)
print(f"Test accuracy for our best estimator is: {round(scores, 4)}")
print(f"Validation accuracy for our best estimator is: {round(mean_scores_df.mean_validation_accuracy[-1], 4)}")
print(f"Training accuracy for our best estimator is: {round(mean_scores_df.mean_train_accuracy[-1], 4)}")

Test accuracy for our best estimator is: 0.8324
Validation accuracy for our best estimator is: 0.8343
Training accuracy for our best estimator is: 0.935


# Deployment Data

In [30]:
deploy_df = pd.read_csv("data/test.csv", index_col="PassengerId")
deploy_df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [31]:
final_prediction = pd.DataFrame(
    final_estimator.predict(deploy_df), index=deploy_df.index, columns=["Survived"]
)
final_prediction.to_csv("result/submission.csv")

# Iteration 2

## Revision

- Based on our last iteration, Random Forest gave the best result
- However the training accuracy suggests some level of overfitting
- The validation and test accuracy is close to each other

## Overview

- In this iteration we will try to analyze the features in greater details
- We will also try to engineer some new features

## EDA - Numeric features

### Distribution

In [32]:
train_df = train_df.copy()
test_df = test_df.copy()
deploy_df = deploy_df.copy()
train_df["associated_members"] = pd.Series(
    train_df["SibSp"] + train_df["Parch"], name="associated_members"
)
test_df["associated_members"] = pd.Series(
    test_df["SibSp"] + test_df["Parch"], name="associated_members"
)
deploy_df["associated_members"] = pd.Series(
    deploy_df["SibSp"] + deploy_df["Parch"], name="associated_members"
)
data_df["associated_members"] = pd.Series(
    data_df["SibSp"] + data_df["Parch"], name="associated_members"
)
numeric_features.append("associated_members")

In [33]:
numeric_plots = alt.Chart(train_df).mark_line(interpolate="step").encode(
    x=alt.X(alt.repeat("repeat"), type="quantitative", bin=alt.Bin(maxbins=20)), y="count()", color="Survived:N"
).properties(width=200, height=100).repeat(repeat = numeric_features, columns=3)
numeric_plots

### Pairwise scatterplots

In [34]:
numeric_plots = (
    alt.Chart(train_df.reset_index())
    .mark_point(opacity=0.2, size=5)
    .encode(
        x=alt.X(alt.repeat("column"), type="quantitative"),
        y=alt.X(alt.repeat("row"), type="quantitative"),
        tooltip="PassengerId"
    )
    .properties(width=180, height=100)
    .repeat(column=numeric_features, row=numeric_features)
)
numeric_plots

### Correlation plot

In [35]:
corr_df = train_df.corr().stack().reset_index(name='corr')
corr_plot = (alt.Chart(corr_df).mark_rect().encode(
    x='level_0',
    y='level_1',
    tooltip='corr',
    color=alt.Color('corr', scale=alt.Scale(domain=(-1, 1), scheme='purpleorange')))
 .properties(width=200, height=200))

corr_plot

- We see very high correlation of `associated_members` with `SibSp` and `Parch`. Hence, we will drop `SibSp` and `Parch`.
- There is some correlation between `Fare` and `Pclass`

In [36]:
numeric_features.remove("SibSp")
numeric_features.remove("Parch")
drop_features = drop_features + ["SibSp", "Parch"]

## EDA - Categorical vs numerical features

Lets create a binary feature related to `is_alone` based on `associated_members`

In [37]:
train_df["is_alone"] = train_df.associated_members.apply(lambda x: 1 if x==0 else 0)
test_df["is_alone"] = test_df.associated_members.apply(lambda x: 1 if x==0 else 0)
deploy_df["is_alone"] = deploy_df.associated_members.apply(lambda x: 1 if x==0 else 0)

data_df["is_alone"] = data_df.associated_members.apply(lambda x: 1 if x==0 else 0)
ordinal_features.append("is_alone")

In [38]:
cat_vs_num_plots = (
    alt.Chart(train_df)
    .mark_boxplot()
    .encode(
        x=alt.X(alt.repeat("column"), type="quantitative"),
        y=alt.X(alt.repeat("row"), type="nominal")
    )
    .properties(width=180, height=100)
    .repeat(column=numeric_features, row=ordinal_features + categorical_features + ["Survived"])
)
cat_vs_num_plots

- Age has some relation with `Pclass` and `Sex`
- This can be used from advanced imputation

## EDA - Categorical vs categorical features

In [39]:
categorical_plots = (
    alt.Chart(train_df)
    .mark_circle()
    .encode(
        x=alt.X(alt.repeat("column"), type="nominal"),
        y=alt.X(alt.repeat("row"), type="nominal"),
        size="count()"
    )
    .properties(width=180, height=100)
    .repeat(
        column=ordinal_features + categorical_features + ["Survived"],
        row=ordinal_features + categorical_features + ["Survived"],
    )
)
categorical_plots

In [40]:
numeric_features

['Age', 'Fare', 'associated_members']

In [41]:
ordinal_features

['Pclass', 'is_alone']

In [42]:
categorical_features

['Sex', 'Embarked']

In [43]:
drop_features

['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']

## Cabin Type

In [44]:
train_df["cabin_type"] = train_df.Cabin.str.slice(stop=1)
test_df["cabin_type"] = test_df.Cabin.str.slice(stop=1)
deploy_df["cabin_type"] = deploy_df.Cabin.str.slice(stop=1)
data_df["cabin_type"] = data_df.Cabin.str.slice(stop=1)

## Log of Fare

In [45]:
train_df["fare_log"] = np.log(train_df.Fare + 1)
test_df["fare_log"] = np.log(test_df.Fare + 1)
deploy_df["fare_log"] = np.log(deploy_df.Fare + 1)
data_df["fare_log"] = np.log(data_df.Fare + 1)

## Modelling

In [46]:
numeric_features = ['Age', 'Fare', 'associated_members']
categorical_features = ['Sex', 'Embarked']
ordinal_features = ['Pclass', 'is_alone']
remainder_features = []
drop_features = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', "cabin_type", "fare_log"]
X_train, y_train = train_df.drop(columns=["Survived"]), train_df["Survived"]
X_test, y_test = test_df.drop(columns=["Survived"]), test_df["Survived"]

In [47]:
numeric_pipeline_median = make_pipeline(
    SimpleImputer(strategy="median"), StandardScaler()
)
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)
ordinal_pipeline = make_pipeline(SimpleImputer(strategy="median"))

transformers = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline_median, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
        ("ord", ordinal_pipeline, ordinal_features)
    ],
    remainder="drop"
)

In [48]:
pipe_model = make_pipeline(
    transformers,
    RandomForestClassifier(
        min_samples_split=6, n_estimators=1000, random_state=2020, n_jobs=-1
    ),
)
scores = cross_validate(
    pipe_model, X_train, y_train, cv=5, return_train_score=True, n_jobs=-1
)
results_dict={}
store_cross_val_results("RF Itr2", scores, results_dict)
pd.DataFrame(results_dict).T

Unnamed: 0,mean_fit_time (s),mean_score_time (s),mean_train_accuracy,mean_validation_accuracy,std_test_score,std_train_score
RF Itr2,3.182728,0.300122,0.936097,0.838481,0.019308,0.008199


## Hyperparameter Optimization

In [49]:
param_grid = {
    "randomforestclassifier__n_estimators": np.arange(940, 951, 1)
}
pipe_model = make_pipeline(transformers, RandomForestClassifier(min_samples_split=6, n_jobs=-1, random_state=2020))
grid_search = GridSearchCV(
    pipe_model,
    param_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=True
)
grid_search.fit(X_train, y_train)
print(f"Best parameter: {grid_search.best_params_}")
print(f"Best validation score: {round(grid_search.best_score_, 3)}")

Best parameter: {'randomforestclassifier__n_estimators': 943}
Best validation score: 0.838


In [50]:
cv_scores = pd.DataFrame(grid_search.cv_results_)
cv_scores = cv_scores.loc[
    cv_scores["rank_test_score"] == 1,
    [
        "params",
        "mean_train_score",
        "mean_test_score",
        "mean_fit_time",
        "mean_score_time",
        "std_train_score",
        "std_test_score",
    ],
].set_index("params")
cv_scores.columns = [
    "mean_train_accuracy",
    "mean_validation_accuracy",
    "mean_fit_time (s)",
    "mean_score_time (s)",
    "std_train_score",
    "std_test_score",
]
mean_scores_df = pd.concat([mean_scores_df, cv_scores.head(1)])
mean_scores_df

Unnamed: 0,mean_train_accuracy,mean_validation_accuracy,mean_fit_time (s),mean_score_time (s),std_train_score,std_test_score
Dummy model,0.623596,0.623599,0.021988,0.01148,0.000688,0.002758
Decision Tree,0.987712,0.779533,0.023524,0.010533,0.001916,0.040052
kNN,0.8606,0.790742,0.024745,0.123176,0.0113,0.02978
SVC RBF,0.833568,0.821649,0.031507,0.011711,0.006411,0.026366
Naive Bayes,,,0.029947,0.0,,
Logistic Regression,0.817419,0.806205,0.138478,0.020516,0.006676,0.031036
Random Forest,0.987712,0.813257,3.129975,0.214048,0.001916,0.040923
"{'svc__C': 0.5, 'svc__gamma': 0.2}",0.840941,0.824446,0.034434,0.013043,0.005729,0.029094
"{'randomforestclassifier__min_samples_split': 6, 'randomforestclassifier__n_estimators': 447}",0.935045,0.834285,1.485351,0.198963,0.006455,0.025151
{'randomforestclassifier__n_estimators': 943},0.936449,0.838481,3.147063,0.32922,0.007794,0.019308


## Testing

In [51]:
final_estimator = grid_search.best_estimator_

final_estimator.fit(X_train, y_train)
scores = final_estimator.score(X_test, y_test)
print(f"Test accuracy for our best estimator is: {round(scores, 4)}")
print(f"Validation accuracy for our best estimator is: {round(mean_scores_df.mean_validation_accuracy[-1], 4)}")
print(f"Training accuracy for our best estimator is: {round(mean_scores_df.mean_train_accuracy[-1], 4)}")

Test accuracy for our best estimator is: 0.8045
Validation accuracy for our best estimator is: 0.8385
Training accuracy for our best estimator is: 0.9364


## Deployment

In [52]:
final_prediction = pd.DataFrame(
    final_estimator.predict(deploy_df), index=deploy_df.index, columns=["Survived"]
)
final_prediction.to_csv("result/submission.csv")

# Iteration 3

## Training on full data

In [53]:
X_train, y_train = data_df.drop(columns=["Survived"]), data_df["Survived"]

In [54]:
final_estimator = grid_search.best_estimator_
final_estimator.fit(X_train, y_train)
final_prediction = pd.DataFrame(
    final_estimator.predict(deploy_df), index=deploy_df.index, columns=["Survived"]
)
final_prediction.to_csv("result/submission.csv")

# Iteration 4 - Advanced age imputation

In [109]:
class AgeInpute:
    def __init__(self):
        self.female_median_age = None
        self.male_median_age = None
        self.global_median_age = None

    def fit(self, X):
        self.female_median_age = X.query("Sex == 'female'")["Age"].dropna().median()
        self.male_median_age = X.query("Sex == 'male'")["Age"].dropna().median()
        self.global_median_age = X["Age"].dropna().median()

    def transform(self, X):
        X["def_age"] = pd.Series(
            X_train.Sex.apply(
                lambda x: self.female_median_age
                if x == "female"
                else (self.male_median_age if x == "male" else self.global_median_age)
            ),
            name="def_age",
        )

        X["Age"] = X.Age.combine_first(X.def_age)
        X = X.drop(columns=["def_age"])
        return X
    
    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

In [119]:
X_train, y_train = data_df.drop(columns=["Survived"]), data_df["Survived"]
X_train = X_train.loc[:, ["Sex", "Age"]]
X_train

Unnamed: 0_level_0,Sex,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,22.0
2,female,38.0
3,female,26.0
4,female,35.0
5,male,35.0
...,...,...
887,male,27.0
888,female,19.0
889,female,
890,male,26.0


In [120]:
pipe_model = make_pipeline(
    AgeInpute(),
    RandomForestClassifier(min_samples_split=6, n_jobs=-1, random_state=2020),
)

In [122]:
scores = cross_validate(
    pipe_model, X_train, y_train, cv=5, return_train_score=True, n_jobs=-1
)
scores

{'fit_time': array([0.00084662, 0.00080657, 0.00080538, 0.00107169, 0.00113535]),
 'score_time': array([0., 0., 0., 0., 0.]),
 'test_score': array([nan, nan, nan, nan, nan]),
 'train_score': array([nan, nan, nan, nan, nan])}