In [577]:
import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [578]:
data_original = pd.read_csv("./data/titanic.csv")

data = data_original.copy()

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [579]:
data.rename(
    {
        "PassengerId": "passenger_id",
        "Survived": "target",
        "Pclass": "socioeconomic_status",
        "Name": "name",
        "Sex": "sex",
        "Age": "age",
        "SibSp": "n_siblings_spouses",
        "Parch": "n_parents_children",
        "Ticket": "ticket_number",
        "Fare": "fare",
        "Cabin": "cabin",
        "Embarked": "embark_place",
    },
    axis=1,
    inplace=True,
)

In [580]:
data.dropna(subset=["target"], inplace=True)

X, y = data.drop(["target"], axis=1), data["target"]

X.sample(n=5)

Unnamed: 0,passenger_id,socioeconomic_status,name,sex,age,n_siblings_spouses,n_parents_children,ticket_number,fare,cabin,embark_place
766,767,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C
547,548,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C
214,215,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q
187,188,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45.0,0,0,111428,26.55,,S
429,430,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32.0,0,0,SOTON/O.Q. 392078,8.05,E10,S


In [581]:
def run_feature_engineering(X: pd.DataFrame, y: pd.Series):
    X_new = X.copy()

    X_new = X_new.drop("name", axis=1, errors="ignore")

    X_new["cabin_section"] = X_new["cabin"].str.slice(0, 1)
    X_new["cabin_number"] = X_new["cabin"].str.slice(1)
    X_new = X_new.drop("cabin", axis=1, errors="ignore")

    X_new = X_new.drop("ticket_number", axis=1, errors="ignore")

    y_new = y.copy()

    return X_new, y_new


X, y = run_feature_engineering(X, y)

X.sample(n=5)

Unnamed: 0,passenger_id,socioeconomic_status,sex,age,n_siblings_spouses,n_parents_children,fare,embark_place,cabin_section,cabin_number
580,581,2,female,25.0,1,1,30.0,S,,
436,437,3,female,21.0,2,2,34.375,S,,
109,110,3,female,,1,0,24.15,Q,,
810,811,3,male,26.0,0,0,7.8875,S,,
544,545,1,male,50.0,1,0,106.425,C,C,86.0


In [582]:
print("--------------------------------------------------")
print(f"Total Rows: {len(X)}")
print("--------------------------------------------------")

for label, content in X.items():
    unique = content.unique()
    n_unique = len(unique)
    print(f'Unique values in "{label}": {n_unique}')

print("--------------------------------------------------")

--------------------------------------------------
Total Rows: 891
--------------------------------------------------
Unique values in "passenger_id": 891
Unique values in "socioeconomic_status": 3
Unique values in "sex": 2
Unique values in "age": 89
Unique values in "n_siblings_spouses": 7
Unique values in "n_parents_children": 7
Unique values in "fare": 248
Unique values in "embark_place": 4
Unique values in "cabin_section": 9
Unique values in "cabin_number": 102
--------------------------------------------------


In [583]:
cat_features = [
    "socioeconomic_status",
    "sex",
    "embark_place",
    "cabin_section",
    "cabin_number",
]

num_features = [
    "passenger_id",
    "age",
    "n_siblings_spouses",
    "n_parents_children",
    "fare",
]

In [584]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [585]:
imputer = ColumnTransformer(
    [
        (
            "imputer_num",
            SimpleImputer(strategy="median", add_indicator=True),
            num_features,
        ),
        (
            "imputer_cat",
            SimpleImputer(strategy="constant", add_indicator=False, fill_value="N/A"),
            cat_features,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    verbose=False,
)

In [586]:
scaler = ColumnTransformer(
    [
        (
            "scaler_num",
            MinMaxScaler((0, 1)),
            num_features,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    verbose=False,
)

In [587]:
encoder = ColumnTransformer(
    [
        (
            "encoder_cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            cat_features,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    verbose=False,
)

In [588]:
estimator = LogisticRegression()

In [589]:
pipeline = Pipeline(
    [
        ("imputer", imputer),
        ("scaler", scaler),
        ("encoder", encoder),
        ("estimator", estimator),
    ],
).set_output(transform="pandas")

pipeline.fit(X_train, y_train)

print("------------------------------------------------------------")
print(f"Baseline Score (Test): {pipeline.score(X_test, y_test).round(4)}")
print(f"Baseline Score (Train): {pipeline.score(X_train, y_train).round(4)}")
print("------------------------------------------------------------")

------------------------------------------------------------
Baseline Score (Test): 0.8386
Baseline Score (Train): 0.8293
------------------------------------------------------------


In [590]:
model = GridSearchCV(
    pipeline,
    {
        "estimator__penalty": ["l2"],
        "estimator__C": [0.5, 1, 2],
        "estimator__solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky"],
    },
    cv=5,
    verbose=2,
)

model.fit(X_train, y_train)

print("------------------------------------------------------------")
print(f"Hypertuned Score (Test): {model.score(X_test, y_test).round(4)}")
print(f"Hypertuned Score (Train): {model.score(X_train, y_train).round(4)}")
print(f"Hypertuned Parameters: {model.best_params_}")
print("------------------------------------------------------------")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.0s


[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=liblinear; total time=   0.0s
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=liblinear; total time=   0.0s
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=liblinear; total time=   0.0s
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=liblinear; total time=   0.0s
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=liblinear; total time=   0.0s
[CV] END estimator__C=0.5, estimator__penalty=l2, estimator__solver=newton-cg; total time=   0.0s
[CV] END estimator__C=0.5, estimator

In [591]:
best_estimator_params = model.best_params_

best_estimator = LogisticRegression(
    C=best_estimator_params["estimator__C"],
    penalty=best_estimator_params["estimator__penalty"],
    solver=best_estimator_params["estimator__solver"],
)

best_model = Pipeline(
    [
        ("imputer", imputer),
        ("scaler", scaler),
        ("encoder", encoder),
        ("estimator", estimator),
    ]
)

scores = {
    "accuracy": cross_val_score(
        best_model,
        X,
        y,
        cv=5,
        scoring="accuracy",
    ).mean(),
    "precision": cross_val_score(
        best_model,
        X,
        y,
        cv=5,
        scoring="precision",
    ).mean(),
    "recall": cross_val_score(
        best_model,
        X,
        y,
        cv=5,
        scoring="recall",
    ).mean(),
    "f1": cross_val_score(
        best_model,
        X,
        y,
        cv=5,
        scoring="f1",
    ).mean(),
    "roc_auc": cross_val_score(
        best_model,
        X,
        y,
        cv=5,
        scoring="roc_auc",
    ).mean(),
}

print("------------------------------------------------------------")
print(f"Accuracy: {abs(scores['accuracy'].round(4))}")
print(f"Precision: {abs(scores['precision'].round(4))}")
print(f"Recall: {abs(scores['recall'].round(4))}")
print(f"F1 Score: {scores['f1'].round(4)}")
print(f"ROC AUC: {abs(scores['roc_auc'].round(4))}")
print("------------------------------------------------------------")

------------------------------------------------------------
Accuracy: 0.7957
Precision: 0.7413
Recall: 0.7192
F1 Score: 0.7297
ROC AUC: 0.8504
------------------------------------------------------------
