Importing

In [148]:
import warnings
# import cmle
import random
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score



Preprocessing

In [None]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

X_train = train_df.drop(columns=["Survived"])
y_train = train_df["Survived"]

numeric_features = ["Age", "Pclass", "SibSp", "Parch", "Fare"]
categorical_features = ["Embarked"]  # Removed Cabin and Ticket because they were not helpful
binary_features = ["Sex"]

num_tran = StandardScaler()
cat_tran = make_pipeline(
    SimpleImputer(strategy="most_frequent", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore",sparse_output=False)
)
bin_tran = OneHotEncoder(drop="if_binary",sparse_output=False)

preprocessor = make_column_transformer(
    (num_tran, numeric_features),
    (cat_tran, categorical_features),
    (bin_tran, binary_features),
    remainder = "drop"
)

X_train_transformed = preprocessor.fit_transform(X_train)

Dummy Results

Decision Tree

In [None]:
dt_pipe = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(max_depth=10, random_state=42)
)
dt_pipe.fit(X_train, y_train)
dt_predictions = dt_pipe.predict(test_df)

dt_submission_df = test_df[["PassengerId"]].copy()
dt_submission_df["Survived"] = dt_predictions
dt_submission_df.to_csv("submissions/deliverable_3/dt_submission.csv", index=False)
dt_score = dt_pipe.score(X_train, y_train)
cv_scores_dt = cross_val_score(dt_pipe, X_train, y_train, cv=5)
print(cv_scores_dt)
print(f"Decision Tree Training Accuracy: {dt_score:.4f}")
print(f"Decision Tree CV Average: {cv_scores_dt.mean():.4f}")

[0.77094972 0.79775281 0.80898876 0.75842697 0.81460674]
Decision Tree Training Accuracy: 0.9237
Decision Tree CV Average: 0.7901


## Random Forest

In [None]:
rf_pipe = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42)
)
rf_pipe.fit(X_train, y_train)
rf_predictions = rf_pipe.predict(test_df)

rf_submission_df = test_df[["PassengerId"]].copy()
rf_submission_df["Survived"] = rf_predictions
rf_submission_df.to_csv("submissions/deliverable_3/rf_submission.csv", index=False)
rf_score = rf_pipe.score(X_train, y_train)
cv_scores_rf = cross_val_score(rf_pipe, X_train, y_train, cv=5)
print(f"Random Forest Training Accuracy: {rf_score:.4f}")
print(f"Random Forest CV Average: {cv_scores_rf.mean():.4f}")
print(f"CV Std Dev: {cv_scores_rf.std():.4f}")

Random Forest Training Accuracy: 0.9125
Random Forest CV Average: 0.8227
CV Std Dev: 0.0251
