In [13]:
import os, warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Viz
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_PATH = 'data/train.csv'  # Kaggle Titanic train file
assert os.path.exists(DATA_PATH), f"Expected {DATA_PATH} to exist. Please add Kaggle's train.csv there."
df = pd.read_csv(DATA_PATH)
df.head()


AssertionError: Expected data/train.csv to exist. Please add Kaggle's train.csv there.

In [6]:
df.fillna({
    'Age': df['Age'].median(),
    'Embarked': df['Embarked'].mode()[0]
}, inplace=True)


In [7]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = df[features]
y = df['Survived']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [9]:
num_features = ['Age', 'SibSp', 'Parch', 'Fare']
cat_features = ['Pclass', 'Sex', 'Embarked']

num_pipeline = Pipeline([('scaler', StandardScaler())])
cat_pipeline = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])


In [10]:
pipelines = {
    'RandomForest': Pipeline([
        ('pre', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ]),
    'LogisticRegression': Pipeline([
        ('pre', preprocessor),
        ('clf', LogisticRegression(max_iter=1000))
    ])
}

In [11]:
best_model = None
best_acc = 0

for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{name} accuracy: {acc:.4f}")
    if acc > best_acc:
        best_acc = acc
        best_model = pipe

print(f"Best model accuracy: {best_acc:.4f}")

RandomForest accuracy: 0.8156
LogisticRegression accuracy: 0.7989
Best model accuracy: 0.8156


In [12]:
joblib.dump(best_model, "../model.pkl")
print("Model saved to ../model.pkl")

Model saved to ../model.pkl
