In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Load the cleaned training set from the sibling ``data`` directory.
train_path = os.path.join(os.pardir, "data", "train_cleaned.csv")
train_df = pd.read_csv(train_path)

# Feature columns excluded from modelling.
# NOTE(review): the name suggests these were picked for low mutual
# information with the target -- confirm against the feature-selection step.
low_mi_cols = [
    'Title_Rare',
    'Embarked_Q',
    'Deck_G',
    'Deck_F',
    'Deck_C',
    'Pclass_Title_3_Master',
    'Pclass_Title_3_Mrs',
    'Pclass_Title_1_Rare',
    'Pclass_Title_2_Rare',
]

# errors='ignore' skips any listed column that is not present in the frame
# instead of raising, so this cell tolerates a slightly different column set.
train_df_filtered = train_df.drop(low_mi_cols, axis=1, errors='ignore')

# Target is the 'Survived' column; every remaining column is a feature.
y = train_df_filtered['Survived']
X = train_df_filtered.drop('Survived', axis=1)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on the
# target and uses a fixed seed so it is reproducible across runs.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a logistic regression on the training split only; the validation split
# is kept unseen for the evaluation below.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Class predictions for the held-out validation rows.
y_pred = model.predict(X_valid)

# Compute each validation metric once, then report them.
valid_accuracy = accuracy_score(y_valid, y_pred)
valid_confusion = confusion_matrix(y_valid, y_pred)
valid_report = classification_report(y_valid, y_pred)

print("Accuracy:", valid_accuracy)
print("Confusion Matrix:\n", valid_confusion)
print("Classification Report:\n", valid_report)

Accuracy: 0.8324022346368715
Confusion Matrix:
 [[100  10]
 [ 20  49]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87       110
           1       0.83      0.71      0.77        69

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179



In [5]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full feature matrix and target (not just
# the training split), summarised by the mean of the per-fold scores.
cv_scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
mean_cv_score = cv_scores.mean()
print("CV Accuracy (mean):", mean_cv_score)

CV Accuracy (mean): 0.8249262444291006


In [6]:
# Predict on the competition test data and write the submission file.
# test_cleaned.csv supplies the model features; test.csv is read only for
# its PassengerId column.
test_cleaned = pd.read_csv(os.path.join(os.pardir, "data", "test_cleaned.csv"))
test_df = pd.read_csv(os.path.join(os.pardir, "data", "test.csv"))

# Predictions are paired with PassengerId purely by row position, so the two
# files must describe the same rows. Fail with an explicit message if the
# row counts differ.
# NOTE(review): this assumes test_cleaned.csv preserves the row order of
# test.csv -- confirm against the cleaning step.
if len(test_cleaned) != len(test_df):
    raise ValueError(
        "test_cleaned.csv has {} rows but test.csv has {}; predictions "
        "cannot be matched to PassengerId by position".format(
            len(test_cleaned), len(test_df)
        )
    )

# Align test features: same columns, in the same order, as the training
# features. Raises KeyError if a training column is missing from the test set.
X_test = test_cleaned[X.columns]

# NOTE(review): `model` was fit on X_train (the 80% split) only; consider
# refitting on all of X, y before producing the final submission.
test_preds = model.predict(X_test)

# Submit
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_preds
})

# Build the output path the same way as the input paths and make sure the
# target directory exists, so the notebook runs on a fresh checkout.
submission_dir = os.path.join(os.pardir, "models", "logistic-regression")
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(os.path.join(submission_dir, "submission.csv"), index=False)