In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Read the cleaned training set from ../data (resolved against the current working directory).
train_df = pd.read_csv(
    os.path.join(os.pardir, "data", "train_cleaned.csv")
)

# 'Survived' is the target; every remaining column is used as a feature.
y = train_df['Survived']
X = train_df.drop('Survived', axis=1)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on y and
# uses a fixed seed so that re-running the cell reproduces the same partitions.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build the classifier and fit it on the training partition
# (fit() returns the estimator, so `model` is the fitted instance).
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Label the held-out validation rows
y_pred = model.predict(X_valid)

# Print each validation metric under its heading, in a fixed order
for heading, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric_fn(y_valid, y_pred))

Accuracy: 0.8100558659217877
Confusion Matrix:
 [[96 14]
 [20 49]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.71      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



In [5]:
from sklearn.model_selection import cross_val_score

# Score the estimator with 5-fold cross-validation over the full X / y
# (not just the training partition) and report the mean of the fold scores.
cv_scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
print("CV Accuracy (mean):", cv_scores.mean())

CV Accuracy (mean): 0.8136902893729208


## Model Evaluation Summary

### Accuracy

- Validation Accuracy: 81.0%
- Cross-Validation Accuracy (5-fold mean): 81.4%

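- Results are consistent: the two estimates agree to within 0.4 percentage points, so the hold-out score does not appear to depend on one particular split — the model generalizes well.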

### Confusion Matrix

|  Metric  |  Meaning  |
|----|----|
|  TN = 96  |  Predicted non-survivor correctly  |
|  FP = 14  |  Predicted survivor, but they didn't survive  |
|  FN = 20  |  Predicted non-survivor, but they did survive  |
|  TP = 49  |  Correctly predicted survivors  |

### Classification Report

|  Class  |  Precision  |  Recall |  F1-Score  |
|----|----|----|----|
|  0 (Died)  |  0.83  |  0.87  |  0.85  |
|  1 (Survived)  |  0.78  |  0.71  |  0.74  |

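- Good precision and recall for both classes (all between 0.71 and 0.87)
- Fairly balanced model — no major bias toward survivors or non-survivors, although it misses actual survivors (20 false negatives) more often than it wrongly predicts survival (14 false positives)
- F1 for class 1 (survivors) could still improve; it is held back mainly by the lower recall (0.71)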


In [6]:
# Predict on test data
# test_cleaned.csv holds the model features; the raw test.csv is read only
# because it carries the PassengerId column needed for the submission.
data_dir = os.path.join(os.pardir, "data")
test_cleaned = pd.read_csv(os.path.join(data_dir, "test_cleaned.csv"))
test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))

# Select exactly the columns the model was trained on, in training order.
# A column missing from the cleaned test set fails here with a KeyError that
# names it, and extra or reordered columns never reach the model.
test_features = test_cleaned[X_train.columns]
test_preds = model.predict(test_features)

# IDs and predictions are paired purely by row position, so both files must
# describe the same rows. Fail loudly if the cleaning step dropped any.
if len(test_df) != len(test_preds):
    raise ValueError(
        f"Row count mismatch: test.csv has {len(test_df)} rows but "
        f"{len(test_preds)} predictions were produced from test_cleaned.csv"
    )

# Prepare submission
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],  # must come from raw test.csv
    'Survived': test_preds
})

# to_csv does not create missing directories, so make sure the target exists
# (a fresh checkout may not have it yet).
submission_dir = os.path.join(os.pardir, "models", "logistic-regression")
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(os.path.join(submission_dir, "submission.csv"), index=False)