In [1]:
# Titanic - EDA & Feature Engineering
# Objective: Exploratory analysis and implementation of logistic regression from scratch
# Input: Raw data from Kaggle (train.csv, test.csv)

import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

sys.path.append('../src/')
from models.logistic_regression import gradient_descent
from utils import sex_mapping, predict, extract_title, group_titles, group_mapping, impute_null_age_strategict

In [None]:
# =============================================================================
# 1. LOAD DATA
# =============================================================================

In [3]:
# Load datasets
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

print("Dataset shapes:")
print(f"Training set: {train_df.shape}")
print(f"Test set: {test_df.shape}")

Dataset shapes:
Training set: (891, 12)
Test set: (418, 11)


In [None]:
# =============================================================================
# 2. PREPARE BASE FEATURES
# =============================================================================

In [5]:
train_df_age_imputed = impute_null_age_strategict(train_df)

In [9]:
df = train_df_age_imputed.copy()

In [16]:
# Simple feature engineering for baseline
df['Sex_numeric'] = df['Sex'].map(sex_mapping)

# Select features for baseline model (Sex + Pclass)
X = df[['Sex_numeric', 'Pclass', 'Age']]
y = df['Survived']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Missing values in features: {X.isnull().sum().sum()}")

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


Feature matrix shape: (891, 3)
Target shape: (891,)
Missing values in features: 0


In [18]:
# =============================================================================
# 4. PREPARE DATA
# =============================================================================

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# =============================================================================
# 5. RUN MODEL
# =============================================================================

In [22]:
# Parameters
initial_w = np.zeros(3)
initial_b = 0.
alpha = 0.1  # learning rate
num_iters = 1000

# Run gradient_descent
w_final, b_final, J_history = gradient_descent(X_train, y_train, initial_w, initial_b, alpha, num_iters)

print(f"\nFinal values:")
print(f"w: {w_final}")
print(f"b: {b_final}")

Iteration    0: Cost 4.296151
Iteration  100: Cost 12.053544
Iteration  200: Cost 11.025234
Iteration  300: Cost 0.808912
Iteration  400: Cost 2.462641
Iteration  500: Cost 0.609974
Iteration  600: Cost 11.293695
Iteration  700: Cost 3.008792
Iteration  800: Cost 4.019144
Iteration  900: Cost 12.008382

Final values:
w: Sex_numeric    10.279541
Pclass         -4.050736
Age            -1.260915
dtype: float64
b: 2.2064255395874395


In [None]:
# =============================================================================
# 6. TEST WITH VALIDATION DATASET
# =============================================================================

In [24]:
# Predict with validation set
predictions = predict(X_val, w_final, b_final)

# Calculate accuracy
accuracy = (predictions == y_val).mean()
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.587


In [None]:
# =============================================================================
# 7. COMPARASION WITH SKLEARN
# =============================================================================

In [26]:
sklearn_model = LogisticRegression()
sklearn_model.fit(X_train, y_train)
sklearn_pred = sklearn_model.predict(X_val)
sklearn_acc = (sklearn_pred == y_val).mean()
print(f"Sklearn accuracy: {sklearn_acc:.3f}")

Sklearn accuracy: 0.810
