In [157]:
# Titanic - EDA & Feature Engineering
# Objective: Exploratory analysis and implementation of logistic regression from scratch
# Input: Raw data from Kaggle (train.csv, test.csv)

import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

sys.path.append('../src/')
from models.basic_logistic_regression import gradient_descent as basic_gradient_descent
from models.logistic_regression import gradient_descent, StandardScaler
from utils import load_data, sex_mapping, predict, extract_title, group_titles, group_mapping, impute_null_age_strategict

In [117]:
# =============================================================================
# 1. LOAD DATA
# =============================================================================

In [119]:
# Read the Kaggle train/test frames through the project loader
train_df, test_df = load_data()

# Report (rows, columns) for both frames in a single print call
print(
    "Dataset shapes:",
    f"Training set: {train_df.shape}",
    f"Test set: {test_df.shape}",
    sep="\n",
)

Dataset shapes:
Training set: (891, 12)
Test set: (418, 11)


In [121]:
# =============================================================================
# 2. PREPARE BASE FEATURES
# =============================================================================

In [123]:
# Run the project's age-imputation helper on the raw training frame; the result
# is kept under a new name so train_df itself is not rebound.
# NOTE(review): presumably fills missing Age values — the strategy lives in
# utils.impute_null_age_strategict and is not visible here; confirm it does not
# rely on the Survived column.
train_df_age_imputed = impute_null_age_strategict(train_df)

In [125]:
# Work on an independent (deep) copy so later column additions do not touch
# the imputed frame
df = train_df_age_imputed.copy(deep=True)

In [127]:
# Simple feature engineering for baseline: encode Sex as a number via the
# project-level sex_mapping.
df['Sex_numeric'] = df['Sex'].map(sex_mapping)

# Select features for baseline model (Sex + Pclass + Age)
X = df[['Sex_numeric', 'Pclass', 'Age']]
y = df['Survived']

# Sanity report: shapes and total count of nulls left in the feature matrix
# (values of Sex missing from sex_mapping would show up here as NaN).
print(f"\nFeature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Missing values in features: {X.isnull().sum().sum()}")

# The train/validation split is performed once, in the "PREPARE DATA" section
# below; the identical split that used to be repeated here was redundant.


Feature matrix shape: (891, 3)
Target shape: (891,)
Missing values in features: 0


In [129]:
# =============================================================================
# 4. PREPARE DATA
# =============================================================================

In [131]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [133]:
# =============================================================================
# 5. RUN MODEL
# =============================================================================

In [135]:
# ====================================== RUN BASIC MODEL ======================================

In [182]:
# Parameters for the basic gradient-descent run.
# The weight vector is sized from the training matrix instead of a hard-coded 3,
# so this cell keeps working if the feature list changes.
initial_w = np.zeros(X_train.shape[1])
initial_b = 0.
alpha = 0.1  # learning rate
num_iters = 1000

# Run gradient_descent (basic implementation) on the training split as-is.
# NOTE(review): X_train is passed unscaled here; presumably that is the intended
# contrast with the improved model — confirm.
w_final1, b_final1, J_history1 = basic_gradient_descent(X_train, y_train, initial_w, initial_b, alpha, num_iters)

# Show the learned parameters
print("\nFinal values:")
print(f"w: {w_final1}")
print(f"b: {b_final1}")

Iteration    0: Cost 4.296151
Iteration  100: Cost 12.053544
Iteration  200: Cost 11.025234
Iteration  300: Cost 0.808912
Iteration  400: Cost 2.462641
Iteration  500: Cost 0.609974
Iteration  600: Cost 11.293695
Iteration  700: Cost 3.008792
Iteration  800: Cost 4.019144
Iteration  900: Cost 12.008382

Final values:
w: Sex_numeric    10.279541
Pclass         -4.050736
Age            -1.260915
dtype: float64
b: 2.2064255395874395


In [184]:
# ====================================== RUN IMPROVED MODEL ======================================

In [186]:
# Parameters for the improved gradient-descent run.
# The weight vector is sized from the training matrix instead of a hard-coded 3,
# so this cell keeps working if the feature list changes.
initial_w = np.zeros(X_train.shape[1])
initial_b = 0.
alpha = 0.1  # learning rate
num_iters = 1000

# Run gradient_descent (improved implementation) on the same training split
w_final2, b_final2, J_history2 = gradient_descent(X_train, y_train, initial_w, initial_b, alpha, num_iters)

# Show the learned parameters
print("\nFinal values:")
print(f"w: {w_final2}")
print(f"b: {b_final2}")

Iteration    0: Cost 0.682374
Iteration  100: Cost 0.469278
Iteration  200: Cost 0.461979
Iteration  300: Cost 0.461115
Iteration  400: Cost 0.460973
Iteration  500: Cost 0.460947
Iteration  600: Cost 0.460942
Iteration  700: Cost 0.460941
Iteration  800: Cost 0.460941
Iteration  900: Cost 0.460941

Final values:
w: Sex_numeric    1.216260
Pclass        -0.880432
Age           -0.328467
dtype: float64
b: -0.6830049568371815


In [188]:
# =============================================================================
# 6. TEST WITH VALIDATION DATASET
# =============================================================================

In [190]:
# Score the basic model's weights on the raw validation features
predictions1 = predict(X_val, w_final1, b_final1)

# Accuracy = fraction of validation rows whose prediction equals the label
hits_basic = predictions1 == y_val
accuracy_basic_model = hits_basic.mean()

In [192]:
# Predict with validation set.
# The scaling statistics must come from the training split. Fitting the scaler
# on X_val would standardise validation rows with their own mean/std, which
# does not match the statistics of the data the weights were trained on and
# lets validation data influence its own preprocessing.
# NOTE(review): assumes gradient_descent standardises X_train the same way
# internally, and that the project StandardScaler exposes transform() next to
# fit_transform() — confirm against models/logistic_regression.
scaler = StandardScaler()
scaler.fit_transform(X_train)  # fit on training data only; transformed output not needed here
X_scaled = scaler.transform(X_val)
predictions2 = predict(X_scaled, w_final2, b_final2)

# Calculate accuracy
accuracy_improved_model = (predictions2 == y_val).mean()

In [194]:
# Reference point: scikit-learn's LogisticRegression with default settings,
# trained on the same split (fit returns the fitted estimator itself)
sklearn_model = LogisticRegression().fit(X_train, y_train)

# Validation accuracy, computed the same way as for the hand-written models
sklearn_pred = sklearn_model.predict(X_val)
sklearn_acc = (sklearn_pred == y_val).mean()

In [196]:
# =============================================================================
# 7. COMPARISON
# =============================================================================

In [198]:
# Side-by-side validation accuracy of the three models, three decimals each
print(
    f"Accuracy Basic Model: {accuracy_basic_model:.3f}",
    f"Accuracy Improved Model: {accuracy_improved_model:.3f}",
    f"Sklearn accuracy: {sklearn_acc:.3f}",
    sep="\n",
)

Accuracy Basic Model: 0.587
Accuracy Improved Model: 0.810
Sklearn accuracy: 0.810
