In [40]:
# Titanic - EDA & Logistic Regression Baseline
# Objective: Exploratory analysis and implementation of logistic regression from scratch
# Input: Raw data from Kaggle (train.csv, test.csv)

import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Make the project's source tree importable. The path is relative, so it is
# resolved against the kernel's working directory when the imports below run.
sys.path.append('../src/')
# Project-local helpers (presumably located under ../src/ — their
# implementations are not visible from this notebook).
from models.logistic_regression import gradient_descent
from utils import predict

In [42]:
# =============================================================================
# 1. LOAD DATA
# =============================================================================

In [44]:
# Read the raw Kaggle CSVs into DataFrames
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

# Report the (rows, columns) of each frame with a single print call
print(
    "Dataset shapes:",
    f"Training set: {train_df.shape}",
    f"Test set: {test_df.shape}",
    sep="\n",
)

Dataset shapes:
Training set: (891, 12)
Test set: (418, 11)


In [46]:
# =============================================================================
# 2. EXPLORATORY DATA ANALYSIS
# =============================================================================

In [None]:
print("\n=== BASIC INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line to the output.
train_df.info()

print("\n=== MISSING VALUES ===")
# Count of null entries per column
print(train_df.isnull().sum())

print("\n=== SURVIVAL RATE BY GENDER ===")
# Mean of the Survived column within each Sex group
print(train_df.groupby('Sex')['Survived'].mean())

print("\n=== SURVIVAL RATE BY CLASS ===")
print(train_df.groupby('Pclass')['Survived'].mean())

# Family size analysis (quick insight).
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themself). Note: this adds a new column to train_df in place.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
print("\n=== SURVIVAL RATE BY FAMILY SIZE ===")
print(train_df.groupby('FamilySize')['Survived'].mean())

In [50]:
# =============================================================================
# 3. PREPARE BASE FEATURES
# =============================================================================

In [52]:
# Simple feature engineering for baseline: encode Sex as 0/1.
# Series.map yields NaN for any value absent from the mapping; the
# missing-value count printed below would surface such rows.
train_df['Sex_numeric'] = train_df['Sex'].map({'male': 0, 'female': 1})

# Select features for baseline model (Sex + Pclass)
X = train_df[['Sex_numeric', 'Pclass']]
y = train_df['Survived']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Missing values in features: {X.isnull().sum().sum()}")

# The train/validation split is intentionally not performed in this cell:
# the dedicated "4. PREPARE DATA" cell runs it with the same arguments, so
# doing it here as well was redundant.


Feature matrix shape: (891, 2)
Target shape: (891,)
Missing values in features: 0


In [54]:
# =============================================================================
# 4. PREPARE DATA
# =============================================================================

In [56]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# =============================================================================
# 5. RUN MODEL
# =============================================================================

In [60]:
# Parameters
# One initial weight per feature column, derived from the training matrix
# rather than hard-coded, so this cell keeps working when the feature set
# selected for X changes.
initial_w = np.zeros(X_train.shape[1])
initial_b = 0.
alpha = 0.1  # learning rate
num_iters = 1000  # iteration count passed to gradient_descent

# Run gradient_descent (project-local implementation). It returns the final
# weights, the final bias, and a third value stored as J_history.
w_final, b_final, J_history = gradient_descent(X_train, y_train, initial_w, initial_b, alpha, num_iters)

print(f"\nParâmetros finais:")
print(f"w: {w_final}")
print(f"b: {b_final}")

Iteration    0: Cost 0.675175
Iteration  100: Cost 0.528647
Iteration  200: Cost 0.492328
Iteration  300: Cost 0.478762
Iteration  400: Cost 0.473093
Iteration  500: Cost 0.470536
Iteration  600: Cost 0.469318
Iteration  700: Cost 0.468711
Iteration  800: Cost 0.468398
Iteration  900: Cost 0.468231

Parâmetros finais:
w: Sex_numeric    2.559679
Pclass        -0.865522
dtype: float64
b: 0.46693049304425166


In [62]:
# =============================================================================
# 6. TEST WITH VALIDATION DATASET
# =============================================================================

In [64]:
# Score the held-out validation rows with the fitted parameters
predictions = predict(X_val, w_final, b_final)

# Accuracy is the fraction of predictions that equal the true label
correct_mask = predictions == y_val
accuracy = correct_mask.mean()
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.782


In [68]:
# =============================================================================
# 7. COMPARISON WITH SKLEARN
# =============================================================================

In [70]:
# LogisticRegression is already imported in the notebook's first (imports)
# cell, so it is not re-imported here.
# NOTE(review): scikit-learn's default LogisticRegression applies L2
# regularization (C=1.0); whether the from-scratch gradient_descent
# regularizes is not visible here — confirm before comparing coefficients.
sklearn_model = LogisticRegression()
sklearn_model.fit(X_train, y_train)
sklearn_pred = sklearn_model.predict(X_val)
# Accuracy is the fraction of validation predictions equal to the true label
sklearn_acc = (sklearn_pred == y_val).mean()
print(f"Sklearn accuracy: {sklearn_acc:.3f}")

Sklearn accuracy: 0.782
