In [None]:
# Titanic - EDA & Feature Engineering
# Objective: Exploratory analysis and implementation of logistic regression from scratch
# Input: Raw data from Kaggle (train.csv, test.csv)

import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

sys.path.append('../src/')
from models.logistic_regression import gradient_descent
from utils import sex_mapping, predict, extract_title, group_titles, group_mapping

In [None]:
# =============================================================================
# 1. LOAD DATA
# =============================================================================

In [None]:
# Read the Kaggle train and test splits from the raw-data folder
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

# Report the (rows, columns) shape of each split, one line per item
print(
    "Dataset shapes:",
    f"Training set: {train_df.shape}",
    f"Test set: {test_df.shape}",
    sep="\n",
)

In [None]:
# =============================================================================
# 2. EXPLORATORY DATA ANALYSIS
# =============================================================================

In [None]:
# Derive a Title for every passenger by running extract_title over the Name column
passenger_titles = train_df['Name'].map(extract_title)
train_df['Title'] = passenger_titles
train_df.head()

In [None]:
# Count rows whose Title is null, i.e. names for which no title was extracted
train_df['Title'].isna().sum()

In [None]:
# Number of rows with a missing Age value
train_df['Age'].isna().sum()

In [None]:
print("\n=== SURVIVAL RATE BY Title ===")
# Mean of Survived (the survival rate) and row count for each Title
survival_by_title = train_df.groupby('Title')['Survived']
survival_by_title.agg(['mean', 'count'])

# Family size analysis (quick insight)
# train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
# print("\n=== SURVIVAL RATE BY FAMILY SIZE ===")
# print(train_df.groupby('FamilySize')['Survived'].mean())

In [None]:
# Collapse the raw titles into broader groups using group_titles
title_groups = train_df['Title'].apply(group_titles)
train_df['Title_Group'] = title_groups
train_df.head()

In [None]:
# How many rows fall into each title group
group_sizes = train_df['Title_Group'].value_counts()
print(group_sizes)

# Mean of Survived per title group, highest first
print("\nSurvival rate por grupo:")
survival_by_group = train_df.groupby('Title_Group')['Survived'].mean()
print(survival_by_group.sort_values(ascending=False))

In [None]:
# Age is a candidate feature, so count how many of its values are missing
train_df['Age'].isna().sum()

In [None]:
# Box plot of Age split by the Survived flag.
# The figure is labelled so it can be read on its own, and the trailing ';'
# keeps the notebook from echoing the return value of ax.set().
ax = sns.boxplot(x='Survived', y='Age', data=train_df)
ax.set(title='Age distribution by Survived', xlabel='Survived', ylabel='Age');

In [None]:
# Summary statistics of Age within each Survived class
age_by_outcome = train_df.groupby('Survived')['Age']
print(age_by_outcome.describe())

In [None]:
# =============================================================================
# 3. PREPARE BASE FEATURES
# =============================================================================

In [11]:
# Rebuild Title and Title_Group here so the feature section starts from the Name column
name_titles = train_df['Name'].map(extract_title)
train_df['Title'] = name_titles
train_df['Title_Group'] = name_titles.apply(group_titles)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_numeric,Title,Title_Group
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr,Adult_Male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,Married_Female
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss,Single_Female
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs,Married_Female
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr,Adult_Male


In [23]:
# Simple feature engineering for baseline: encode the categorical columns as numbers
train_df['Sex_numeric'] = train_df['Sex'].map(sex_mapping)
train_df['Title_Group_numeric'] = train_df['Title_Group'].map(group_mapping)

# Features for the baseline model: Sex, Pclass and the grouped Title
feature_cols = ['Sex_numeric', 'Pclass', 'Title_Group_numeric']
X = train_df[feature_cols]
y = train_df['Survived']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Missing values in features: {X.isnull().sum().sum()}")

# A category not covered by the mappings would typically surface as NaN here;
# fail now rather than feed NaN into training.
missing_per_feature = X.isnull().sum()
assert missing_per_feature.sum() == 0, (
    f"Features contain missing values:\n{missing_per_feature[missing_per_feature > 0]}"
)

# The train/validation split is done once, in the "PREPARE DATA" section below.


Feature matrix shape: (891, 3)
Target shape: (891,)
Missing values in features: 0


In [25]:
# =============================================================================
# 4. PREPARE DATA
# =============================================================================

In [27]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# =============================================================================
# 5. RUN MODEL
# =============================================================================

In [31]:
# Hyperparameters
alpha = 0.1  # learning rate
num_iters = 1000  # iteration count passed to gradient_descent

# Zero initialisation: one weight per feature column, sized from the data so
# it keeps working if the feature set changes (previously hard-coded to 3).
initial_w = np.zeros(X_train.shape[1])
initial_b = 0.

# Run gradient_descent; it returns the fitted weights, the bias and a third
# value stored as J_history (presumably the per-iteration cost — confirm in
# models/logistic_regression).
w_final, b_final, J_history = gradient_descent(X_train, y_train, initial_w, initial_b, alpha, num_iters)

print("\nFinal values:")
print(f"w: {w_final}")
print(f"b: {b_final}")

Iteration    0: Cost 0.674009
Iteration  100: Cost 0.508937
Iteration  200: Cost 0.487700
Iteration  300: Cost 0.476904
Iteration  400: Cost 0.471203
Iteration  500: Cost 0.468133
Iteration  600: Cost 0.466444
Iteration  700: Cost 0.465493
Iteration  800: Cost 0.464948
Iteration  900: Cost 0.464628

Parâmetros finais:
w: Sex_numeric            2.268361
Pclass                -0.843786
Title_Group_numeric    0.225303
dtype: float64
b: 0.31753008593804194


In [None]:
# =============================================================================
# 6. TEST WITH VALIDATION DATASET
# =============================================================================

In [33]:
# Score the validation rows with the fitted weights and bias
predictions = predict(X_val, w_final, b_final)

# Accuracy = fraction of validation rows where the prediction equals the label
is_correct = predictions == y_val
accuracy = is_correct.mean()
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.782


In [None]:
# =============================================================================
# 7. COMPARISON WITH SKLEARN
# =============================================================================

In [35]:
# Fit scikit-learn's LogisticRegression on the same split as a reference point.
# LogisticRegression is already imported in the notebook's first cell, so it is
# not re-imported here.
# Note: sklearn applies L2 regularisation by default (C=1.0), so its
# coefficients need not match the from-scratch weights even when accuracy agrees.
sklearn_model = LogisticRegression()
sklearn_model.fit(X_train, y_train)
sklearn_pred = sklearn_model.predict(X_val)

# Same accuracy definition as the from-scratch model: share of correct predictions
sklearn_acc = (sklearn_pred == y_val).mean()
print(f"Sklearn accuracy: {sklearn_acc:.3f}")

Sklearn accuracy: 0.782
