# Improved Normalization Workflow for Titanic Dataset

This notebook splits the data into training and test sets before any scaling is fitted, then applies a different scaling technique to each group of columns.

## 1. Import Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## 2. Load Processed Dataset

In [None]:
# Read the processed Titanic CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/titanic_processed.csv',
)
# Kept as the cell's final expression so the notebook displays the preview
# (first five rows, which is the default for head()).
df.head(n=5)

## 3. Split Features and Target

In [None]:
# Target vector: the 'survived' column on its own.
y = df['survived']
# Feature matrix: every column except the target. drop() returns a new frame,
# so df itself is left untouched.
X = df.drop(columns=['survived'])

## 4. Train/Test Split (before scaling)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. The split is done before any scaler is fitted so that the
# test rows do not contribute to the scaling statistics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f'Train shape: {X_train.shape}, Test shape: {X_test.shape}')

## 5. Define Columns for Different Scaling

Example: `age` and `fare` use standard scaling; other numeric features (here `pclass`) use min-max scaling.

In [None]:
# Columns to be rescaled with StandardScaler.
standardize_cols = [
    'age',
    'fare',
]
# Columns to be rescaled with MinMaxScaler
# (assuming pclass is numerical and should be bounded).
normalize_cols = [
    'pclass',
]

All remaining columns (e.g., features that are already one-hot encoded) are left unchanged.

In [None]:
# Every feature column that is in neither scaling group, in the original
# column order of X; these are passed through without scaling.
other_cols = [
    col
    for col in X.columns
    if col not in standardize_cols and col not in normalize_cols
]

## 6. Apply Scaling on Train Data

In [None]:
# Standard scaling: the scaler is fitted on the training rows only, and the
# result is wrapped back into a DataFrame that keeps the training index.
scaler_std = StandardScaler()
X_train_std = pd.DataFrame(
    scaler_std.fit_transform(X_train[standardize_cols]),
    index=X_train.index,
    columns=standardize_cols,
)

# Min-max scaling, likewise fitted on the training rows only.
scaler_minmax = MinMaxScaler()
X_train_minmax = pd.DataFrame(
    scaler_minmax.fit_transform(X_train[normalize_cols]),
    index=X_train.index,
    columns=normalize_cols,
)

# Columns that are passed through without scaling.
X_train_other = X_train[other_cols]

# Stitch the three pieces together column-wise. They share X_train's index,
# so rows line up. Column order becomes: standardized, min-max, other.
X_train_scaled = pd.concat(
    [X_train_std, X_train_minmax, X_train_other],
    axis='columns',
)

## 7. Apply Scaling on Test Data (use same transformers)

In [None]:
# Apply the already-fitted scalers to the test rows. Only transform() is
# called here, so the test data never changes the fitted statistics.
X_test_std = pd.DataFrame(
    scaler_std.transform(X_test[standardize_cols]),
    index=X_test.index,
    columns=standardize_cols,
)
X_test_minmax = pd.DataFrame(
    scaler_minmax.transform(X_test[normalize_cols]),
    index=X_test.index,
    columns=normalize_cols,
)

# Columns that are passed through without scaling.
X_test_other = X_test[other_cols]

# Reassemble column-wise in the same order used for the training set:
# standardized, min-max, other.
X_test_scaled = pd.concat(
    [X_test_std, X_test_minmax, X_test_other],
    axis='columns',
)

## 8. Save Scaled Datasets

In [None]:
# Re-attach the target to each scaled feature set. assign() returns a new
# frame (the scaled feature frames are not modified) and aligns the target
# Series to the rows by index.
train_scaled = X_train_scaled.assign(survived=y_train)
test_scaled = X_test_scaled.assign(survived=y_test)

# Write both sets without the index column.
train_scaled.to_csv(
    '../data/processed/titanic_train_scaled.csv',
    index=False,
)
test_scaled.to_csv(
    '../data/processed/titanic_test_scaled.csv',
    index=False,
)
print('Saved scaled train and test sets.')