## Import libraries

In [1]:
# Numerics, plotting, and standard-library helpers used by this notebook.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
# Put the parent directory on the import path before importing `src`
# (presumably the project package sits one level above this notebook — confirm).
sys.path.append("..")
# NOTE(review): the wildcard import hides the origin of the helper functions called
# in the cells below (load_data, handle_unknown, ...); consider importing them by name.
from src import *

## Load dataset

In [2]:
# load_data returns, in this order:
#   X_num      - table of numerical data
#   X_cat      - table of categorical data
#   y          - Attrition_Flag (0: Existing, 1: Attrited)
#   client_num - client numbers (presumably per-row identifiers — confirm)
#   num_names  - names of the numerical columns
#   cat_names  - names of the categorical columns
X_num, X_cat, y, client_num, num_names, cat_names = load_data('../data/raw/BankChurners.csv')

# The two tables and the target are processed in parallel by the following steps,
# so fail fast here if they do not describe the same number of rows.
assert len(X_num) == len(X_cat) == len(y), "X_num, X_cat and y must have the same number of rows"

print("Original numerical shape:", X_num.shape)  # expected (10127, 11)
print("Original categorical shape:", X_cat.shape)  # expected (10127, 5)

Original numerical shape: (10127, 11)
Original categorical shape: (10127, 5)


## Preprocessing

### 1. Handle unknown values in categorical features

In [3]:
# Clean the categorical table with handle_unknown (its strategy is defined in src).
X_cat_clean = handle_unknown(X_cat)
# Sanity check: flag every cell that is still an empty string or the "Unknown" label.
leftover_mask = (X_cat_clean == "") | (X_cat_clean == "Unknown")
print("After handling unknown, check if any left:", np.count_nonzero(leftover_mask))

After handling unknown, check if any left: 0


### 2. Handle outliers

In [4]:
# handle_outliers returns a new table; the raw X_num is still read below for comparison.
X_num_no_outlier = handle_outliers(X_num)
# NOTE(review): column 5 is hard-coded and labelled Credit_Limit by the message below —
# confirm the index against num_names.
credit_col = 5
print(
    "Outlier handled. Max Credit_Limit before:",
    np.max(X_num[:, credit_col]),
    "after:",
    np.max(X_num_no_outlier[:, credit_col]),
)

Outlier handled. Max Credit_Limit before: 34516.0 after: 23836.25


### 3. One-hot encode categorical features

In [5]:
# Encode the cleaned categorical table; the helper also returns the names of the encoded columns.
X_cat_encoded, cat_encoded_names = one_hot_encode(X_cat_clean, cat_names)
# Row count is unchanged; presumably one column per distinct category value.
print("One-hot encoded shape:", np.shape(X_cat_encoded))

One-hot encoded shape: (10127, 20)


### 4. Feature engineering on numerical features

In [6]:
# Engineer features from the outlier-handled table so that the outlier treatment from
# step 2 actually reaches the final dataset. Previously the raw X_num was passed here
# and X_num_no_outlier was never used anywhere.
X_num_engineered, new_feature_names = feature_engineering(X_num_no_outlier)
print("After engineering numerical shape:", X_num_engineered.shape)

After engineering numerical shape: (10127, 14)


### 5. Combine all features

In [7]:
# Place the engineered numerical columns first, followed by the one-hot columns.
X_combined = np.hstack([X_num_engineered, X_cat_encoded])
# Build the column names in the same order. This assumes new_feature_names lists only the
# columns that feature_engineering appended to the original numerical ones — TODO confirm.
all_feature_names = num_names + new_feature_names + cat_encoded_names
# The names are saved alongside the matrix in the final step, so a length mismatch
# would silently mislabel columns; stop here instead.
assert len(all_feature_names) == X_combined.shape[1], (
    f"{len(all_feature_names)} feature names for {X_combined.shape[1]} columns"
)
print("Combined features shape:", X_combined.shape)

Combined features shape: (10127, 34)


### 6. Standardize the combined features

In [8]:
# Scale the combined feature matrix with the project's standardize helper.
X_standardized = standardize(X_combined)
# NOTE(review): this runs on all rows before the train/val split in step 7, so the
# validation rows presumably contribute to the scaling statistics — consider deriving
# them from the training rows only.
column_means = X_standardized.mean(axis=0)
# Only the first five column means are shown to keep the output short.
print("Standardized mean (should be ~0):", column_means[:5])

Standardized mean (should be ~0): [-1.50475399e-15  5.73315574e-15 -7.01393016e-15 -1.01932879e-15
 -1.25478114e-15]


### 7. Train-val split

In [9]:
# Split features and target together, requesting a validation fraction of 0.2.
# NOTE(review): no seed is passed — confirm train_val_split is deterministic if the
# saved split must be reproducible.
X_train, X_val, y_train, y_val = train_val_split(X_standardized, y, val_size=0.2)
print("Train shape:", np.shape(X_train), np.shape(y_train))
print("Val shape:", np.shape(X_val), np.shape(y_val))

Train shape: (8102, 34) (8102,)
Val shape: (2025, 34) (2025,)


### 8. Save to processed

In [10]:
# Create the output directory if needed; an existing one is left untouched.
os.makedirs('../data/processed', exist_ok=True)

# Each entry pairs a destination file with the array written to it, in write order.
processed_outputs = [
    ('../data/processed/X_train.npy', X_train),
    ('../data/processed/X_val.npy', X_val),
    ('../data/processed/y_train.npy', y_train),
    ('../data/processed/y_val.npy', y_val),
    # The name list is converted to an array so it can be stored as .npy like the rest.
    ('../data/processed/all_feature_names.npy', np.array(all_feature_names)),
]
for out_path, out_array in processed_outputs:
    np.save(out_path, out_array)

print("Preprocessing complete. Files saved to data/processed/")

Preprocessing complete. Files saved to data/processed/
