# Strategy for KNN

I will definitely need to randomly undersample and SMOTE-oversample the training set (TRAIN ONLY) to keep the classes balanced.

Use the correlation between the features of the dataset.

In [9]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline # Replaces sklearn Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

from tab_transformer_pytorch import TabTransformer

# load the dataset and fix values

In [10]:
# Pick the compute device: Apple MPS is probed first, then CUDA, and the CPU
# is used when neither accelerator reports itself as available.
_device_candidates = (
    ("mps", torch.backends.mps.is_available, "MPS device is available."),
    ("cuda", torch.cuda.is_available, "CUDA device is available."),
)
for _backend_name, _is_available, _message in _device_candidates:
    # Probes are called lazily, so CUDA is only queried when MPS is absent.
    if _is_available():
        print(_message)
        device = torch.device(_backend_name)
        break
else:
    print("No GPU acceleration available.")
    device = torch.device("cpu")

# Fix the seed to have deterministic behaviour
def fix_random(seed: int) -> None:
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch, then switches cuDNN to its deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not only the current one; this call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

# Seed all generators once, before any stochastic step runs.
SEED = 1337
fix_random(SEED)

# Lift pandas' display truncation for both columns and rows.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

# Read the raw training data.
DATASET_PATH = "dataset_train/dataset.csv"
dataset = pd.read_csv(DATASET_PATH, sep=",")

print(f"Shape of the dataset: {dataset.shape}")

# Rows that are exact copies of an earlier row.
duplicates = dataset.loc[dataset.duplicated()]
print(f"Number of duplicates in the dataset: {len(duplicates)}")

MPS device is available.
Shape of the dataset: (148301, 145)
Number of duplicates in the dataset: 0


## split the dataset

In [11]:
# Integer class label for each loan grade: "A" -> 6 down to "G" -> 0.
GRADE_TO_LABEL = {"A": 6, "B": 5, "C": 4, "D": 3, "E": 2, "F": 1, "G": 0}

X = dataset.drop(columns=["grade"])
y = dataset["grade"].map(GRADE_TO_LABEL)

# Series.map() yields NaN for any value that is not a key of the mapping.
# Fail loudly here instead of letting NaN labels reach the stratified split
# and the later integer casts.
_unmapped_grades = dataset.loc[y.isna(), "grade"].unique()
assert len(_unmapped_grades) == 0, f"Unexpected grade values: {_unmapped_grades.tolist()}"

# Stratify on the label so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
class NumericExtractor(BaseEstimator, TransformerMixin):
    """Replace every column by the first run of digits found in each value.

    Values are stringified first; entries without any digit become NaN.
    The result is returned as float columns on a copy of the input.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X):
        extracted = X.copy()
        for name in extracted.columns:
            as_text = extracted[name].astype(str)
            # expand=False returns the single capture group as a Series.
            first_digits = as_text.str.extract(r"(\d+)", expand=False)
            extracted[name] = first_digits.astype(float)
        return extracted

class CyclicalDateEncoder(BaseEstimator, TransformerMixin):
    """Turn "%b-%Y" date strings into a year plus sine/cosine of the month.

    Each input column ``c`` is replaced by ``c_year``, ``c_month_sin`` and
    ``c_month_cos``; the original column is removed.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X):
        encoded = X.copy()
        # Snapshot the column names: the frame gains and loses columns below.
        for name in list(encoded.columns):
            # Values that do not match the format are coerced to NaT.
            parsed = pd.to_datetime(encoded[name], format="%b-%Y", errors="coerce")
            # NaT propagates as NaN into the derived columns.
            month_angle = 2 * np.pi * parsed.dt.month / 12

            encoded[f"{name}_year"] = parsed.dt.year
            encoded[f"{name}_month_sin"] = np.sin(month_angle)
            encoded[f"{name}_month_cos"] = np.cos(month_angle)

            encoded = encoded.drop(columns=[name])
        return encoded
    
class BinaryModeEncoder(BaseEstimator, TransformerMixin):
    """Encode each value as 0 if it equals the column's fitted mode, else 1.

    The mode of every column is learned in ``fit`` and stored in ``modes_``.
    Missing values never compare equal to the mode, so they are encoded as 1.
    """

    def fit(self, X, y=None):
        """Learn the most frequent value of every column of ``X``.

        ``modes_`` is rebuilt from scratch on every call so that refitting on
        a frame with different columns does not keep stale entries.
        """
        modes = {}
        for col in X.columns:
            col_modes = X[col].mode()
            # mode() is empty when the column holds only missing values; use
            # NaN so that every value of such a column is encoded as 1.
            modes[col] = col_modes.iloc[0] if not col_modes.empty else np.nan
        self.modes_ = modes
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by 0/1 flags."""
        X_copy = X.copy()
        for col, mode in self.modes_.items():
            # Apply: 1 if NOT the mode (least frequent), 0 if mode
            X_copy[col] = (X_copy[col] != mode).astype(int)
        return X_copy
    
class HighMissingDropper(BaseEstimator, TransformerMixin):
    """Remove columns whose share of missing values exceeds a threshold.

    The set of columns to remove is decided in ``fit`` and then applied
    unchanged to every frame passed to ``transform``.

    Args:
        threshold: Maximum tolerated percentage (0-100) of missing values.
    """

    def __init__(self, threshold=20):
        self.threshold = threshold
        self.cols_to_drop_ = []

    def fit(self, X, y=None):
        # Percentage of missing entries per column of the frame being fitted.
        pct_missing = X.isna().mean().mul(100)
        exceeds_threshold = pct_missing > self.threshold
        self.cols_to_drop_ = pct_missing.index[exceeds_threshold].tolist()
        return self

    def transform(self, X):
        # Copy first so the result never shares data with the caller's frame.
        return X.copy().drop(columns=self.cols_to_drop_)

In [13]:
# Columns explicitly dropped by the preprocessor.
redundant_cols = ["loan_title"]

# Columns routed through BinaryModeEncoder (0 = mode, 1 = anything else).
binary_cols = [
    "loan_payment_plan_flag",
    "listing_initial_status",
    "application_type_label",
    "hardship_flag_indicator",
    "disbursement_method_type",
    "debt_settlement_flag_indicator",
]

# Multi-valued categorical columns.
one_hot_encoding_cols = [
    "borrower_housing_ownership_status",
    "borrower_income_verification_status",
    "loan_status_current_code",
    "loan_purpose_category",
    "borrower_address_state",
]

# Columns whose numeric part is pulled out of a string by NumericExtractor.
extract_fields = [
    "loan_contract_term_months",
    "borrower_profile_employment_length",
    "borrower_address_zip",
]

# Date-like columns.
date_fields = [
    "loan_issue_date",
    "credit_history_earliest_line",
    "last_payment_date",
    "last_credit_pull_date",
]

In [14]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold, cross_val_score

# --- 1. Define Encoders ---

def _make_ordinal_encoder():
    """Return a new, unfitted OrdinalEncoder for one categorical sub-pipeline.

    Categories not seen during fit are encoded as -1; the downstream tensor
    preparation shifts every code by +1 so that "unknown" becomes index 0.
    """
    return OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, dtype=np.float32)

# Kept so that any cell still referring to `ord_encoder` keeps working.
# The pipelines below each get their own instance instead of sharing this one,
# so fitting one of them directly can never overwrite another's categories.
ord_encoder = _make_ordinal_encoder()

# Numeric: Extract digits -> Impute -> Scale
numeric_pipe = SkPipeline([
    ('extract', NumericExtractor()),
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical: Impute with a constant -> Ordinal Encode (integer codes)
categorical_pipe = SkPipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', _make_ordinal_encoder())
])

# Dates are encoded exactly like the categoricals: every distinct raw value
# becomes its own integer code.
# (Assuming dates are feature-engineered or low cardinality. If raw dates, this is risky)
date_pipe = SkPipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', _make_ordinal_encoder())
])

binary_pipe = SkPipeline([
    ('binary_enc', BinaryModeEncoder()), # Ensure this outputs identifiable categories
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ordinal', _make_ordinal_encoder())
])

# --- 2. Build Preprocessor ---
# CRITICAL: We group all categorical outputs FIRST in the transformers list.
# This ensures the first N columns of our output matrix are categories, and the rest are continuous.

preprocessor = ColumnTransformer(
    transformers=[
        # --- GROUP A: CATEGORICALS ---
        ('cat_pipe', categorical_pipe, one_hot_encoding_cols),
        ('date_pipe', date_pipe, date_fields),
        ('bin_pipe', binary_pipe, binary_cols),

        # --- GROUP B: CONTINUOUS ---
        ('num_pipe', numeric_pipe, extract_fields),

        # Drop redundant
        ('drop_redundant', 'drop', redundant_cols)
    ],
    # Drop anything else (or passthrough if you are sure)
    remainder='drop'
)

# Main Pipeline (no global scaler: scaling happens inside numeric_pipe only)
pipeline = SkPipeline([
    ('dropper', HighMissingDropper(threshold=20)),
    ('prep', preprocessor),
])

# Fit every preprocessing step on the training split, then apply the fitted
# pipeline unchanged to the held-out split.
X_clean_train = pipeline.fit_transform(X_train, y_train)
X_clean_test = pipeline.transform(X_test)

print(X_clean_train.shape)

# Whole-matrix float32 tensors (all output columns together).
X_train_tensor, X_test_tensor = (
    torch.tensor(matrix, dtype=torch.float32)
    for matrix in (X_clean_train, X_clean_test)
)
# Labels as float32 tensors.
Y_train_tensor, y_test_tensor = (
    torch.tensor(labels.values, dtype=torch.float32)
    for labels in (y_train, y_test)
)

# Unshuffled loader over the held-out split with the default batch size.
test_loader = DataLoader(
    TensorDataset(X_test_tensor, y_test_tensor),
    shuffle=False,
)


(118640, 18)


In [16]:
# --- Step A: Get Vocab Sizes for the Model ---
def get_vocab_sizes(pipe_name, fitted_pipeline=None):
    """Return the embedding vocabulary size of every column of one sub-pipeline.

    Args:
        pipe_name: Name of the transformer inside the ColumnTransformer step
            'prep' (for example 'cat_pipe').
        fitted_pipeline: Fitted pipeline to inspect. Defaults to the
            notebook-level ``pipeline`` so existing one-argument calls keep
            working.

    Returns:
        A list with one integer per encoded column: the number of categories
        learned by the sub-pipeline's 'ordinal' step, plus one.
    """
    if fitted_pipeline is None:
        fitted_pipeline = pipeline
    # Access: Pipeline -> ColumnTransformer -> Sub-Pipeline -> OrdinalEncoder
    enc = fitted_pipeline.named_steps['prep'].named_transformers_[pipe_name].named_steps['ordinal']
    # Add +1 to every count to reserve index 0 for "unknown"
    return [len(cats) + 1 for cats in enc.categories_]

# Vocabulary sizes per group, in the same order as the categorical groups are
# listed in the ColumnTransformer.
cat_vocab, date_vocab, bin_vocab = (
    get_vocab_sizes(group_name)
    for group_name in ('cat_pipe', 'date_pipe', 'bin_pipe')
)

categories_tuple = (*cat_vocab, *date_vocab, *bin_vocab)
num_cat_cols = len(categories_tuple)
# Every remaining output column is treated as continuous.
num_cont_cols = X_clean_train.shape[1] - num_cat_cols
# One output logit per distinct training label.
dim_out = np.unique(y_train).size

print(f"Categories: {categories_tuple}")
print(f"Split: {num_cat_cols} Categorical, {num_cont_cols} Continuous")

# --- Step B: Helper to Split and Shift Data ---
def prepare_tensors(X_data, y_data, n_categorical=None):
    """Split a preprocessed matrix into model inputs and build the label tensor.

    Args:
        X_data: 2-D array-like whose first ``n_categorical`` columns hold
            ordinal category codes (-1 meaning "unknown") and whose remaining
            columns hold continuous features.
        y_data: Integer class labels (pandas Series or any array-like).
        n_categorical: Number of leading categorical columns. Defaults to the
            notebook-level ``num_cat_cols`` so existing two-argument calls
            keep working.

    Returns:
        Tuple ``(x_categ, x_cont, y)``: int64 codes shifted by +1, float32
        continuous features, and int64 labels.
    """
    if n_categorical is None:
        n_categorical = num_cat_cols

    # Split the matrix
    X_data = np.asarray(X_data)
    X_cat_part = X_data[:, :n_categorical]
    X_cont_part = X_data[:, n_categorical:]

    # 1. Categorical: Cast to Long and Shift +1 (Unknown -1 becomes 0)
    x_categ = torch.tensor(X_cat_part, dtype=torch.long) + 1

    # 2. Continuous: Cast to Float
    x_cont = torch.tensor(X_cont_part, dtype=torch.float32)

    # 3. Labels: Cast to Long (np.asarray accepts Series, arrays and lists)
    y = torch.tensor(np.asarray(y_data), dtype=torch.long)

    return x_categ, x_cont, y

# --- Step C: Create Loaders ---
# Train: reshuffled every epoch
train_tensors = prepare_tensors(X_clean_train, y_train)
x_train_cat, x_train_cont, y_train_t = train_tensors
train_dataset = TensorDataset(*train_tensors)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Test: fixed order, so predictions line up row by row with y_test
test_tensors = prepare_tensors(X_clean_test, y_test)
x_test_cat, x_test_cont, y_test_t = test_tensors
test_loader = DataLoader(dataset=TensorDataset(*test_tensors), batch_size=64, shuffle=False)

Categories: (8, 5, 11, 16, 53, 140, 657, 136, 120, 3, 3, 3, 3, 3, 3)
Split: 15 Categorical, 3 Continuous


In [17]:
import torch.nn as nn
import torch.optim as optim
from tab_transformer_pytorch import TabTransformer

# --- Step D: Instantiate Model ---
model = TabTransformer(
    categories = categories_tuple,      # vocabulary size of each categorical column
    num_continuous = num_cont_cols,     # number of continuous input columns
    dim = 32,
    dim_out = dim_out,                  # one logit per class
    depth = 6,
    heads = 8,
    attn_dropout = 0.1,
    ff_dropout = 0.1,
    mlp_hidden_mults = (4, 2),
    mlp_act = nn.ReLU()
)

# Reuse the `device` selected at the top of the notebook (MPS -> CUDA -> CPU).
# Re-deriving it here as "cuda or cpu" would discard an available MPS device
# and silently train on the CPU. The model is moved before the optimizer is
# created so the optimizer is built over the parameters on their final device.
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# --- Step E: Training Loop ---
EPOCHS = 20

for epoch in range(EPOCHS):
    model.train()
    batch_losses = []

    # Every batch is a (categorical codes, continuous features, labels) triple
    for batch in train_loader:
        batch_cat, batch_cont, batch_y = (part.to(device) for part in batch)

        # The model takes the categorical and continuous inputs separately
        outputs = model(batch_cat, batch_cont)
        loss = criterion(outputs, batch_y)

        # Clear stale gradients, backpropagate, update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch
    avg_loss = sum(batch_losses) / len(train_loader)
    print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {avg_loss:.4f}")

KeyboardInterrupt: 

In [None]:
# 1. Set model to evaluation mode (disables dropout)
model.eval()

# 2. Predict batch by batch with gradients disabled.
# The model is called with two inputs (categorical codes, continuous features),
# so evaluation must go through `test_loader`, which yields the split and
# shifted tensors, rather than through the single whole-matrix tensor.
batch_predictions = []
with torch.no_grad():
    for batch_cat, batch_cont, _ in test_loader:
        logits = model(batch_cat.to(device), batch_cont.to(device))

        # 3. Predicted class = index of the highest logit
        batch_predictions.append(logits.argmax(dim=1).cpu())

# 4. Convert to NumPy for Scikit-Learn (already moved to the CPU above)
y_pred = torch.cat(batch_predictions).numpy()

# 5. Calculate and print metrics
# `test_loader` is not shuffled, so y_pred is in the same row order as y_test.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

bacc = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy: {bacc:.4f}")

f1 = f1_score(y_test, y_pred, average="weighted")
print(f"F1 score: {f1:.4f}")

Accuracy: 0.9174
Balanced Accuracy: 0.8825
F1 score: 0.9169
