# Strategy for KNN

I will need to randomly undersample and SMOTE-oversample the training set (the training split only, never the test split) to balance the classes.

Use correlation between features of the dataset

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline # Replaces sklearn Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# load the dataset and fix values

In [2]:
# Select the compute device: prefer Apple MPS, then CUDA, otherwise fall back to the CPU
if torch.backends.mps.is_available():
    device_name, device_status = "mps", "MPS device is available."
elif torch.cuda.is_available():
    device_name, device_status = "cuda", "CUDA device is available."
else:
    device_name, device_status = "cpu", "No GPU acceleration available."

print(device_status)
device = torch.device(device_name)

# Fix the seed to have deterministic behaviour
def fix_random(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) random generators.

    Also switches cuDNN to deterministic kernels and turns off its
    auto-tuner so repeated runs pick the same algorithms.

    Args:
        seed: Value handed to every random number generator.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(seed)

    torch.backends.cudnn.deterministic = True  # slower
    torch.backends.cudnn.benchmark = False

SEED = 1337
fix_random(SEED)

# Lift pandas' display limits so frames are shown with every column and row
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

DATASET_PATH = "dataset_train/dataset.csv"
dataset = pd.read_csv(DATASET_PATH, delimiter=",")

print(f"Shape of the dataset: {dataset.shape}")

# Rows that are exact copies of an earlier row
duplicates = dataset.loc[dataset.duplicated()]
print(f"Number of duplicates in the dataset: {len(duplicates)}")

MPS device is available.
Shape of the dataset: (148301, 145)
Number of duplicates in the dataset: 0


## split the dataset

In [3]:
# Ordinal target encoding: "G" -> 0, "F" -> 1, ..., "A" -> 6
grade_to_rank = {grade: rank for rank, grade in enumerate("GFEDCBA")}

X = dataset.drop(columns=["grade"])
y = dataset["grade"].map(grade_to_rank)

# 80/20 hold-out split, stratified on the encoded grade
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [4]:
class NumericExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that pulls the first run of digits out of every cell.

    Each column is cast to ``str``, the first group of consecutive digits is
    extracted and converted to ``float``. Cells without any digit become NaN.
    """

    _FIRST_DIGITS = r"(\d+)"

    def fit(self, X, y=None):
        # Nothing to learn; present for scikit-learn API compatibility
        return self

    def transform(self, X):
        extracted = X.copy()
        for col in extracted.columns:
            as_text = extracted[col].astype(str)
            extracted[col] = as_text.str.extract(self._FIRST_DIGITS, expand=False).astype(float)
        return extracted

class CyclicalDateEncoder(BaseEstimator, TransformerMixin):
    """Replace each date column by a year column plus a sine/cosine month encoding.

    Values are parsed with the ``%b-%Y`` format (abbreviated month name, dash,
    four-digit year). Every input column ``c`` is removed and ``c_year``,
    ``c_month_sin`` and ``c_month_cos`` are appended in its place.
    """

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data
        return self

    def transform(self, X):
        encoded = X.copy()
        for col in list(encoded.columns):
            # Unparseable entries and NaNs are coerced to NaT, which yields NaN
            # in the three derived columns (imputed later in the pipeline)
            parsed = pd.to_datetime(encoded[col], format="%b-%Y", errors="coerce")
            month_angle = 2 * np.pi * parsed.dt.month / 12

            encoded = encoded.drop(columns=[col])
            encoded[f"{col}_year"] = parsed.dt.year
            encoded[f"{col}_month_sin"] = np.sin(month_angle)
            encoded[f"{col}_month_cos"] = np.cos(month_angle)
        return encoded
    
class BinaryModeEncoder(BaseEstimator, TransformerMixin):
    """Encode each column as 0 where the value equals the fitted mode, 1 otherwise.

    The per-column mode is learned in ``fit`` and stored in ``modes_``.
    Missing values compare unequal to the mode and are therefore encoded as 1.

    Attributes:
        modes_: dict mapping column name -> most frequent value seen during
            ``fit``. Only exists after ``fit`` has been called, so calling
            ``transform`` on an unfitted encoder raises ``AttributeError``
            instead of silently returning the input unencoded.
    """

    def fit(self, X, y=None):
        # Rebuild the mapping from scratch on every fit so that refitting on a
        # frame with different columns does not keep stale entries around
        self.modes_ = {col: X[col].mode()[0] for col in X.columns}
        return self

    def transform(self, X):
        encoded = X.copy()
        for col, mode in self.modes_.items():
            # 1 if NOT the mode (the less frequent value), 0 if it is the mode
            encoded[col] = (encoded[col] != mode).astype(int)
        return encoded
    
class HighMissingDropper(BaseEstimator, TransformerMixin):
    """Remove columns whose share of missing values exceeds ``threshold`` percent.

    The list of columns to remove is computed in ``fit`` only, so the same
    columns are dropped from any frame passed to ``transform`` afterwards.

    Args:
        threshold: Missing-value percentage (0-100) above which a column is dropped.
    """

    def __init__(self, threshold=20):
        self.threshold = threshold
        self.cols_to_drop_ = []

    def fit(self, X, y=None):
        # Percentage of NaN entries per column in the frame given to fit
        missing_share = X.isna().mean().mul(100)
        too_sparse = missing_share > self.threshold
        self.cols_to_drop_ = missing_share.loc[too_sparse].index.tolist()
        return self

    def transform(self, X):
        return X.copy().drop(columns=self.cols_to_drop_)

In [5]:
# Columns removed outright by the ColumnTransformer
redundant_cols = ["loan_title"]

# Two-valued columns, encoded as mode / not-mode
binary_cols = [
    "loan_payment_plan_flag",
    "listing_initial_status",
    "application_type_label",
    "hardship_flag_indicator",
    "disbursement_method_type",
    "debt_settlement_flag_indicator",
]

# Nominal columns that are one-hot encoded
one_hot_encoding_cols = [
    "borrower_housing_ownership_status",
    "borrower_income_verification_status",
    "loan_status_current_code",
    "loan_purpose_category",
    "borrower_address_state",
]

# Text columns from which a number is extracted
extract_fields = [
    "loan_contract_term_months",
    "borrower_profile_employment_length",
    "borrower_address_zip",
]

# Date columns that get a year + cyclical month encoding
date_fields = [
    "loan_issue_date",
    "credit_history_earliest_line",
    "last_payment_date",
    "last_credit_pull_date",
]

In [6]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Pipeline 1: numerics (extract digits -> impute median)
numeric_pipe = SkPipeline([
    ('extract', NumericExtractor()),
    ('impute', SimpleImputer(strategy='median')),  # handles NaNs created by the extractor
])

# Pipeline 2: categoricals (impute "missing" -> one-hot encode)
# NOTE(review): drop='first' together with handle_unknown='ignore' means an unseen
# category is presumably encoded as all zeros, i.e. like the dropped first
# category -- confirm against the installed scikit-learn version.
categorical_pipe = SkPipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),  # prevent OHE crash on NaN
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
])

# Pipeline 3: dates (cyclical encode -> impute median of year / sin / cos)
date_pipe = SkPipeline([
    ('cyclical', CyclicalDateEncoder()),
    ('impute', SimpleImputer(strategy='median')),  # fills the NaNs left by unparseable dates
])

# Pipeline 4: binary flags (mode / not-mode encode -> impute)
binary_pipe = SkPipeline([
    ('binary_enc', BinaryModeEncoder()),
    ('impute', SimpleImputer(strategy='most_frequent')),  # safety net
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipe', numeric_pipe, extract_fields),
        ('cat_pipe', categorical_pipe, one_hot_encoding_cols),
        ('date_pipe', date_pipe, date_fields),
        ('bin_pipe', binary_pipe, binary_cols),
        ('drop_redundant', 'drop', redundant_cols),
    ],
    # Columns not listed above are median-imputed instead of being passed through untouched
    remainder=SimpleImputer(strategy='median'),
)

pipeline = SkPipeline([
    ('dropper', HighMissingDropper(threshold=20)),
    ('prep', preprocessor),
    ('scaler', StandardScaler()),  # standardise every feature before it reaches the network
])

# Fit on the training split only, then apply the fitted transforms to the test split
X_clean_train = pipeline.fit_transform(X_train, y_train)
X_clean_test = pipeline.transform(X_test)

print(X_clean_train.shape)

# Features are float32. Labels are class indices and must be int64 (torch.long):
# nn.CrossEntropyLoss rejects float class-index targets on the CPU and CUDA backends.
X_train_tensor = torch.tensor(X_clean_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

X_test_tensor = torch.tensor(X_clean_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Explicit batch size: the DataLoader default of 1 would yield one sample per step
EVAL_BATCH_SIZE = 1024

test_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor),
    batch_size=EVAL_BATCH_SIZE,
    shuffle=False,
)

(118640, 173)


In [7]:
# Transformer-encoder classifier over a single embedded feature vector
class TabTransformer(nn.Module):
    """Linear embedding -> Transformer encoder stack -> linear classification head.

    The whole feature vector of a sample is projected to ``dim_embedding`` and
    fed to the encoder as a sequence of length one.

    Args:
        num_features: Width of the input feature vector.
        num_classes: Number of output logits.
        dim_embedding: Model width used by the embedding and the encoder.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, num_features, num_classes, dim_embedding=64, num_heads=4, num_layers=4):
        super().__init__()
        self.embedding = nn.Linear(num_features, dim_embedding)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim_embedding, nhead=num_heads, batch_first=True),
            num_layers=num_layers,
        )
        self.classifier = nn.Linear(dim_embedding, num_classes)

    def forward(self, x):
        # Insert a sequence axis of length one after the batch axis
        tokens = self.embedding(x).unsqueeze(1)
        encoded = self.transformer(tokens)
        # Average over the sequence axis to get one vector per sample
        pooled = encoded.mean(dim=1)
        return self.classifier(pooled)

In [8]:
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
EPOCHS = 20

# nn.CrossEntropyLoss takes class indices as int64 targets; cast here so the loop
# works on every backend regardless of the dtype the label tensor was built with.
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor.long())
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Input width comes from the preprocessed training matrix
num_features = X_clean_train.shape[1]
# One logit per grade: the target was mapped to the 7 integer classes 0..6
num_classes = 7

model = TabTransformer(num_features=num_features, num_classes=num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.to(device)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        # Forward pass. The logits stay (batch, num_classes): squeezing them would
        # collapse a final batch of size 1 to (num_classes,) and break the loss call.
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {avg_loss:.4f}")


Epoch [1/20], Loss: 0.6421
Epoch [2/20], Loss: 0.4630
Epoch [3/20], Loss: 0.3926
Epoch [4/20], Loss: 0.3591
Epoch [5/20], Loss: 0.3392
Epoch [6/20], Loss: 0.3246
Epoch [7/20], Loss: 0.3124
Epoch [8/20], Loss: 0.3046
Epoch [9/20], Loss: 0.2973
Epoch [10/20], Loss: 0.2891
Epoch [11/20], Loss: 0.2829
Epoch [12/20], Loss: 0.2779
Epoch [13/20], Loss: 0.2722
Epoch [14/20], Loss: 0.2667
Epoch [15/20], Loss: 0.2623
Epoch [16/20], Loss: 0.2587
Epoch [17/20], Loss: 0.2533
Epoch [18/20], Loss: 0.2505
Epoch [19/20], Loss: 0.2443
Epoch [20/20], Loss: 0.2416


In [9]:
# Switch the model to inference behaviour
model.eval()

# No gradients are needed for scoring, which saves memory
with torch.no_grad():
    # The test features must live on the same device as the model
    X_test_tensor = X_test_tensor.to(device)
    outputs = model(X_test_tensor)

    # Predicted class = index of the largest of the 7 logits
    y_pred_tensor = outputs.argmax(dim=1)

    # scikit-learn metrics need a CPU NumPy array
    y_pred = y_pred_tensor.cpu().numpy()

# Score the predictions against the held-out labels
acc = accuracy_score(y_test, y_pred)
bacc = balanced_accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")

print(f"Accuracy: {acc:.4f}")
print(f"Balanced Accuracy: {bacc:.4f}")
print(f"F1 score: {f1:.4f}")

Accuracy: 0.9174
Balanced Accuracy: 0.8825
F1 score: 0.9169
