### Preliminaries

In [65]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from torch import nn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
import importlib
import token_factory as tf
import dataloader_factory as dl
import model_factory as md

# Re-import the local helper modules so that edits made to them on disk are
# picked up without restarting the interpreter/kernel.
importlib.reload(tf)
importlib.reload(dl)
importlib.reload(md)

### Configuration

In [66]:
# Central settings dictionary read by the training functions and the main
# script below.
config = {
    # General
    # "training_session" is embedded in the checkpoint file names.
    "training_session": 1,
    # "seed" is passed as random_state to the train/validation split.
    "seed": 27,

    # Pseudo-Labeling Model
    # Learning rate handed to the Adam optimizer in both training phases.
    "learning_rate": 3e-4,
    # Minimum softmax confidence an unlabeled sample needs to be pseudo-labeled.
    "confidence_threshold": 0.98,
    # Number of epochs run in each training phase.
    "epochs": 20,

    # Dataset
    # CSV files loaded with pandas.read_csv.
    "labeled_dataset_path": "../../datasets/tabular_classification/labeled.csv",
    "unlabeled_dataset_path": "../../datasets/tabular_classification/unlabeled.csv",
    # Fraction of the labeled data held out for validation (used as test_size).
    "validation_set_percentage": 0.2,
    # Batch size forwarded to the dataloader factories.
    "batch_size": 64,

    # Tabular input
    # Column lists forwarded to token_factory.
    # NOTE(review): "LoanApproved" is listed here and is also the target column;
    # confirm the tokenizer excludes the target from the feature matrix.
    "categorical_columns": [
        "EmploymentStatus",
        "EducationLevel",
        "MaritalStatus",
        "HomeOwnershipStatus",
        "BankruptcyHistory",
        "LoanPurpose",
        "PreviousLoanDefaults",
        "PaymentHistory",
        "LoanApproved"
    ],
    "numeric_columns": [
        "Age",
        "AnnualIncome",
        "CreditScore",
        "Experience",
        "LoanAmount",
        "LoanDuration",
        "NumberOfDependents",
        "MonthlyDebtPayments",
        "CreditCardUtilizationRate",
        "NumberOfOpenCreditLines",
        "NumberOfCreditInquiries",
        "DebtToIncomeRatio",
        "LengthOfCreditHistory",
        "SavingsAccountBalance",
        "CheckingAccountBalance",
        "TotalAssets",
        "TotalLiabilities",
        "MonthlyIncome",
        "UtilityBillsPaymentHistory",
        "JobTenure",
        "NetWorth",
        "BaseInterestRate",
        "InterestRate",
        "MonthlyLoanPayment",
        "TotalDebtToIncomeRatio",
        "RiskScore"
    ],
    # Label column: used to stratify the split and to count the classes.
    "tabular_target_column": "LoanApproved",
}

### Training Logic

In [67]:
def train_one_epoch(model, loader, device, optimizer):
    """Run one optimisation pass over ``loader`` and report the summed loss.

    Args:
        model: Network to train; it is switched to training mode.
        loader: Iterable yielding ``(features, labels)`` batches.
        device: Device each batch is moved to before the forward pass.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        The sum of the per-batch cross-entropy losses as a float
        (``0.0`` when ``loader`` yields no batches).
    """
    model.train()

    loss_function = nn.CrossEntropyLoss()
    total_loss = 0.0
    for x, y in loader:
        # Cast the labels to int64 class indices, exactly as ``evaluate`` does,
        # so labels stored as float or int32 are handled the same way in
        # training and validation instead of failing only here.
        x, y = x.to(device), y.to(device).long()

        optimizer.zero_grad()
        loss = loss_function(model(x), y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Total Loss: {total_loss:.4f}")
    return total_loss

In [68]:
def evaluate(model, loader, device):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    Prints the overall accuracy and the summed cross-entropy loss.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loader: Iterable yielding ``(features, labels)`` batches.
        device: Device each batch is moved to before the forward pass.

    Returns:
        A ``(predictions, labels)`` pair of lists with one entry per sample.
    """
    model.eval()

    predicted, expected = [], []
    running_loss = 0.00

    with torch.no_grad():
        for features, targets in loader:
            features = features.to(device)
            targets = targets.to(device)

            scores = model(features)
            running_loss += F.cross_entropy(scores, targets.long()).item()

            # The predicted class is the index of the largest logit per row.
            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    matches = np.asarray(predicted) == np.asarray(expected)
    accuracy = np.mean(matches)
    print(f"Validation Accuracy: {accuracy:.4f} | Loss: {running_loss:.4f}")

    return predicted, expected

In [69]:
def train_without_pseudo_labels(model, labeled_loader, validation_loader, unlabeled_loader, device):
    """Train on labeled data, then pseudo-label the confident unlabeled samples.

    The model is trained for ``config["epochs"]`` epochs; whenever the
    validation accuracy improves, its weights are saved to the session's
    checkpoint file. Afterwards every unlabeled batch is scored and the samples
    whose top softmax probability reaches ``config["confidence_threshold"]``
    are collected together with their predicted class.

    Args:
        model: Network to train and to use for pseudo-labeling.
        labeled_loader: Iterable of ``(features, labels)`` training batches.
        validation_loader: Iterable of ``(features, labels)`` validation batches.
        unlabeled_loader: Iterable of feature batches; a batch may be a bare
            tensor or a list/tuple whose first element is the feature tensor.
        device: Device the batches are moved to.

    Returns:
        A ``(pseudo_features, pseudo_labels)`` pair of lists holding one CPU
        tensor per unlabeled batch that had at least one confident sample.
        Both lists are empty when no sample reaches the threshold.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])

    # Single quotes inside the replacement field keep this f-string valid on
    # interpreters older than Python 3.12.
    best_model_path = f"../../models/pseudo_label/best_model_tabular_{config['training_session']}.pt"
    best_accuracy = 0.00

    # Train on labeled data
    for epoch in range(1, config["epochs"] + 1):
        print(f"--- Start of Epoch {epoch}! ---")

        train_one_epoch(model, labeled_loader, device, optimizer)

        predictions, labels = evaluate(model, validation_loader, device)
        validation_accuracy = accuracy_score(labels, predictions)

        if validation_accuracy > best_accuracy:
            best_accuracy = validation_accuracy
            torch.save(model.state_dict(), best_model_path)

            print(f"✅ Best model saved to {best_model_path} | Accuracy: {best_accuracy:.4f}")

    print("--- End of Training ---")

    # Generate pseudo-labels
    # NOTE(review): the weights used here are those of the final epoch, not the
    # best checkpoint saved above -- confirm this is intended.
    model.eval()
    pseudo_features, pseudo_labels = [], []
    with torch.no_grad():
        for batch in unlabeled_loader:
            # A loader built on a tuple-style dataset yields a one-element
            # list/tuple per batch; unwrap it so both loader styles work.
            x = batch[0] if isinstance(batch, (list, tuple)) else batch
            x = x.to(device)

            probabilities = torch.softmax(model(x), dim=1)
            confidence, predictions = torch.max(probabilities, dim=1)
            mask = confidence >= config["confidence_threshold"]

            # Skip batches without a confident sample so the returned lists are
            # empty (and a length check on them is meaningful) when nothing
            # qualified.
            if not mask.any():
                continue

            pseudo_features.append(x[mask].cpu())
            pseudo_labels.append(predictions[mask].cpu())

    return pseudo_features, pseudo_labels

In [70]:
def train_with_pseudo(model, labeled_loader, validation_loader, device):
    """Continue training ``model`` on the labeled + pseudo-labeled data.

    Runs ``config["epochs"]`` epochs with a fresh Adam optimizer and saves the
    weights to the session's ``_pseudo`` checkpoint file every time the
    validation accuracy improves.

    Args:
        model: Network to fine-tune (updated in place).
        labeled_loader: Iterable of ``(features, labels)`` training batches;
            the caller passes the combined labeled and pseudo-labeled data.
        validation_loader: Iterable of ``(features, labels)`` validation batches.
        device: Device the batches are moved to.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])

    # Single quotes inside the replacement field keep this f-string valid on
    # interpreters older than Python 3.12.
    best_model_path = f"../../models/pseudo_label/best_model_tabular_{config['training_session']}_pseudo.pt"
    best_accuracy = 0.00

    # Fine-tune the model with pseudo-labeled data
    for epoch in range(1, config["epochs"] + 1):
        print(f"--- Start of Epoch {epoch}! ---")

        train_one_epoch(model, labeled_loader, device, optimizer)

        predictions, labels = evaluate(model, validation_loader, device)
        validation_accuracy = accuracy_score(labels, predictions)

        if validation_accuracy > best_accuracy:
            best_accuracy = validation_accuracy
            torch.save(model.state_dict(), best_model_path)
            print(f"✅ Best model saved to {best_model_path} | Accuracy: {best_accuracy:.4f}")

    print("--- End of Training ---")

### Training Main

In [None]:
# End-to-end pseudo-labeling run: load the data, train on the labeled split,
# pseudo-label the confident unlabeled rows, then continue training on the
# combined set.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from token_factory import token_factory
from model_factory import model_factory
from dataloader_factory import dataloader_factory, combined_dataloader_factory

# Apply the configured seed to PyTorch as well, so weight initialisation and
# any torch-driven shuffling are reproducible (the split below is seeded
# separately through random_state).
torch.manual_seed(config["seed"])

# Load labeled and unlabeled dataset
labeled_dataframe = pd.read_csv(config["labeled_dataset_path"])
unlabeled_dataframe = pd.read_csv(config["unlabeled_dataset_path"])

# Split labeled dataset into train and validation sets
train_dataframe, validation_dataframe = train_test_split(
    labeled_dataframe,
    test_size=config["validation_set_percentage"],
    stratify=labeled_dataframe[config["tabular_target_column"]],
    random_state=config["seed"]
)

# Obtain the tokenizer for tabular inputs
tokenizer = token_factory(
    categorical_columns=config["categorical_columns"],
    numeric_columns=config["numeric_columns"],
    target_column=config["tabular_target_column"],
)

# Fit only on the training dataframe so no validation statistics leak in
tokenizer.fit(train_dataframe)

# Tokenize features
X_train = tokenizer.transform(train_dataframe)
y_train = tokenizer.transform_target(train_dataframe)

X_validation = tokenizer.transform(validation_dataframe)
y_validation = tokenizer.transform_target(validation_dataframe)

X_unlabeled = tokenizer.transform(unlabeled_dataframe)

# Create dataloaders
labeled_loader, unlabeled_loader, validation_loader = dataloader_factory(
    X_train=X_train, y_train=y_train,
    X_validation=X_validation, y_validation=y_validation,
    X_unlabeled=X_unlabeled, batch_size=config["batch_size"]
)

# Create MLP model
# The input width is taken from the tokenized feature matrix, i.e. from what
# the model is actually fed, rather than from the raw CSV column count; the
# two differ as soon as the tokenizer adds, drops or expands columns.
input_dim = np.asarray(X_train).shape[1]
num_classes = labeled_dataframe[config["tabular_target_column"]].nunique()
model = model_factory(
    input_dim=input_dim,
    num_classes=num_classes,
).to(device)

# Train on labeled data and generate pseudolabels
X_pseudo, y_pseudo = train_without_pseudo_labels(model, labeled_loader, validation_loader, unlabeled_loader, device)

# Count pseudo-labeled samples rather than batches: the list may contain
# empty tensors for batches in which no sample was confident enough.
pseudo_sample_count = sum(int(batch.shape[0]) for batch in X_pseudo)

if pseudo_sample_count == 0:
    # No pseudolabels were generated because the model is not confident.
    # The second training phase is skipped instead of calling exit(), which
    # would shut the kernel down when this runs inside a notebook.
    print("No pseudo-labels generated")
else:
    # Combine generated pseudolabels with original labeled dataset
    X_combined = torch.cat([
        torch.tensor(X_train, dtype=torch.float32),
        torch.cat(X_pseudo, dim=0)
    ], dim=0)
    y_combined = torch.cat([
        torch.tensor(y_train, dtype=torch.long),
        torch.cat(y_pseudo, dim=0)
    ], dim=0)

    # Create dataloader for combined dataset
    labeled_loader = combined_dataloader_factory(
        X_combined=X_combined, y_combined=y_combined, batch_size=config["batch_size"]
    )

    # Train on labeled data with pseudolabels
    train_with_pseudo(model, labeled_loader, validation_loader, device)