In [25]:
import torch
import pandas as pd

from torch import nn
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [26]:
# Experiment settings for the pseudo-labeling run; the cells below read
# their hyper-parameters and dataset description from this mapping.
config = dict(
    # --- General ---
    # Run identifier; it is embedded in the checkpoint file names.
    training_session=1,

    # --- Pseudo-labeling model ---
    pre_trained=True,  # NOTE(review): not read by any cell visible here
    learning_rate=3e-4,  # Adam learning rate for both training phases
    # Minimum softmax confidence an unlabeled sample needs to be pseudo-labeled.
    confidence_threshold=0.98,
    epochs=20,  # epochs per phase (labeled-only, then labeled + pseudo)

    # --- Dataset ---
    input_type="tabular",  # selects the pipeline branch: image / text / tabular
    dataset_path="../../datasets/loan.csv",
    # Fraction of the rows kept as labeled data; the rest is treated as unlabeled.
    num_labels=0.3,
    batch_size=64,  # NOTE(review): not read by any cell visible here

    # --- Image input --- (no settings yet)

    # --- Text input --- (no settings yet)

    # --- Tabular input ---
    categorical_columns=[
        "ApplicationDate",
        "EmploymentStatus",
        "EducationLevel",
        "MaritalStatus",
        "HomeOwnershipStatus",
        "LoanPurpose",
    ],
    numeric_columns=[
        "Age",
        "AnnualIncome",
        "CreditScore",
        "Experience",
        "LoanAmount",
        "LoanDuration",
        "NumberOfDependents",
        "MonthlyDebtPayments",
        "CreditCardUtilizationRate",
        "NumberOfOpenCreditLines",
        "NumberOfCreditInquiries",
        "DebtToIncomeRatio",
        "BankruptcyHistory",
        "PreviousLoanDefaults",
        "PaymentHistory",
        "LengthOfCreditHistory",
        "SavingsAccountBalance",
        "CheckingAccountBalance",
        "TotalAssets",
        "TotalLiabilities",
        "MonthlyIncome",
        "UtilityBillsPaymentHistory",
        "JobTenure",
        "NetWorth",
        "BaseInterestRate",
        "InterestRate",
        "MonthlyLoanPayment",
        "TotalDebtToIncomeRatio",
        "RiskScore",
    ],
    target_column="LoanApproved",
)

In [27]:
def train_one_epoch(model, train_loader, device, optimizer):
    """Run one optimisation pass over ``train_loader`` using cross-entropy loss.

    The function does not switch the model's mode; callers are expected to
    call ``model.train()`` beforehand.

    Args:
        model: Callable mapping a feature batch to class logits.
        train_loader: Iterable yielding ``(x, y)`` batches.
        device: Device every batch is moved to before the forward pass.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        float: Sample-weighted mean training loss of the epoch, or ``nan``
        when ``train_loader`` yields no samples. Earlier versions returned
        ``None``; callers that ignore the return value are unaffected.
    """
    loss_function = nn.CrossEntropyLoss()
    total_loss = 0.0
    total_samples = 0

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(x)
        loss = loss_function(outputs, y)
        loss.backward()
        optimizer.step()

        # CrossEntropyLoss averages within the batch, so weight by the batch
        # size to get a correct epoch mean when the last batch is smaller.
        batch_size = y.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

    if total_samples == 0:
        return float("nan")
    return total_loss / total_samples

In [28]:
def evaluate(model, loader, device):
    """Collect the model's class predictions and the true labels for a loader.

    Gradients are disabled for the whole pass. The model's train/eval mode is
    left untouched, so the caller decides which mode is active.

    Args:
        model: Callable mapping a feature batch to class logits.
        loader: Iterable yielding ``(x, y)`` batches.
        device: Device each batch is moved to before the forward pass.

    Returns:
        tuple[list, list]: ``(predictions, labels)`` as flat lists with one
        entry per sample, in iteration order.
    """
    predicted = []
    expected = []
    with torch.no_grad():
        for features, targets in loader:
            features = features.to(device)
            targets = targets.to(device)
            logits = model(features)
            # The index of the largest logit is the predicted class.
            batch_predictions = logits.argmax(dim=1)
            predicted.extend(batch_predictions.cpu().numpy())
            expected.extend(targets.cpu().numpy())
    return predicted, expected

In [29]:
def train_without_pseudo(model, train_loader, val_loader, unlabeled_loader, device, config):
    """Train on labeled data only, then pseudo-label the confident unlabeled samples.

    Phase 1 trains for ``config["epochs"]`` epochs and checkpoints the weights
    whenever validation accuracy improves. Phase 2 restores the best checkpoint
    (if one was written) and predicts on ``unlabeled_loader``, keeping only the
    samples whose top softmax probability is at least
    ``config["confidence_threshold"]``.

    Args:
        model: Classifier to train; it is updated in place.
        train_loader: Iterable of ``(x, y)`` labeled training batches.
        val_loader: Iterable of ``(x, y)`` validation batches.
        unlabeled_loader: Iterable of feature batches. Each item may be a bare
            tensor or a list/tuple whose first element is the feature tensor.
        device: Device used for training and inference.
        config: Mapping providing ``learning_rate``, ``epochs``,
            ``confidence_threshold``, ``input_type`` and ``training_session``.

    Returns:
        tuple[list, list]: ``(pseudo_features, pseudo_labels)`` as lists of CPU
        tensors, one pair per unlabeled batch that had at least one confident
        sample. Both lists are empty when nothing passed the threshold.
    """
    import os

    optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])
    # Single quotes inside the replacement fields keep this valid on Python
    # versions older than 3.12, which reject reused quote characters.
    best_model_path = (
        "../../models/pseudo_label/best_model_"
        f"{config['input_type']}_{config['training_session']}.pt"
    )
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
    val_accuracy = 0
    checkpoint_saved = False

    # Train on labeled data
    for epoch in range(config["epochs"]):
        model.train()
        train_one_epoch(model=model, train_loader=train_loader, device=device, optimizer=optimizer)

        model.eval()
        predictions, labels = evaluate(model, val_loader, device)
        accuracy = accuracy_score(labels, predictions)
        print(f"Epoch {epoch+1}: Val Acc = {accuracy:.4f}")

        if accuracy > val_accuracy:
            val_accuracy = accuracy
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
            print(f"✅ Best model saved to {best_model_path} | Accuracy: {val_accuracy:.4f}")

    # Pseudo-label with the best-validated weights rather than whatever the
    # final epoch happened to produce.
    if checkpoint_saved:
        model.load_state_dict(torch.load(best_model_path, map_location=device))

    # Generate pseudo-labels
    model.eval()
    pseudo_features, pseudo_labels = [], []
    with torch.no_grad():
        for batch in unlabeled_loader:
            # Loaders built on a TensorDataset yield 1-element lists/tuples.
            x = batch[0] if isinstance(batch, (list, tuple)) else batch
            x = x.to(device)
            outputs = model(x)
            probs = torch.softmax(outputs, dim=1)
            confidence, preds = torch.max(probs, dim=1)
            mask = confidence >= config["confidence_threshold"]
            # Skip batches without any confident sample so an empty result
            # really is an empty list for the caller.
            if mask.any():
                pseudo_features.append(x[mask].cpu())
                pseudo_labels.append(preds[mask].cpu())

    return pseudo_features, pseudo_labels

In [30]:
def train_with_pseudo(model, combined_loader, val_loader, device, config):
    """Fine-tune the model on labeled plus pseudo-labeled data.

    Trains for ``config["epochs"]`` epochs with a fresh Adam optimizer, prints
    validation accuracy and a classification report after every epoch, and
    checkpoints the weights whenever validation accuracy improves.

    Args:
        model: Classifier to fine-tune; it is updated in place.
        combined_loader: Iterable of ``(x, y)`` batches mixing labeled and
            pseudo-labeled samples.
        val_loader: Iterable of ``(x, y)`` validation batches.
        device: Device used for training and evaluation.
        config: Mapping providing ``learning_rate``, ``epochs``,
            ``input_type`` and ``training_session``.

    Returns:
        float: Best validation accuracy reached (``0`` if no epoch scored
        above zero). Earlier versions returned ``None``; callers that ignore
        the return value are unaffected.
    """
    import os

    optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])
    # Single quotes inside the replacement fields keep this valid on Python
    # versions older than 3.12, which reject reused quote characters.
    best_model_path = (
        "../../models/pseudo_label/best_model_"
        f"{config['input_type']}_{config['training_session']}_pseudo.pt"
    )
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
    val_accuracy = 0

    # Fine-tune the model with pseudo-labeled data
    for epoch in range(config["epochs"]):
        model.train()
        train_one_epoch(model=model, train_loader=combined_loader, device=device, optimizer=optimizer)

        model.eval()
        predictions, labels = evaluate(model, val_loader, device)
        accuracy = accuracy_score(labels, predictions)
        report = classification_report(labels, predictions, zero_division=0)
        print(f"Epoch {epoch+1}: Val Acc = {accuracy:.4f} (with pseudo-labels)")
        print(report)

        if accuracy > val_accuracy:
            val_accuracy = accuracy
            torch.save(model.state_dict(), best_model_path)
            print(f"✅ Best model saved to {best_model_path} | Accuracy: {val_accuracy:.4f}")

    return val_accuracy

In [31]:
# Import the project-local factory modules under short aliases so the module
# objects themselves can be handed to importlib.reload() below.
import importlib
import token_factory as tf
import dataloader_factory as dl
import model_factory as md

# Re-execute each module so edits made to the .py files are picked up without
# restarting the kernel. Names bound earlier via `from <module> import ...`
# are not refreshed by reload() and have to be imported again afterwards.
importlib.reload(tf)
importlib.reload(dl)
importlib.reload(md)

<module 'model_factory' from '/Users/dundale/Downloads/bpi-ssl/simulation/pseudo_label/model_factory.py'>

In [None]:
# Driver cell: build the data pipeline and model for the configured input
# type, train on labeled data, pseudo-label the unlabeled pool, then fine-tune
# on the combined set.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Imported here, after the reload cell, so these names point at the freshly
# reloaded implementations.
from token_factory import token_factory
from model_factory import model_factory
from dataloader_factory import dataloader_factory, combined_dataloader_factory

# Seed the data splits and torch so a Restart & Run All gives the same run.
# An optional "seed" entry in config overrides the default.
seed = config.get("seed", 42)
torch.manual_seed(seed)

if config["input_type"] == "image":
    # Fail loudly instead of silently doing nothing for an unfinished branch.
    raise NotImplementedError("The image pipeline is not implemented yet.")

elif config["input_type"] == "text":
    raise NotImplementedError("The text pipeline is not implemented yet.")

elif config["input_type"] == "tabular":
    # Split labeled and unlabeled
    df = pd.read_csv(config["dataset_path"])
    unlabeled_size = 1.0 - config.get("num_labels", 0.3)
    df_labeled, df_unlabeled = train_test_split(
        df,
        test_size=unlabeled_size,
        stratify=df[config["target_column"]],
        random_state=seed,
    )

    # Further split labeled for validation
    df_train, df_val = train_test_split(
        df_labeled,
        test_size=0.2,
        stratify=df_labeled[config["target_column"]],
        random_state=seed,
    )

    # Tokenize input
    tokenizer = token_factory(
        "tabular",
        categorical_columns=config["categorical_columns"],
        numeric_columns=config["numeric_columns"],
        target_column=config["target_column"],
    )

    # Fit on the training split only so validation and unlabeled rows do not
    # influence the fitted transformation.
    tokenizer.fit(df_train)

    # Tokenize features
    X_train = tokenizer.transform(df_train)
    y_train = tokenizer.transform_target(df_train)

    X_val = tokenizer.transform(df_val)
    y_val = tokenizer.transform_target(df_val)

    X_unlabeled = tokenizer.transform(df_unlabeled)

    # Generate dataloaders
    # NOTE(review): config["batch_size"] is not forwarded here; confirm whether
    # dataloader_factory accepts it.
    train_loader, val_loader, unlabeled_loader = dataloader_factory(X_train, y_train, X_val, y_val, X_unlabeled)

    # Create model
    input_dim = train_loader.dataset[0][0].shape[0]
    num_classes = df[config["target_column"]].nunique()
    model = model_factory(
        "tabular",
        input_dim=input_dim,
        num_classes=num_classes,
    ).to(device)

    pseudo_features, pseudo_labels = train_without_pseudo(model, train_loader, val_loader, unlabeled_loader, device, config)

    # Count samples, not batches: the lists can hold zero-length tensors when
    # no prediction in a batch cleared the confidence threshold.
    num_pseudo = sum(int(batch_labels.shape[0]) for batch_labels in pseudo_labels)
    if num_pseudo == 0:
        # Skip fine-tuning instead of calling exit(), which would take the
        # notebook kernel down with it.
        print("No pseudo-labels generated.")
    else:
        print(f"Generated {num_pseudo} pseudo-labels.")

        X_combined = torch.cat([
            torch.tensor(X_train, dtype=torch.float32),
            torch.cat(pseudo_features, dim=0)
        ], dim=0)
        y_combined = torch.cat([
            torch.tensor(y_train, dtype=torch.long),
            torch.cat(pseudo_labels, dim=0)
        ], dim=0)
        combined_loader = combined_dataloader_factory(X_combined=X_combined, y_combined=y_combined)

        train_with_pseudo(model, combined_loader, val_loader, device, config)

else:
    raise ValueError(f"Unsupported input type: {config['input_type']}")