In [249]:
import copy
import os

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split

In [250]:
# config = {
#     # General
#     "training_session": 2,

#     # Mean-Teacher Model
#     "pre_trained": True,
#     "learning_rate": 3e-4,
#     "alpha": 0.99,
#     "lambda_u": 1.0,
#     "epochs": 10,

#     # Dataset
#     "input_type": "tabular",
#     "dataset_path": "../../datasets/credit_data.csv",
#     "num_labels": 0.6,
#     "batch_size": 64,

#     # Image input
#     "image_classes": ["form", "invoice", "memo", "letter"],
#     "image_size": 224,

#     # Text input

#     # Tabular input
#     "categorical_columns": [
#         "Gender", "Existing Customer", "State",
#         "City", "Employment Profile", "Occupation"
#     ],
#     "numeric_columns": [
#         "Age", "Income", "Credit Score", "Credit History Length", "Number of Existing Loans",
#         "Loan Amount", "Loan Tenure", "LTV Ratio"
#     ],
#     "target_column": "Profile Score",
#     "is_target_categorical": False, 
# }

# config = {
#     # General
#     "training_session": 5,

#     # Mean-Teacher Model
#     "pre_trained": True,
#     "learning_rate": 3e-4,
#     "alpha": 0.99,
#     "lambda_u": 1.0,
#     "epochs": 20,

#     # Dataset
#     "input_type": "tabular",
#     "dataset_path": "../../datasets/loan.csv",
#     "num_labels": 0.4,
#     "batch_size": 64,

#     # Image input
#     "image_classes": ["form", "invoice", "memo", "letter"],
#     "image_size": 224,

#     # Text input
#     "text_column": "Sentence",
#     "text_target_column": "Sentiment",

#     # Tabular input
#     "categorical_columns": [
#         "ApplicationDate", "EmploymentStatus", "EducationLevel",
#         "MaritalStatus", "HomeOwnershipStatus", "LoanPurpose"
#     ],
#     "numeric_columns": [
#         "Age", "AnnualIncome", "CreditScore", "Experience", "LoanAmount",
#         "LoanDuration", "NumberOfDependents", "MonthlyDebtPayments",
#         "CreditCardUtilizationRate", "NumberOfOpenCreditLines", "NumberOfCreditInquiries",
#         "DebtToIncomeRatio", "BankruptcyHistory", "PreviousLoanDefaults", "PaymentHistory",
#         "LengthOfCreditHistory", "SavingsAccountBalance", "CheckingAccountBalance",
#         "TotalAssets", "TotalLiabilities", "MonthlyIncome", "UtilityBillsPaymentHistory",
#         "JobTenure", "NetWorth", "BaseInterestRate", "InterestRate",
#         "MonthlyLoanPayment", "TotalDebtToIncomeRatio", "RiskScore"
#     ],
#     "target_column": "LoanApproved",
#     "is_target_categorical": True, 
# }

# Active experiment configuration. The training/evaluation functions and the
# driver cell read this module-level dict directly, so it must be defined
# before any of them run.
config = {
    # General
    # Session number; becomes part of the best-model checkpoint file name.
    "training_session": 1,

    # Mean-Teacher Model
    # Forwarded to token_factory / model_factory; for text inputs it also
    # selects the dict-of-tensors batch handling in the train/eval loops.
    "pre_trained": False,
    # Learning rate of the Adam optimizer that updates the student.
    "learning_rate": 3e-4,
    # EMA decay used when blending student weights into the teacher.
    "alpha": 0.99,
    # Weight of the unsupervised (consistency) loss in the total loss.
    "lambda_u": 1.0,
    # Number of training epochs.
    "epochs": 20,

    # Dataset
    # One of "image", "text" or "tabular"; selects the branch of the driver cell.
    "input_type": "text",
    "dataset_path": "../../datasets/sentiments.csv",
    # Fraction of the dataset kept as labeled data (the driver uses
    # 1 - num_labels as the unlabeled split size for text/tabular inputs).
    "num_labels": 0.4,
    "batch_size": 64,

    # Image input
    # Only read when input_type == "image"; the number of classes is len(image_classes).
    "image_classes": [],
    # Images are resized to (image_size, image_size).
    "image_size": 224,

    # Text input
    # Column names of the raw text and of its label in the CSV.
    "text_column": "Sentence",
    "text_target_column": "Sentiment",

    # Tabular input
    # Only read when input_type == "tabular".
    "categorical_columns": [],
    "numeric_columns": [],
    "target_column": "",
    # For tabular inputs, False switches training and evaluation to regression (MSE / MAE).
    "is_target_categorical": True, 
}

In [251]:
def update_ema(student_model, teacher_model, alpha=0.99):
    """Blend the student's weights into the teacher as an exponential moving average.

    Parameters are paired positionally (``zip`` over both ``parameters()``
    iterators) and each teacher parameter becomes
    ``alpha * teacher + (1 - alpha) * student``.

    The update is done in place under ``torch.no_grad()`` so the teacher's
    parameter tensors keep their storage instead of being rebound to a newly
    allocated tensor on every step. Only parameters are blended; module
    buffers are not touched.

    Args:
        student_model: Module whose current parameters are blended in.
        teacher_model: Module whose parameters are updated in place.
        alpha: EMA decay; values closer to 1 make the teacher change more slowly.
    """
    with torch.no_grad():
        for student_param, teacher_param in zip(student_model.parameters(), teacher_model.parameters()):
            # teacher <- alpha * teacher + (1 - alpha) * student
            teacher_param.mul_(alpha).add_(student_param, alpha=1 - alpha)

In [252]:
def train_one_epoch(student_model, teacher_model, lb_loader, ulb_loader, optimizer, device, epoch, alpha):
    """Run one Mean Teacher training epoch.

    Labeled and unlabeled batches are consumed in lock-step with ``zip``, so
    the epoch ends as soon as the shorter of the two loaders is exhausted.
    For each pair of batches the student is trained on

        loss = supervised_loss + config["lambda_u"] * consistency_loss

    where the consistency loss is the MSE between the student's output on the
    second unlabeled view and the teacher's output on the first one (softmax
    probabilities for classification, raw outputs for regression). After every
    optimizer step the teacher is updated with ``update_ema``.

    Reads the module-level ``config`` for ``input_type``, ``pre_trained``,
    ``is_target_categorical`` and ``lambda_u``.

    Args:
        student_model: Model optimized by ``optimizer``.
        teacher_model: EMA copy of the student; only used for forward passes here.
        lb_loader: Iterable yielding ``(x, y)`` labeled batches.
        ulb_loader: Iterable yielding ``(x_w, x_s)`` pairs of unlabeled views.
        optimizer: Optimizer over the student's parameters.
        device: Device the batches are moved to.
        epoch: Epoch number, used only for logging.
        alpha: EMA decay forwarded to ``update_ema``.
    """
    student_model.train()
    # The teacher is put in train mode as well, so its train-time layers stay
    # active while it produces the consistency targets.
    teacher_model.train()

    is_regression = config["input_type"] == "tabular" and not config["is_target_categorical"]
    # Pre-trained text batches are mappings of tensors and have to be moved to
    # the device entry by entry.
    # TODO (carried over from the original code): revisit this dispatch.
    has_dict_inputs = config["input_type"] == "text" and config["pre_trained"]

    total_loss = 0
    for batch_idx, ((x_lb, y_lb), (x_ulb_w, x_ulb_s)) in enumerate(zip(lb_loader, ulb_loader)):
        print(f"🟡 Batch {batch_idx}")

        if has_dict_inputs:
            x_lb = {k: v.to(device) for k, v in x_lb.items()}
            x_ulb_w = {k: v.to(device) for k, v in x_ulb_w.items()}
            x_ulb_s = {k: v.to(device) for k, v in x_ulb_s.items()}
        else:
            x_lb = x_lb.to(device)
            x_ulb_w = x_ulb_w.to(device)
            x_ulb_s = x_ulb_s.to(device)

        y_lb = y_lb.to(device)
        if is_regression:
            # MSE needs float targets with a trailing dimension added to match
            # the model output.
            y_lb = y_lb.float().unsqueeze(1)

        # Supervised loss
        logits_lb = student_model(x_lb)
        loss_sup = F.mse_loss(logits_lb, y_lb) if is_regression else F.cross_entropy(logits_lb, y_lb)

        # Unsupervised loss (consistency); the teacher's targets carry no gradient.
        if is_regression:
            with torch.no_grad():
                pseudo_labels = teacher_model(x_ulb_w)
            logits_ulb_s = student_model(x_ulb_s)
            loss_unsup = F.mse_loss(logits_ulb_s, pseudo_labels)
        else:
            with torch.no_grad():
                logits_ulb_w = teacher_model(x_ulb_w)
                pseudo_labels = torch.softmax(logits_ulb_w, dim=1)
            logits_ulb_s = student_model(x_ulb_s)
            loss_unsup = F.mse_loss(torch.softmax(logits_ulb_s, dim=1), pseudo_labels)

        # Total loss
        loss = loss_sup + config["lambda_u"] * loss_unsup
        total_loss += loss.item()

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # EMA update
        update_ema(student_model, teacher_model, alpha)

    # The reported value is the sum of the per-batch losses, not their mean.
    print(f'Epoch {epoch} | Total Loss: {total_loss:.4f}')

In [253]:
def evaluate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` and print / return the validation metric.

    Reads the module-level ``config`` to decide between regression (tabular
    input with a non-categorical target) and classification, and to decide
    whether input batches are mappings of tensors (pre-trained text).

    Args:
        model: Model to evaluate; it is switched to eval mode.
        val_loader: Iterable yielding ``(x, y)`` batches.
        device: Device the batches are moved to.

    Returns:
        ``(mae, total_loss)`` for regression or ``(accuracy, total_loss)`` for
        classification, where ``total_loss`` is the sum of the per-batch losses.
    """
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0

    is_regression = config["input_type"] == "tabular" and not config["is_target_categorical"]
    has_dict_inputs = config["input_type"] == "text" and config["pre_trained"]

    with torch.no_grad():
        for x, y in val_loader:
            if has_dict_inputs:
                x = {k: v.to(device) for k, v in x.items()}
            else:
                x = x.to(device)
            y = y.to(device)
            logits = model(x)

            if is_regression:
                # Flatten both sides explicitly. A bare squeeze() turns a
                # batch of size 1 into a 0-d tensor (which cannot be passed to
                # list.extend below), and a column-shaped target would
                # otherwise broadcast against the flattened predictions.
                preds = logits.reshape(-1)
                targets = y.float().reshape(-1)
                loss = F.mse_loss(preds, targets)
            else:
                targets = y
                loss = F.cross_entropy(logits, y.long())
                preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())
            total_loss += loss.item()

    if is_regression:
        mae = np.mean(np.abs(np.array(all_preds) - np.array(all_labels)))
        print(f"Validation MAE: {mae:.4f} | Loss: {total_loss:.4f}")
        return mae, total_loss
    else:
        acc = np.mean(np.array(all_preds) == np.array(all_labels))
        print(f"Validation Accuracy: {acc:.4f} | Loss: {total_loss:.4f}")
        return acc, total_loss

In [254]:
def train_mean_teacher(student_model, lb_loader, ulb_loader, val_loader, device, epochs, alpha):
    """Train ``student_model`` with the Mean Teacher scheme and checkpoint the best student.

    The teacher starts as a deep copy of the student with gradients disabled
    and is only changed through the EMA update performed inside
    ``train_one_epoch``. After every epoch the student is evaluated on
    ``val_loader``; whenever the validation metric improves, the student's
    ``state_dict`` is saved to
    ``../../models/mean_teacher/best_model_<input_type>_<training_session>.pt``.

    "Improves" means a higher accuracy for classification and a lower MAE for
    regression (``evaluate`` returns MAE in that case).

    Reads the module-level ``config`` for ``learning_rate``, ``input_type``,
    ``is_target_categorical`` and ``training_session``.

    Args:
        student_model: Model to train.
        lb_loader: Labeled training loader.
        ulb_loader: Unlabeled training loader.
        val_loader: Validation loader.
        device: Device used for training and evaluation.
        epochs: Number of epochs to run.
        alpha: EMA decay for the teacher update.

    Returns:
        ``(student_model, teacher_model)`` in their state after the last epoch
        (not necessarily the checkpointed best state).
    """
    teacher_model = copy.deepcopy(student_model)
    for param in teacher_model.parameters():
        param.requires_grad = False

    optimizer = optim.Adam(student_model.parameters(), lr=config["learning_rate"])

    # evaluate() returns MAE for regression (lower is better) and accuracy for
    # classification (higher is better), so the comparison direction and the
    # initial "best" value depend on the task.
    is_regression = config["input_type"] == "tabular" and not config["is_target_categorical"]
    metric_name = "MAE" if is_regression else "Accuracy"
    best_val_metric = float("inf") if is_regression else 0

    # Pull the values out first: reusing double quotes inside a double-quoted
    # f-string is a syntax error before Python 3.12.
    input_type = config["input_type"]
    training_session = config["training_session"]
    best_model_path = f"../../models/mean_teacher/best_model_{input_type}_{training_session}.pt"
    # Create the checkpoint directory up front so a missing folder does not
    # abort the run at the first save.
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

    for epoch in range(1, epochs + 1):
        print(f"🟡 Start of epoch {epoch}!")

        train_one_epoch(student_model, teacher_model, lb_loader, ulb_loader, optimizer, device, epoch, alpha)

        val_metric, _ = evaluate(student_model, val_loader, device)
        improved = val_metric < best_val_metric if is_regression else val_metric > best_val_metric
        if improved:
            best_val_metric = val_metric
            torch.save(student_model.state_dict(), best_model_path)
            print(f"✅ Best model saved to {best_model_path} | {metric_name}: {val_metric:.4f}")

    return student_model, teacher_model

In [255]:
# Development helper: import the project-local factory modules and reload them
# so that edits made to their source files are picked up without restarting
# the kernel.
import importlib
import token_factory as tf
import dataloader_factory as dl
import model_factory as md

# The last reload is the cell's final expression, so its return value (the
# reloaded module) is what the notebook displays as the cell output.
importlib.reload(tf)
importlib.reload(dl)
importlib.reload(md)

<module 'model_factory' from '/Users/dundale/Downloads/bpi-ssl/simulation/mean_teacher/model_factory.py'>

In [None]:
# Driver cell: build the tokenizer / transform, the dataloaders and the model
# for the configured input type, then train with Mean Teacher.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from token_factory import token_factory
from model_factory import model_factory
from dataloader_factory import dataloader_factory

# One seed for every split so the labeled / unlabeled / validation partitions
# are reproducible across runs.
SPLIT_SEED = 42

input_type = config["input_type"]

if input_type == "image":
    # Obtain the base transform for image inputs
    base_transform = token_factory(
        "image",
        image_size=(config["image_size"], config["image_size"])
    )

    # Create model
    model = model_factory(
        "image",
        num_classes=len(config["image_classes"]),
        pretrained=config["pre_trained"]
    ).to(device)

    # Create dataloaders
    lb_loader, ulb_loader, val_loader = dataloader_factory(
        "image",
        num_labels=config["num_labels"],
        image_classes=config["image_classes"],
        batch_size=config["batch_size"],
        dataset_path=config["dataset_path"],
        base_transform=base_transform
    )

elif input_type == "text":
    text_column = config["text_column"]
    text_target_column = config["text_target_column"]

    # Split dataset into labeled, unlabeled, and validation data
    df = pd.read_csv(config["dataset_path"])
    unlabeled_size = 1.0 - config.get("num_labels", 0.3)
    df_labeled, df_unlabeled = train_test_split(
        df, test_size=unlabeled_size, stratify=df[text_target_column], random_state=SPLIT_SEED
    )
    df_train, df_val = train_test_split(
        df_labeled, test_size=0.2, stratify=df_labeled[text_target_column], random_state=SPLIT_SEED
    )

    # Instantiate tokenizer
    tokenizer = token_factory(
        "text",
        text_column=text_column,
        target_column=text_target_column,
        pretrained=config["pre_trained"],
    )

    # Fit only on labeled training data
    tokenizer.fit(df_train)

    X_train = tokenizer.transform(df_train)
    y_train = tokenizer.transform_target(df_train)

    X_val = tokenizer.transform(df_val)
    y_val = tokenizer.transform_target(df_val)

    # The unlabeled inputs are the raw texts. The original code read the label
    # column here, which fed label strings to the model as "unlabeled text".
    X_unlabeled = df_unlabeled[text_column].tolist()

    # NOTE(review): the loaders are unpacked as (labeled, validation,
    # unlabeled) here, unlike the image and tabular branches which unpack
    # (labeled, unlabeled, validation). Kept as in the original; confirm
    # against dataloader_factory's return order for "text".
    lb_loader, val_loader, ulb_loader = dataloader_factory(
        "text",
        X_train=X_train, y_train=y_train,
        X_val=X_val, y_val=y_val,
        X_unlabeled=X_unlabeled,
        tokenizer=tokenizer,
        batch_size=config["batch_size"]
    )

    # Determine number of classes
    num_classes = len(np.unique(y_train.numpy()))
    # The feature dimension is only passed on when no pre-trained model is used.
    input_dim = X_train.shape[1] if not config["pre_trained"] else None

    # Instantiate model
    model = model_factory(
        "text",
        num_classes=num_classes,
        pretrained=config["pre_trained"],
        tfidf_dim=input_dim
    ).to(device)

elif input_type == "tabular":
    is_regression = not config["is_target_categorical"]
    target_column = config["target_column"]
    # Regression targets are floats (MSE); class targets are integer indices.
    target_dtype = torch.float32 if is_regression else torch.long

    # Split the dataset into labeled, unlabeled, and validation data
    df = pd.read_csv(config["dataset_path"])
    unlabeled_size = 1.0 - config.get("num_labels", 0.3)
    df_labeled, df_unlabeled = train_test_split(
        df, test_size=unlabeled_size, stratify=df[target_column], random_state=SPLIT_SEED
    )
    df_train, df_val = train_test_split(
        df_labeled, test_size=0.2, stratify=df_labeled[target_column], random_state=SPLIT_SEED
    )

    # Obtain the tokenizer for tabular inputs and tokenize input
    tokenizer = token_factory(
        "tabular",
        categorical_columns=config["categorical_columns"],
        numeric_columns=config["numeric_columns"],
        target_column=target_column,
        is_target_categorical=config["is_target_categorical"]
    )

    # Fit tokenizer on training data
    tokenizer.fit(df_train)

    # Tokenize features
    X_train = torch.tensor(tokenizer.transform(df_train), dtype=torch.float32)
    X_val = torch.tensor(tokenizer.transform(df_val), dtype=torch.float32)
    X_unlabeled = torch.tensor(tokenizer.transform(df_unlabeled), dtype=torch.float32)

    # np.asarray normalises whatever transform_target returns (the original
    # code needed .to_numpy() on one path only) so both targets are converted
    # the same way and the conversion does not depend on a pandas index.
    y_train = torch.tensor(np.asarray(tokenizer.transform_target(df_train)), dtype=target_dtype)
    y_val = torch.tensor(np.asarray(tokenizer.transform_target(df_val)), dtype=target_dtype)

    # Generate dataloaders
    lb_loader, ulb_loader, val_loader = dataloader_factory(
        "tabular", X_train=X_train, y_train=y_train,
        X_val=X_val, y_val=y_val, X_unlabeled=X_unlabeled,
        batch_size=config["batch_size"]
    )

    # Create model. The input width is taken from the encoded feature matrix
    # that is actually fed to the model, not from the raw CSV column count,
    # so it stays correct if the tokenizer drops or expands columns.
    input_dim = X_train.shape[1]
    num_classes = df[target_column].nunique()
    model = model_factory(
        "tabular",
        input_dim=input_dim,
        num_classes=num_classes,
        regression=is_regression
    ).to(device)

else:
    # No nested double quotes inside the f-string: that is a syntax error
    # before Python 3.12 and would prevent the whole cell from compiling.
    raise ValueError(f"Unsupported input type: {input_type}")

# Train Mean Teacher (shared by all input types)
trained_student, trained_teacher = train_mean_teacher(
    model, lb_loader, ulb_loader, val_loader, device, config["epochs"], alpha=config["alpha"]
)