# Setup

## Imports

In [1]:
# Import importlib to reload modules, and sys and os to extend the import path
import importlib
import sys
import os
import torch

# Append the parent directory to the path to import the necessary modules.
# The path is resolved from the current working directory, so the parent of
# wherever the kernel was started is what gets added.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import utilities (as modules, so they can be passed to importlib.reload below)
from utils import setuputil, trainutil, inferutil
from classes.models import SimpleGeluEmbed

# Reload the necessary modules to ensure they are up-to-date
# (picks up edits made to the .py files since the kernel first imported them)
importlib.reload(setuputil)
importlib.reload(trainutil)
importlib.reload(inferutil)
importlib.reload(SimpleGeluEmbed)

# Import the required utils.
# These from-imports run after the reloads so the names bind to the reloaded modules.
# NOTE(review): a later cell in this notebook defines its own setup_simple_config,
# which replaces the imported name once that cell is executed.
from utils.setuputil import setup_simple_config, display_simple_config
from utils.trainutil import train_model
from utils.inferutil import infer_one, infer_full

# Import the SimpleGeluEmbedAdd class
from classes.models.SimpleGeluEmbed import SimpleGeluEmbedAdd

In [30]:
# Base setup config: every experiment variant below is derived from this dict,
# so a shared setting only has to be changed in one place.
setup_config = {
    # Environment and Model Info
    "env": "local",
    "approach": "simple",
    "model_name": "SimpleGeluEmbedAdd",

    # System Configuration
    "device": "cuda:0",
    "threads": 12,
    "seed": 42,

    # Data Configuration
    "data_dir": "../../data/farzan",
    "data_ds": 2000,

    # Model Parameters
    "rows": 100,
    "cols": 100,
    "tokens": 32,

    # Vocabulary Parameters
    "vocab_size": 150000,
    "vocab_space": True,
    "vocab_case": "both",

    # Training Parameters
    "batch": 40,
    "lr": 5e-3,
    "mu": 0.25,
    "epochs": 20,
    "patience": 2,
    "save_int": 10,
    "save_dir": '../models/'
}

# Variant 2: identical to the base config except for the seed.
# Unpacking into a new dict literal makes an independent copy and keeps the
# original key order; only the listed keys are overridden.
setup_config2 = {**setup_config, "seed": 0}

# Variant 3: seed 0 and a 1000-file dataset instead of 2000.
setup_config3 = {**setup_config, "seed": 0, "data_ds": 1000}

# Derive the part to use from the simple_setup func

In [31]:
import os
import torch
from utils.selfutil import set_seed, get_vocab, create_embeddings, get_fileList
from classes.SpreadsheetDataLoader import SpreadsheetDataLoader

def _resolve_device(requested):
    """
    Translate a device string into a ``torch.device`` that exists on this machine.

    Args:
        requested (str): Device string such as "cuda", "cuda:0", "mps" or "cpu".

    Returns:
        torch.device: The requested CUDA device when CUDA is available and the
        index is in range, the MPS device when requested and available,
        otherwise the CPU device.
    """
    if requested.startswith("cuda") and torch.cuda.is_available():
        # A bare "cuda" has no ":<index>" suffix; check it against index 0
        # instead of failing on the missing index.
        index_str = requested.partition(":")[2]
        try:
            index = int(index_str) if index_str else 0
        except ValueError:
            index = -1  # malformed index: fall through to the CPU fallback
        if 0 <= index < torch.cuda.device_count():
            return torch.device(requested)

    if (
        requested.startswith("mps")
        and hasattr(torch.backends, "mps")
        and torch.backends.mps.is_available()
    ):
        return torch.device("mps")

    return torch.device("cpu")


def setup_simple_config(setup_config):
    """
    Function to set up the configuration for a simple model, including:
    - Validation of environment, approach, data directory, threads and vocab size
    - Device selection with CPU fallback
    - Seed initialization
    - File selection based on `data_ds` (named split or an 80-10-10 subset)
    - Vocabulary and embedding matrix setup
    - Train/val/test dataloader creation
    - A descriptive `save_name` built from the settings

    Args:
        setup_config (dict): Configuration dictionary. Required keys: env,
            approach, model_name, data_dir, device, seed, threads, data_ds,
            vocab_size, rows, cols, tokens, batch, lr, mu, epochs, patience,
            save_int, save_dir. Optional keys: vocab_space (default True),
            vocab_case (default "lower"); invalid values for these two fall
            back to the defaults instead of raising.

    Returns:
        dict: Fully configured dictionary for the model setup. Besides the
        validated inputs it holds DEVICE, THREADS, data_ds, train_dir/val_dir/
        test_dir, vocab, wvs, the three loaders and save_name.

    Raises:
        ValueError: If env, approach, data_dir, threads, data_ds or vocab_size
            fail validation.
        KeyError: If a required key is missing from `setup_config`.
    """

    ######## INITIALIZE CONFIG ########
    config = {}

    ######## ENVIRONMENT & MODEL INFO ########
    valid_envs = ["gcp", "bvm", "local", "colab"]
    if setup_config["env"] not in valid_envs:
        raise ValueError(f"ERR: env must be one of {valid_envs}")
    config["env"] = setup_config["env"]

    valid_approaches = ["simple", "saffu", "bert"]
    if setup_config["approach"] not in valid_approaches:
        raise ValueError(f"ERR: approach must be one of {valid_approaches}")
    config["approach"] = setup_config["approach"]

    config["model_name"] = setup_config["model_name"]

    ######## DATA DIR ########
    if not os.path.isdir(setup_config["data_dir"]):
        raise ValueError(f"ERR: data_dir '{setup_config['data_dir']}' is not a valid path")
    config["data_dir"] = setup_config["data_dir"]

    ######## DEVICE ########
    config["DEVICE"] = _resolve_device(setup_config["device"])

    ######## THREADS ########
    threads = setup_config["threads"]
    if not isinstance(threads, (int, float)):
        raise ValueError("ERR: threads must be a number")
    # os.cpu_count() may return None when the count cannot be determined;
    # in that case the headroom check is skipped rather than crashing.
    cpu_total = os.cpu_count()
    if cpu_total is not None and cpu_total - int(threads) < 4:
        raise ValueError(
            f"ERR: Using {int(threads)} threads would leave insufficient free threads."
        )
    config["THREADS"] = max(1, int(threads))

    ######## SEED ########
    config["seed"] = setup_config["seed"]
    set_seed(config["seed"])

    ######## DATASET SELECTION ########
    max_files = 2450  # hard-coded cap on how many files an integer data_ds may request
    data_ds = setup_config["data_ds"]
    if isinstance(data_ds, str) and data_ds in ["manual", "all"]:
        # Named split: use every file found in "<data_ds>_train/_val/_test"
        split_prefix = data_ds
        split_sizes = None
    elif isinstance(data_ds, int) and not isinstance(data_ds, bool):
        if data_ds <= 0:
            raise ValueError(f"ERR: data_ds '{data_ds}' must be a positive integer.")
        # Validate that data_ds does not exceed the cap
        if data_ds > max_files:
            raise ValueError(f"ERR: data_ds '{data_ds}' exceeds the maximum allowed files (2450).")

        # 80-10-10 split in integer arithmetic; any remainder means the
        # requested size cannot be split evenly.
        train_size = data_ds * 8 // 10
        val_size = data_ds // 10
        test_size = data_ds // 10
        if train_size + val_size + test_size != data_ds:
            raise ValueError(f"ERR: {data_ds} cannot be evenly split into 80-10-10.")

        # Integer subsets are drawn from the "all_*" directories
        split_prefix = "all"
        split_sizes = (train_size, val_size, test_size)
    else:
        raise ValueError(
            f"ERR: data_ds '{data_ds}' must be a valid integer <= 2450 or one of ['manual', 'all']"
        )

    # Recorded so the save name below can include it (it was previously read
    # from config without ever being stored there).
    config["data_ds"] = data_ds
    config["train_dir"] = os.path.join(config["data_dir"], f"{split_prefix}_train")
    config["val_dir"] = os.path.join(config["data_dir"], f"{split_prefix}_val")
    config["test_dir"] = os.path.join(config["data_dir"], f"{split_prefix}_test")

    if split_sizes is None:
        train_files, _ = get_fileList(config["train_dir"])
        val_files, _ = get_fileList(config["val_dir"])
        test_files, _ = get_fileList(config["test_dir"])
    else:
        # Seeded so the same subset is selected on every run with this config
        train_files, _ = get_fileList(config["train_dir"], num_files=split_sizes[0], seed=config["seed"])
        val_files, _ = get_fileList(config["val_dir"], num_files=split_sizes[1], seed=config["seed"])
        test_files, _ = get_fileList(config["test_dir"], num_files=split_sizes[2], seed=config["seed"])

    ######## VOCAB ########
    # Validate vocab parameters
    if not isinstance(setup_config["vocab_size"], int) or not 4 <= setup_config["vocab_size"] <= 2000000:
        raise ValueError(f"ERR: vocab_size '{setup_config['vocab_size']}' must be an integer between 4 and 2,000,000")

    vocab_space = setup_config.get("vocab_space", True)
    if not isinstance(vocab_space, bool):
        vocab_space = True

    vocab_case = setup_config.get("vocab_case", "lower")
    if vocab_case not in ["both", "upper", "lower"]:
        vocab_case = "lower"

    # Generate vocab object using train_files as the source.
    # NOTE(review): a recorded run of this function in this notebook ended with
    # "TypeError: stat: path should be string, bytes, os.PathLike or integer,
    # not list". Presumably get_vocab or SpreadsheetDataLoader expects a
    # directory path rather than a list of files; confirm against their
    # signatures before relying on these calls.
    config["vocab"] = get_vocab(
        train_files,
        setup_config["vocab_size"],
        space=vocab_space,
        case=vocab_case,
        threads=config["THREADS"]
    )

    ######## WVS ########
    config["wvs"] = create_embeddings(config["vocab"])
    # Stored vocab_size is the number of embedding rows, not the requested size
    config["vocab_size"] = config["wvs"].shape[0]
    config["vocab_space"] = vocab_space
    config["vocab_case"] = vocab_case

    ######## CREATE DATA LOADERS ########
    config["train_loader"] = SpreadsheetDataLoader(
        train_files, config["vocab"], setup_config["rows"], setup_config["cols"], setup_config["tokens"], threads=config["THREADS"]
    )
    config["val_loader"] = SpreadsheetDataLoader(
        val_files, config["vocab"], setup_config["rows"], setup_config["cols"], setup_config["tokens"], threads=config["THREADS"]
    )
    config["test_loader"] = SpreadsheetDataLoader(
        test_files, config["vocab"], setup_config["rows"], setup_config["cols"], setup_config["tokens"], threads=config["THREADS"]
    )

    ######## TRAINING PARAMETERS ########
    for key in ("batch", "lr", "mu", "epochs", "patience", "save_int", "save_dir"):
        config[key] = setup_config[key]

    ######## SAVE NAME ########
    # Layout: <env>_<approach>_<model>_s<seed>__<data_ds>_<rows>x<cols>x<tokens>__<vocab>__b<batch>lr<lr>e<epochs>p<patience>
    case_prefix = {"both": "b", "upper": "u", "lower": "l"}[config["vocab_case"]]
    space_str = "Sp" if config["vocab_space"] else "Nsp"
    vocab_str = f"{case_prefix}{space_str}{config['vocab_size']//1000}k"

    save_name = "__".join([
        "_".join([config["env"], config["approach"], config["model_name"], f"s{config['seed']}"]),
        "_".join([str(config["data_ds"]), f"{setup_config['rows']}x{setup_config['cols']}x{setup_config['tokens']}"]),
        vocab_str,
        f"b{setup_config['batch']}lr{setup_config['lr']:.0e}e{setup_config['epochs']}p{setup_config['patience']}"
    ])
    config["save_name"] = save_name

    return config


In [32]:
# Build the full run configuration from the base settings; the returned dict is the cell output.
# NOTE(review): the recorded output of this cell is a TypeError ("stat: path should be
# string ... not list"), so this call did not complete in the saved run.
setup_simple_config(setup_config)

TypeError: stat: path should be string, bytes, os.PathLike or integer, not list

In [24]:
# NOTE(review): dynamic_dataloader_setup is not defined in any visible cell, and this
# cell's execution count (24) is lower than that of the cells above it. It presumably
# came from an earlier, since-removed cell; confirm it still exists before re-running.
dynamic_dataloader_setup(setup_config2)

Seed set to 0
Train directory: ../../data/farzan/all_train
Validation directory: ../../data/farzan/all_val
Test directory: ../../data/farzan/all_test
Success: 2000 can be split into 80-10-10.
Selected Train files: 1600
Selected Validation files: 200
Selected Test files: 200

First 10 Train files:
['../../data/farzan/all_train/GTO_data_Deposit%20Caulobacter%20Biofilms%2007302018.xlsx', '../../data/farzan/all_train/July%202014%20Summary%20and%20Assignment%20Report%20-%20FINAL%20508.xls', '../../data/farzan/all_train/LED%20lamp%20Bg%20high%20and%20low%20RH.xlsx', '../../data/farzan/all_train/2019sipawardsv2.xlsx', '../../data/farzan/all_train/HEROV1_2024_MCRruns.xlsx', '../../data/farzan/all_train/WallaceMichelle_A-6hdz_Data_20170508.xlsx', '../../data/farzan/all_train/idrportfolio-by-age.xls', '../../data/farzan/all_train/RME%20Sensitivity%20Outputs%20MP%201-25-22.xlsx', '../../data/farzan/all_train/darrell_schoolcraft_000_1_1_1.pst.306.xls', '../../data/farzan/all_train/bias_data_for_hc

In [25]:
# Same check with the 1000-file config.
# NOTE(review): dynamic_dataloader_setup is not defined in any visible cell; confirm it
# is still available on a fresh kernel.
dynamic_dataloader_setup(setup_config3)

Seed set to 0
Train directory: ../../data/farzan/all_train
Validation directory: ../../data/farzan/all_val
Test directory: ../../data/farzan/all_test
Success: 1000 can be split into 80-10-10.
Selected Train files: 800
Selected Validation files: 100
Selected Test files: 100

First 10 Train files:
['../../data/farzan/all_train/GTO_data_Deposit%20Caulobacter%20Biofilms%2007302018.xlsx', '../../data/farzan/all_train/July%202014%20Summary%20and%20Assignment%20Report%20-%20FINAL%20508.xls', '../../data/farzan/all_train/LED%20lamp%20Bg%20high%20and%20low%20RH.xlsx', '../../data/farzan/all_train/2019sipawardsv2.xlsx', '../../data/farzan/all_train/HEROV1_2024_MCRruns.xlsx', '../../data/farzan/all_train/WallaceMichelle_A-6hdz_Data_20170508.xlsx', '../../data/farzan/all_train/idrportfolio-by-age.xls', '../../data/farzan/all_train/RME%20Sensitivity%20Outputs%20MP%201-25-22.xlsx', '../../data/farzan/all_train/darrell_schoolcraft_000_1_1_1.pst.306.xls', '../../data/farzan/all_train/bias_data_for_hcl

In [26]:
# Setup config that selects the "manual" dataset split (seed 0)
setup_config4 = dict(
    # Environment and model info
    env="local",
    approach="simple",
    model_name="SimpleGeluEmbedAdd",
    # System configuration
    device="cuda:0",
    threads=12,
    seed=0,
    # Data configuration
    data_dir="../../data/farzan",
    data_ds="manual",
    # Model parameters
    rows=100,
    cols=100,
    tokens=32,
    # Vocabulary parameters
    vocab_size=150000,
    vocab_space=True,
    vocab_case="both",
    # Training parameters
    batch=40,
    lr=5e-3,
    mu=0.25,
    epochs=20,
    patience=2,
    save_int=10,
    save_dir="../models/",
)

In [27]:
# Same check with the "manual" split config.
# NOTE(review): dynamic_dataloader_setup is not defined in any visible cell; confirm it
# is still available on a fresh kernel.
dynamic_dataloader_setup(setup_config4)

Seed set to 0
Train directory: ../../data/farzan/all_train
Validation directory: ../../data/farzan/all_val
Test directory: ../../data/farzan/all_test
Success: data_ds 'manual' is valid.
Train files: 40
Validation files: 5
Test files: 5

First 10 Train files:
['../../data/farzan/manual_train/advanced-placement-science-enrollment.xlsx', '../../data/farzan/manual_train/portfolio-by-school-type.xls', '../../data/farzan/manual_train/dlportfolio-by-debt-size.xls', '../../data/farzan/manual_train/dlbyforbearancetype copy.xls', '../../data/farzan/manual_train/harassment-bullying-on-basis-of-disability-disciplined copy.xlsx', '../../data/farzan/manual_train/dlbydefermenttype copy.xls', '../../data/farzan/manual_train/portfolio-by-age copy.xls', '../../data/farzan/manual_train/harassment-bullying-on-basis-of-race-disciplined.xlsx', '../../data/farzan/manual_train/advanced-placement-enrollment (1).xlsx', '../../data/farzan/manual_train/advanced-placement-science-enrollment (1) copy.xlsx']

First 