In [None]:
# ---- Mount Google Drive, then make sure the project folders exist
import os

from google.colab import drive

# force_remount=True asks Colab to mount again even if Drive is already attached.
drive.mount('/content/drive', force_remount=True)

# Root of the project on the mounted Drive; later cells build paths from it.
PROJ = "/content/drive/MyDrive/dissertation"

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
for subfolder in ("data", "outputs"):
    os.makedirs(f"{PROJ}/{subfolder}", exist_ok=True)

print(" Drive mounted and folders ready!")


Mounted at /content/drive
 Drive mounted and folders ready!


In [None]:
# ---- Read the project configuration (JSON) into `cfg`
import json

CONFIG_PATH = "/content/drive/MyDrive/dissertation/config.json"

# Parse the whole file in one go; `cfg` is the dict the later cells read from.
with open(CONFIG_PATH) as config_file:
    cfg = json.loads(config_file.read())

print(" Config loaded successfully!")

# Echo the parsed settings so the values used for this run are visible in the output.
pretty_cfg = json.dumps(cfg, indent=2)
print(pretty_cfg)


✅ Config loaded successfully!
{
  "datasets": {
    "adult": {
      "path": "data/Adult_clean.csv",
      "target": "income",
      "embed_candidates": [
        "occupation",
        "workclass",
        "native_country"
      ]
    },
    "petfinder": {
      "path": "data/Petfinder_clean.csv",
      "target": "AdoptionSpeed_bin",
      "embed_candidates": [
        "Breed1",
        "Color1",
        "MaturitySize"
      ]
    },
    "breast": {
      "path": "data/Breast_clean.csv",
      "target": "OS5yr_bin",
      "embed_candidates": [
        "TNM_PATH_T",
        "TNM_PATH_N",
        "TUMOR_SIZE"
      ]
    }
  },
  "splits": {
    "test_size": 0.2,
    "val_size": 0.2,
    "random_state": 42,
    "stratify": true
  }
}


In [None]:
# ---- Load datasets + split function
import pandas as pd
from sklearn.model_selection import train_test_split

def load_dataset(key, base_dir="/content/drive/MyDrive/dissertation", config=None):
    """Read one configured dataset from CSV and separate features from the target.

    Parameters
    ----------
    key : str
        Name of an entry under ``config["datasets"]``.
    base_dir : str, optional
        Directory that the entry's ``"path"`` is resolved against. Defaults to
        the project folder on the mounted Drive, so ``load_dataset(key)``
        behaves as before.
    config : dict, optional
        Configuration mapping to read the dataset entry from. When omitted,
        the notebook-level ``cfg`` is used.

    Returns
    -------
    tuple
        ``(X, y, meta)``: the frame without the target column, the target
        column, and the dataset's configuration entry.

    Raises
    ------
    KeyError
        If ``key`` is not a configured dataset, or if the configured target
        column is not present in the CSV.
    """
    import os  # local import so the function does not rely on another cell's import

    settings = cfg if config is None else config
    meta = settings["datasets"][key]

    # os.path.join handles a trailing separator on base_dir; for the default
    # base_dir and a relative "path" it yields the same path as plain concatenation.
    path = os.path.join(base_dir, meta["path"])
    df = pd.read_csv(path, low_memory=False)

    target = meta["target"]
    if target not in df.columns:
        # Same exception type as the bare column lookup, but with an actionable message.
        raise KeyError(f"Target column {target!r} not found in {path}")

    y = df[target]
    X = df.drop(columns=[target])
    print(f"Loaded {key}: {df.shape[0]} rows, {df.shape[1]} columns")
    return X, y, meta

def make_splits(X, y, cfg):
    """Partition features and target into train, validation and test sets.

    The split is done in two stages: first a hold-out of size
    ``val_size + test_size`` is separated from the training data, then that
    hold-out is divided into validation and test so that each ends up with
    its configured share of the full dataset.

    Parameters
    ----------
    X, y
        Features and target, passed straight to ``train_test_split``.
    cfg : dict
        Configuration whose ``"splits"`` entry provides ``test_size``,
        ``val_size``, ``random_state`` and ``stratify``.

    Returns
    -------
    tuple
        ``((X_tr, y_tr), (X_val, y_val), (X_te, y_te))``.
    """
    split_cfg = cfg["splits"]

    # Stage 1: training data vs. a combined validation+test hold-out.
    first_strata = y if split_cfg["stratify"] else None
    holdout_size = split_cfg["val_size"] + split_cfg["test_size"]
    seed = split_cfg["random_state"]
    X_tr, X_tmp, y_tr, y_tmp = train_test_split(
        X,
        y,
        test_size=holdout_size,
        random_state=seed,
        stratify=first_strata,
    )

    # Stage 2: the test share is expressed relative to the hold-out, not the full data.
    test_share = split_cfg["test_size"] / (split_cfg["val_size"] + split_cfg["test_size"])
    second_strata = None if first_strata is None else y_tmp
    X_val, X_te, y_val, y_te = train_test_split(
        X_tmp,
        y_tmp,
        test_size=test_share,
        random_state=split_cfg["random_state"],
        stratify=second_strata,
    )

    train_part = (X_tr, y_tr)
    val_part = (X_val, y_val)
    test_part = (X_te, y_te)
    return train_part, val_part, test_part


In [None]:
# Record the embedding-feature change in the breast dataset's cleaning notes.
# The entry is appended only if the notes file does not already contain it, so
# re-running this cell (or Restart & Run All) no longer writes duplicate entries.
# NOTE(review): the config printed earlier in this notebook still lists TUMOR_SIZE
# (not hospid) under the breast "embed_candidates" — confirm config.json matches this note.
NOTES_PATH = f"{PROJ}/outputs/breast_cleaning_notes.txt"
NOTE_ENTRY = "\n\n[2025-11-04] Updated embedding features: TNM_PATH_T, TNM_PATH_N, hospid (replaces tumor_size)."

# "a+" creates the file when missing, allows reading, and always writes at the end.
# errors="replace" keeps the duplicate check from failing on undecodable bytes
# already present in the file.
with open(NOTES_PATH, "a+", encoding="utf-8", errors="replace") as notes_file:
    notes_file.seek(0)  # append mode may start positioned at the end of the file
    if NOTE_ENTRY.strip() not in notes_file.read():
        notes_file.write(NOTE_ENTRY)
