In [None]:
# ==================================================
# 04d_synthetic_eval_embeddings.ipynb
# Compare CTGAN/TVAE synthetic quality across
# different representations of the 3 high-card
# categorical candidates per dataset.
# ==================================================

!pip -q install numpy==1.26.4 pandas==2.2.2 scikit-learn==1.5.2 lightgbm==4.5.0 \
               sdv==1.16.0 ctgan==0.10.2

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import functools
import os, time
import random
import traceback
from pathlib import Path

import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier

from sdv.single_table import CTGANSynthesizer, TVAESynthesizer
from sdv.metadata import SingleTableMetadata

# ----- Paths / constants -----
RSEED = 42

# Seed every global RNG the pipeline can draw from. Seeding NumPy alone is not
# enough: the CTGAN / TVAE synthesizers are PyTorch models, so without a torch
# seed their fits differ from run to run.
random.seed(RSEED)
np.random.seed(RSEED)
torch.manual_seed(RSEED)

PROJ    = "/content/drive/MyDrive/dissertation"
DATA_DIR = f"{PROJ}/data"                 # cleaned input CSVs
EMB_DIR  = f"{PROJ}/outputs/embeddings"  # W2V / FT / N2V prefixed files
OUT_ROOT = f"{PROJ}/outputs/synthetic_eval_4d"  # per-view summaries are written below this
os.makedirs(OUT_ROOT, exist_ok=True)

print("Project dir:", PROJ)
print("Output dir :", OUT_ROOT)


Mounted at /content/drive
Project dir: /content/drive/MyDrive/dissertation
Output dir : /content/drive/MyDrive/dissertation/outputs/synthetic_eval_4d


In [None]:
# ==========================================
# Dataset config + 3 candidate categorical
# columns per dataset (as in 03a).
# ==========================================

# Per-dataset input file plus the column names that may hold the binary target
# (the first one present in the CSV is used).
DATASETS = {
    "adult": {
        "path": f"{DATA_DIR}/Adult_clean.csv",
        "target_candidates": ["income_bin", "Income_bin", "income", "Income", "target", "class"]
    },
    "petfinder": {
        "path": f"{DATA_DIR}/Petfinder_clean.csv",
        "target_candidates": ["AdoptionSpeed_bin", "adoption_bin", "AdoptionSpeed"]
    },
    "breast": {
        "path": f"{DATA_DIR}/Breast_clean.csv",
        "target_candidates": ["OS5yr_bin", "os5yr_bin", "OS5yr", "target"]
    }
}

# These are exactly the three high-card candidates we used in 03a
CANDIDATE_CATS = {
    "adult":      ["occupation", "workclass", "native_country"],
    "petfinder":  ["Breed1", "Color1", "MaturitySize"],
    "breast":     ["TNM_PATH_T", "TNM_PATH_N", "hospid"],
}

# Views to compare:
# - numeric_only     : baseline, like 04c
# - w2v / node2vec   : attach the embedding vectors of each candidate column
# NOTE: "fasttext" has a file template below and is supported by the helpers,
# but it is not part of this run; add it here to include it.
FEATURE_VIEWS = ["numeric_only", "w2v", "node2vec"]

# Mapping from view -> file template for the prefixed embedding CSV
EMBED_FILE_TPL = {
    "w2v":      "{ds}_w2v_64d_prefixed.csv",
    "fasttext": "{ds}_fasttext_64d_prefixed.csv",
    "node2vec": "{ds}_node2vec_64d_prefixed.csv",
}

# Synthetic + model config (roughly mirroring 04c, but lighter)
CONFIG = {
    # use a slightly smaller train fraction to reduce rows seen by the generators
    "train_frac": 0.5,

    # CTGAN: fewer epochs since we only use it for numeric_only
    "ctgan": {
        "epochs": 30,       # was 40/60
        "batch_size": 256,
        "pac": 1,
        "verbose": True
    },

    # TVAE: still main workhorse, but cut epochs
    "tvae": {
        "epochs": 30,       # was 60
        "batch_size": 256,
        "compress_dims": [128, 64]  # was [256, 128]
    },

    "models": {
        "LR": {
            "type": "lr",
            "params": {"max_iter": 1000, "n_jobs": -1}
        },
        "LinSVM": {
            "type": "linsvm",
            # seeded like RF / LGBM so repeated runs give the same scores
            "params": {"C": 1.0, "random_state": RSEED}
        },
        "RF": {
            "type": "rf",
            "params": {
                "n_estimators": 400,
                "max_depth": None,
                "n_jobs": -1,
                "random_state": RSEED
            }
        },
        "LGBM": {
            "type": "lgbm",
            "params": {
                "n_estimators": 500,
                "num_leaves": 63,
                "random_state": RSEED,
                # silence the per-fit [LightGBM] [Info] lines that bury the results
                "verbose": -1
            }
        }
    }
}



In [None]:
# ====================
# Small helper utils
# ====================

RUNLOG = []

def _timed(name):
    """Simple timing decorator that logs wall-clock time."""
    def deco(fn):
        def wrapper(*args, **kwargs):
            t0 = time.time()
            print(f"\n‚è±  [{name}] start")
            out = fn(*args, **kwargs)
            dt = time.time() - t0
            print(f" [{name}] done in {dt:0.1f}s")
            RUNLOG.append({"step": name, "seconds": dt})
            return out
        return wrapper
    return deco

def pick_target(df, candidates):
    """Return ``(name, df)`` for the first entry of ``candidates`` that is a column of ``df``.

    The dataframe is handed back unchanged. Raises ValueError when none of the
    candidate names is present.
    """
    not_found = object()
    chosen = next((name for name in candidates if name in df.columns), not_found)
    if chosen is not_found:
        raise ValueError(f"None of {candidates} found in dataframe columns: {list(df.columns)}")
    return chosen, df

def make_model(tag, X_train, y_train):
    """Build the classifier registered under ``tag`` in CONFIG['models'] and fit it.

    The entry's ``"type"`` selects the estimator class and a copy of its
    ``"params"`` is passed to the constructor. Returns the fitted estimator;
    raises ValueError (carrying the type string) for an unknown type.
    """
    spec = CONFIG["models"][tag]
    kind = spec["type"]
    hyper = spec["params"].copy()

    # type string -> estimator class
    estimators = {
        "lr": LogisticRegression,
        "linsvm": LinearSVC,
        "rf": RandomForestClassifier,
        "lgbm": LGBMClassifier,
    }
    if kind not in estimators:
        raise ValueError(kind)

    fitted = estimators[kind](**hyper)
    fitted.fit(X_train, y_train)
    return fitted

def _pred_proba_or_decision(model, X):
    """Get a 1D score for AUC / thresholding."""
    # Try predict_proba
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    # Else decision_function
    if hasattr(model, "decision_function"):
        s = model.decision_function(X)
        # if shape is (n,), return directly; if (n,2), take second column
        if s.ndim == 1:
            return s
        return s[:, 1]
    # Fallback: use hard predictions (not ideal for AUC but avoids crash)
    return model.predict(X)

def score_suite(tag, X_train, y_train, X_test, y_test, feature_cols):
    """Fit every model in CONFIG['models'] on the train set and score it on the test set.

    Parameters
    ----------
    tag : str
        Label stored in the "Tag" column of every result row.
    X_train, X_test : pandas.DataFrame
        Feature frames; only ``feature_cols`` are used.
    y_train : pandas.Series
        Training labels (``.values`` is passed to the estimator).
    y_test : array-like
        Ground-truth labels used for all metrics.
    feature_cols : list of str
        Columns, in order, that form the model input.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns Tag, Model, AUC, F1, ACC, PREC, REC.
    """
    rows = []
    Xtr = X_train[feature_cols].values
    Xte = X_test[feature_cols].values

    for mtag in CONFIG["models"].keys():
        print(f"   ‚Üí [{tag}] model={mtag}")
        model = make_model(mtag, Xtr, y_train.values)
        scores = _pred_proba_or_decision(model, Xte)

        # Pick the cut-off from the API that produced the scores rather than
        # from their observed range: decision-function margins can fall inside
        # [0, 1] by chance and must still be cut at 0, not 0.5.
        uses_margin = (not hasattr(model, "predict_proba")
                       and hasattr(model, "decision_function"))
        threshold = 0.0 if uses_margin else 0.5
        y_hat = (scores >= threshold).astype(int)

        auc  = roc_auc_score(y_test, scores)
        # zero_division=0 returns the same 0.0 as the default but without a
        # warning per call when a model predicts a single class.
        f1   = f1_score(y_test, y_hat, zero_division=0)
        acc  = accuracy_score(y_test, y_hat)
        prec = precision_score(y_test, y_hat, zero_division=0)
        rec  = recall_score(y_test, y_hat, zero_division=0)

        rows.append({
            "Tag": tag,
            "Model": mtag,
            "AUC": auc,
            "F1": f1,
            "ACC": acc,
            "PREC": prec,
            "REC": rec
        })

    return pd.DataFrame(rows)


In [None]:
# ==================================================
# Embedding helpers: W2V / FastText / Node2Vec views
# ==================================================

def load_prefixed_embedding_matrix(ds_name, view):
    """
    Read the prefixed embedding CSV for ``ds_name`` / ``view`` from EMB_DIR.

    The file name comes from EMBED_FILE_TPL[view]. The result is indexed by
    'token' (e.g. 'occupation:Exec-managerial'); when the file has no 'token'
    column, its first column is used as the token.

    Raises ValueError for a view without a template and FileNotFoundError when
    the CSV does not exist.
    """
    if view not in EMBED_FILE_TPL:
        raise ValueError(f"View {view} is not a prefixed embedding view.")

    csv_path = os.path.join(EMB_DIR, EMBED_FILE_TPL[view].format(ds=ds_name))
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Embedding file not found: {csv_path}")

    table = pd.read_csv(csv_path)
    if "token" not in table.columns:
        first_col = table.columns[0]
        table = table.rename(columns={first_col: "token"})
    table = table.set_index("token")

    print(f"   Loaded {view} embeddings for {ds_name} ‚Üí {table.shape}")
    return table

def make_view_numeric(df_raw, ds_name, view):
    """
    Build the numeric feature frame for one dataset / view.

    - numeric_only: return the numeric / bool columns only (baseline). The
      candidate columns are cast to strings first, so they never appear here.
    - w2v / fasttext / node2vec: for each candidate categorical column, join
      the embedding vector of its value (columns named ``{col}_{view}_{i}``),
      drop the original candidate columns, and keep the numeric / bool columns.
      Values with no embedding row yield NaN in the joined columns.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Cleaned features (not modified; a copy is used).
    ds_name : str
        Key into CANDIDATE_CATS.
    view : str
        One of "numeric_only", "w2v", "fasttext", "node2vec".

    Returns
    -------
    (pandas.DataFrame, list of str)
        The feature frame (same rows and order as ``df_raw``) and its columns.

    Raises
    ------
    ValueError
        If a candidate column is missing or the view is not supported.
    """
    df = df_raw.copy()
    cats = CANDIDATE_CATS[ds_name]

    # Make sure candidate cols exist
    missing = [c for c in cats if c not in df.columns]
    if missing:
        raise ValueError(f"{ds_name}: missing candidate columns {missing} in dataframe.")

    if view not in ("numeric_only", "w2v", "fasttext", "node2vec"):
        raise ValueError(f"Unsupported view: {view}")

    # Normalise to clean strings so they match the embedding tokens.
    # The missing-value mask is taken BEFORE the string cast: astype(str)
    # turns NaN into the literal "nan", after which fillna can never fire.
    # NOTE(review): assumes the embedding files encode missing values as
    # "Unknown" too - confirm against the notebook that produced them.
    for c in cats:
        was_missing = df[c].isna()
        cleaned = df[c].astype(str).str.strip()
        df[c] = cleaned.mask(was_missing | cleaned.eq(""), "Unknown")

    if view == "numeric_only":
        # Just keep numeric columns (like 04c did)
        num_cols = df.select_dtypes(include=["number", "bool"]).columns.tolist()
        return df[num_cols].copy(), num_cols

    E = load_prefixed_embedding_matrix(ds_name, view)

    for col in cats:
        prefix = f"{col}:"
        # na=False: a missing token must simply not match instead of
        # producing a NaN entry in the boolean mask.
        sub = E[E.index.str.startswith(prefix, na=False)].copy()
        if sub.empty:
            print(f"   No embedding rows found for prefix '{prefix}' in {view} for {ds_name}")
            continue

        # Strip the prefix (and stray whitespace, since the dataframe values
        # were stripped above) to recover the raw category value.
        sub.index = sub.index.str[len(prefix):].str.strip()

        # The join below must stay one-to-one on the left: a repeated key would
        # duplicate dataframe rows and break alignment with the labels.
        dup_mask = sub.index.duplicated(keep="first")
        if dup_mask.any():
            print(f"   Dropping {int(dup_mask.sum())} duplicate embedding keys for '{col}' in {view}")
            sub = sub[~dup_mask]

        # Rename columns so they carry (col, view, dim) info
        sub.columns = [f"{col}_{view}_{i}" for i in range(sub.shape[1])]

        # Join onto df by the raw categorical value
        df = df.join(sub, on=col)

    # Drop the original candidate categorical columns; they are now embedded
    df = df.drop(columns=cats, errors="ignore")

    # Keep only numeric columns in the final X
    num_cols = df.select_dtypes(include=["number", "bool"]).columns.tolist()
    return df[num_cols].copy(), num_cols


In [None]:
# ==================================================
# CTGAN / TVAE + TSTR/TRTS for a single dataset/view
# ==================================================

def _labels_ok(y):
    """Check that we have a proper binary label with both classes present."""
    vals = pd.Series(y).dropna().unique().tolist()
    vals = sorted(vals)
    return len(vals) == 2 and set(vals) == {0, 1}

@_timed("run_dataset_view")
def run_dataset_view(ds_name, view):
    """Run the full synthetic-data evaluation for one dataset / feature view.

    Steps: load the cleaned CSV, pick the target, build the feature view,
    split train / val / test, fit TVAE (and CTGAN for the numeric_only view)
    on the train split, sample synthetic rows, then score TSTR (train on
    synthetic, test on real) and TRTS (train on real, test on synthetic).

    Parameters
    ----------
    ds_name : str
        Key into DATASETS / CANDIDATE_CATS.
    view : str
        Feature view passed to ``make_view_numeric``.

    Returns
    -------
    pandas.DataFrame
        Columns Dataset, View, Tag, Model, AUC, F1, ACC, PREC, REC. Empty
        (header only) when no generator produced usable synthetic data. The
        same frame is written to ``<OUT_ROOT>/<ds>_<view>/summary_tstr_trts.csv``.

    Raises
    ------
    ValueError
        If the training labels are not binary {0, 1}.
    """
    ds_cfg  = DATASETS[ds_name]
    out_dir = f"{OUT_ROOT}/{ds_name}_{view}"
    os.makedirs(out_dir, exist_ok=True)

    print(f"\n==============================")
    print(f" Dataset = {ds_name.upper()} | View = {view}")
    print(f"==============================")

    # ----- Load full cleaned dataset -----
    df_full = pd.read_csv(ds_cfg["path"], low_memory=False)
    print("Raw shape:", df_full.shape)

    # ----- Pick target -----
    target, df_full = pick_target(df_full, ds_cfg["target_candidates"])
    print(f"Using target column: {target}")
    y = df_full[target].astype(int)
    X_raw = df_full.drop(columns=[target])

    # ----- Apply view (numeric_only vs embeddings) -----
    X_view, num_cols = make_view_numeric(X_raw, ds_name, view)
    print(f"View '{view}' num_cols: {len(num_cols)}")

    # Treat infinities as missing; imputation happens after the split.
    X_view = X_view.replace([np.inf, -np.inf], np.nan)

    # ----- Train/val/test splits -----
    X_tr, X_tmp, y_tr, y_tmp = train_test_split(
        X_view, y, train_size=CONFIG["train_frac"],
        stratify=y, random_state=RSEED
    )
    X_va, X_te, y_va, y_te = train_test_split(
        X_tmp, y_tmp, test_size=0.5,
        stratify=y_tmp, random_state=RSEED
    )

    # Impute with medians learned on the TRAIN split only, so no statistic of
    # the held-out rows leaks into training. A column with no observed value in
    # train has no median; it falls back to 0.0 so no NaN reaches the models.
    train_medians = X_tr.median(numeric_only=True).fillna(0.0)
    X_tr = X_tr.fillna(train_medians)
    X_va = X_va.fillna(train_medians)
    X_te = X_te.fillna(train_medians)
    print("Shapes (train / val / test):", X_tr.shape, X_va.shape, X_te.shape)

    # Sanity check labels
    if not _labels_ok(y_tr):
        raise ValueError("Training labels are not proper binary {0,1}.")

    # ----- SDV metadata (numeric + categorical target) -----
    @_timed(f"{ds_name}_{view}:build_metadata")
    def build_metadata_full(Xtrain, ytrain):
        train_full = Xtrain.copy()
        train_full[target] = ytrain.values

        meta = SingleTableMetadata()
        meta.detect_from_dataframe(train_full)

        # Force all feature columns to be numerical, target categorical
        for c in num_cols:
            meta.update_column(c, sdtype="numerical")
        meta.update_column(target, sdtype="categorical")
        meta.validate()
        return meta, train_full

    meta_full, train_full = build_metadata_full(X_tr, y_tr)

    # ----- Fit CTGAN / TVAE -----
    @_timed(f"{ds_name}_{view}:fit_CTGAN")
    def fit_ctgan(train_full, meta):
        cfg = CONFIG["ctgan"]
        synth = CTGANSynthesizer(
            metadata=meta,
            epochs=cfg["epochs"],
            batch_size=cfg["batch_size"],
            pac=cfg["pac"],
            verbose=cfg["verbose"],
        )
        synth.fit(train_full)
        return synth

    @_timed(f"{ds_name}_{view}:fit_TVAE")
    def fit_tvae(train_full, meta):
        cfg = CONFIG["tvae"]
        synth = TVAESynthesizer(
            metadata=meta,
            epochs=cfg["epochs"],
            batch_size=cfg["batch_size"],
            compress_dims=cfg["compress_dims"],
        )
        synth.fit(train_full)
        return synth

    # Only use CTGAN on the compact numeric-only view
    ctgan = None
    if view == "numeric_only":
        ctgan = fit_ctgan(train_full, meta_full)
    else:
        print(f"Skipping CTGAN for view={view} (too many columns / too wide).")

    tvae = fit_tvae(train_full, meta_full)

    # ----- Sample synthetic data -----
    # Cap to keep generation + training cheaper; override via CONFIG["max_synth_rows"].
    MAX_SYNTH_ROWS = CONFIG.get("max_synth_rows", 15000)
    n_synth = min(len(X_tr), MAX_SYNTH_ROWS)

    def sample_synth(synth, n_rows, tag):
        print(f"\nSampling {n_rows} rows from {tag} ...")
        df_syn = synth.sample(num_rows=n_rows)
        if target not in df_syn.columns:
            raise ValueError(f"{tag}: target column '{target}' missing in synthetic sample.")
        # Clean target
        df_syn = df_syn.dropna(subset=[target])
        df_syn[target] = df_syn[target].round().astype(int)
        return df_syn

    def _features_and_labels(df_syn):
        # Split a synthetic sample into (X, y); inf / NaN features are filled
        # with the sample's own column medians.
        labels = df_syn[target].astype(int)
        feats = df_syn[num_cols].copy()
        feats = feats.replace([np.inf, -np.inf], np.nan)
        feats = feats.fillna(feats.median(numeric_only=True))
        return feats, labels

    X_syn_ctgan = y_syn_ctgan = None
    X_syn_tvae  = y_syn_tvae  = None

    # --- CTGAN synthetic (only for numeric_only view) ---
    if ctgan is not None:
        try:
            X_syn_ctgan, y_syn_ctgan = _features_and_labels(
                sample_synth(ctgan, n_rows=n_synth, tag=f"{ds_name}_{view}_CTGAN")
            )
        except Exception as e:
            print("‚ö†Ô∏è  CTGAN sampling failed:", e)

    # --- TVAE synthetic (all views) ---
    try:
        X_syn_tvae, y_syn_tvae = _features_and_labels(
            sample_synth(tvae, n_rows=n_synth, tag=f"{ds_name}_{view}_TVAE")
        )
    except Exception as e:
        print("  TVAE sampling failed:", e)

    # ----- TSTR / TRTS evaluation -----
    def _tstr_trts(suffix, X_syn, y_syn):
        # TSTR: train on synthetic, test on the real held-out split.
        # TRTS: train on the real train split, test on synthetic.
        return [
            score_suite(
                tag=f"tstr_{suffix}",
                X_train=X_syn, y_train=y_syn,
                X_test=X_te,   y_test=y_te,
                feature_cols=num_cols,
            ),
            score_suite(
                tag=f"trts_{suffix}",
                X_train=X_tr,  y_train=y_tr,
                X_test=X_syn,  y_test=y_syn,
                feature_cols=num_cols,
            ),
        ]

    summaries = []

    if X_syn_ctgan is not None and _labels_ok(y_syn_ctgan):
        summaries.extend(_tstr_trts("ctgan", X_syn_ctgan, y_syn_ctgan))
    elif ctgan is None:
        print("  Skipping CTGAN TSTR/TRTS for this view (no CTGAN model).")
    else:
        # A model was fitted, so the cause is the sample itself.
        print("  Skipping CTGAN TSTR/TRTS: synthetic labels not bi-class or sampling failed.")

    if X_syn_tvae is not None and _labels_ok(y_syn_tvae):
        summaries.extend(_tstr_trts("tvae", X_syn_tvae, y_syn_tvae))
    else:
        print("  Skipping TVAE TSTR/TRTS: synthetic labels not bi-class or sampling failed.")

    if len(summaries) == 0:
        print("  No valid TSTR/TRTS summaries for this dataset/view.")
        summary = pd.DataFrame(columns=["Tag","Model","AUC","F1","ACC","PREC","REC"])
    else:
        summary = pd.concat(summaries, axis=0, ignore_index=True)

    # Add metadata columns so we can aggregate later
    summary.insert(0, "Dataset", ds_name.upper())
    summary.insert(1, "View", view)

    out_csv = os.path.join(out_dir, "summary_tstr_trts.csv")
    summary.to_csv(out_csv, index=False)
    print("\n Saved summary ‚Üí", out_csv)

    return summary


In [None]:
# ==========================================
# Run for all datasets √ó views and aggregate
# ==========================================

all_summaries = []
failed_runs = []   # (dataset, view, error) for every run that raised

for ds in DATASETS:
    for view in FEATURE_VIEWS:
        try:
            summary = run_dataset_view(ds, view)
            all_summaries.append(summary)
        except Exception as e:
            # Keep going with the remaining runs, but leave the full traceback
            # in the output so the failure can be diagnosed afterwards.
            print(f"\n Failed for ds={ds}, view={view}: {e}")
            traceback.print_exc()
            failed_runs.append((ds, view, repr(e)))

if len(all_summaries):
    # Header-only summaries (no usable synthetic data) carry no rows; passing
    # them to concat only triggers pandas' empty-entry FutureWarning. If every
    # run came back empty, keep one so the CSV still has its header.
    scored = [s for s in all_summaries if not s.empty]
    combined = pd.concat(scored or all_summaries[:1], axis=0, ignore_index=True)
    agg_path = os.path.join(OUT_ROOT, "summary_all_datasets_views.csv")
    combined.to_csv(agg_path, index=False)
    print("\n========================================")
    print("Combined summary saved ‚Üí", agg_path)
    print("========================================")
    display(combined.head())
else:
    print("No successful runs to aggregate.")

if failed_runs:
    print("\nFailed runs:")
    for ds, view, err in failed_runs:
        print(f" - {ds} / {view}: {err}")

print("\nRunlog:")
for r in RUNLOG:
    print(f" - {r['step']}: {r['seconds']:.1f}s")



‚è±Ô∏è  [run_dataset_view] start

 Dataset = ADULT | View = numeric_only
Raw shape: (48842, 14)
Using target column: income
View 'numeric_only' num_cols: 5
Shapes (train / val / test): (24421, 5) (12210, 5) (12211, 5)

‚è±Ô∏è  [adult_numeric_only:build_metadata] start
‚úÖ [adult_numeric_only:build_metadata] done in 0.0s

‚è±Ô∏è  [adult_numeric_only:fit_CTGAN] start


Gen. (-0.91) | Discrim. (-0.09): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [03:34<00:00,  7.14s/it]


‚úÖ [adult_numeric_only:fit_CTGAN] done in 282.1s

‚è±Ô∏è  [adult_numeric_only:fit_TVAE] start
‚úÖ [adult_numeric_only:fit_TVAE] done in 68.9s

Sampling 15000 rows from adult_numeric_only_CTGAN ...

Sampling 15000 rows from adult_numeric_only_TVAE ...
   ‚Üí [tstr_ctgan] model=LR
   ‚Üí [tstr_ctgan] model=LinSVM
   ‚Üí [tstr_ctgan] model=RF
   ‚Üí [tstr_ctgan] model=LGBM
[LightGBM] [Info] Number of positive: 7030, number of negative: 7970
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 671
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468667 -> initscore=-0.125498
[LightGBM] [Info] Start training from score -0.125498
   ‚Üí [trts_ctgan] model=LR
   ‚Üí [trts_ctgan] model=LinSVM
   ‚Üí [trts_ctgan] model=RF
   ‚Üí [trts_ctgan] model=LGBM
[LightGBM] [In



‚úÖ [adult_w2v:fit_TVAE] done in 2075.0s

Sampling 15000 rows from adult_w2v_TVAE ...
‚ö†Ô∏è  Skipping CTGAN TSTR/TRTS for this view (no CTGAN model).
   ‚Üí [tstr_tvae] model=LR
   ‚Üí [tstr_tvae] model=LinSVM
   ‚Üí [tstr_tvae] model=RF
   ‚Üí [tstr_tvae] model=LGBM
[LightGBM] [Info] Number of positive: 163, number of negative: 14837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 49092
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 196
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010867 -> initscore=-4.511129
[LightGBM] [Info] Start training from score -4.511129
   ‚Üí [trts_tvae] model=LR
   ‚Üí [trts_tvae] model=LinSVM
   ‚Üí [trts_tvae] model=RF
   ‚Üí [trts_tvae] model=LGBM
[LightGBM] [Info] Number of positive: 5844, number of negative: 18577
[LightGBM] [Info] Auto-choosing row-wise multi-th



‚úÖ [adult_node2vec:fit_TVAE] done in 2210.3s

Sampling 15000 rows from adult_node2vec_TVAE ...
‚ö†Ô∏è  Skipping CTGAN TSTR/TRTS for this view (no CTGAN model).
   ‚Üí [tstr_tvae] model=LR
   ‚Üí [tstr_tvae] model=LinSVM
   ‚Üí [tstr_tvae] model=RF
   ‚Üí [tstr_tvae] model=LGBM
[LightGBM] [Info] Number of positive: 116, number of negative: 14884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 49093
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 196
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.007733 -> initscore=-4.854452
[LightGBM] [Info] Start training from score -4.854452
   ‚Üí [trts_tvae] model=LR
   ‚Üí [trts_tvae] model=LinSVM
   ‚Üí [trts_tvae] model=RF
   ‚Üí [trts_tvae] model=LGBM
[LightGBM] [Info] Number of positive: 5844, number of negative: 18577
[LightGBM] [Info] Auto-choosing row-wis

Gen. (-0.56) | Discrim. (-0.07): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:45<00:00,  1.52s/it]


‚úÖ [petfinder_numeric_only:fit_CTGAN] done in 62.6s

‚è±Ô∏è  [petfinder_numeric_only:fit_TVAE] start
‚úÖ [petfinder_numeric_only:fit_TVAE] done in 12.5s

Sampling 5768 rows from petfinder_numeric_only_CTGAN ...

Sampling 5768 rows from petfinder_numeric_only_TVAE ...
   ‚Üí [tstr_ctgan] model=LR
   ‚Üí [tstr_ctgan] model=LinSVM
   ‚Üí [tstr_ctgan] model=RF
   ‚Üí [tstr_ctgan] model=LGBM
[LightGBM] [Info] Number of positive: 2660, number of negative: 3108
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 177
[LightGBM] [Info] Number of data points in the train set: 5768, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.461165 -> initscore=-0.155653
[LightGBM] [Info] Start training from score -0.155653
   ‚Üí [trts_ctgan] model=LR
   ‚Üí [trts_ctgan] model=LinSVM
   ‚Üí [trts_ctgan] model=RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   ‚Üí [trts_ctgan] model=LGBM
[LightGBM] [Info] Number of positive: 1380, number of negative: 4388
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123
[LightGBM] [Info] Number of data points in the train set: 5768, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.239251 -> initscore=-1.156790
[LightGBM] [Info] Start training from score -1.156790
‚ö†Ô∏è  Skipping TVAE TSTR/TRTS: synthetic labels not bi-class or sampling failed.

üßæ Saved summary ‚Üí /content/drive/MyDrive/dissertation/outputs/synthetic_eval_4d/petfinder_numeric_only/summary_tstr_trts.csv
‚úÖ [run_dataset_view] done in 82.2s

‚è±Ô∏è  [run_dataset_view] start

 Dataset = PETFINDER | View = w2v
Raw shape: (11537, 14)
Using target column: AdoptionSpeed_bin
   Loaded w2v embeddings for petfinder ‚Üí (176, 64)
View 'w2v' num_cols: 195
Shapes (train / val 



‚úÖ [petfinder_w2v:fit_TVAE] done in 549.1s

Sampling 5768 rows from petfinder_w2v_TVAE ...
‚ö†Ô∏è  Skipping CTGAN TSTR/TRTS for this view (no CTGAN model).
‚ö†Ô∏è  Skipping TVAE TSTR/TRTS: synthetic labels not bi-class or sampling failed.
‚ö†Ô∏è  No valid TSTR/TRTS summaries for this dataset/view.

üßæ Saved summary ‚Üí /content/drive/MyDrive/dissertation/outputs/synthetic_eval_4d/petfinder_w2v/summary_tstr_trts.csv
‚úÖ [run_dataset_view] done in 555.3s

‚è±Ô∏è  [run_dataset_view] start

 Dataset = PETFINDER | View = node2vec
Raw shape: (11537, 14)
Using target column: AdoptionSpeed_bin
   Loaded node2vec embeddings for petfinder ‚Üí (176, 64)
View 'node2vec' num_cols: 195
Shapes (train / val / test): (5768, 195) (2884, 195) (2885, 195)

‚è±Ô∏è  [petfinder_node2vec:build_metadata] start
‚úÖ [petfinder_node2vec:build_metadata] done in 0.3s
Skipping CTGAN for view=node2vec (too many columns / too wide).

‚è±Ô∏è  [petfinder_node2vec:fit_TVAE] start




‚úÖ [petfinder_node2vec:fit_TVAE] done in 544.4s

Sampling 5768 rows from petfinder_node2vec_TVAE ...
‚ö†Ô∏è  Skipping CTGAN TSTR/TRTS for this view (no CTGAN model).
‚ö†Ô∏è  Skipping TVAE TSTR/TRTS: synthetic labels not bi-class or sampling failed.
‚ö†Ô∏è  No valid TSTR/TRTS summaries for this dataset/view.

üßæ Saved summary ‚Üí /content/drive/MyDrive/dissertation/outputs/synthetic_eval_4d/petfinder_node2vec/summary_tstr_trts.csv
‚úÖ [run_dataset_view] done in 550.8s

‚è±Ô∏è  [run_dataset_view] start

 Dataset = BREAST | View = numeric_only
Raw shape: (59784, 126)
Using target column: OS5yr_bin
View 'numeric_only' num_cols: 68
Shapes (train / val / test): (29892, 68) (14946, 68) (14946, 68)

‚è±Ô∏è  [breast_numeric_only:build_metadata] start
‚úÖ [breast_numeric_only:build_metadata] done in 0.2s

‚è±Ô∏è  [breast_numeric_only:fit_CTGAN] start


Gen. (0.37) | Discrim. (-0.41): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [12:27<00:00, 24.90s/it]


‚úÖ [breast_numeric_only:fit_CTGAN] done in 990.7s

‚è±Ô∏è  [breast_numeric_only:fit_TVAE] start
‚úÖ [breast_numeric_only:fit_TVAE] done in 501.2s

Sampling 15000 rows from breast_numeric_only_CTGAN ...

Sampling 15000 rows from breast_numeric_only_TVAE ...
   ‚Üí [tstr_ctgan] model=LR
   ‚Üí [tstr_ctgan] model=LinSVM
   ‚Üí [tstr_ctgan] model=RF
   ‚Üí [tstr_ctgan] model=LGBM
[LightGBM] [Info] Number of positive: 7187, number of negative: 7813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3703
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.479133 -> initscore=-0.083515
[LightGBM] [Info] Start training from score -0.083515
   ‚Üí [trts_ctgan] model=LR
   ‚Üí [trts_ctgan] model=LinSVM
  



‚úÖ [breast_w2v:fit_TVAE] done in 4043.3s

Sampling 15000 rows from breast_w2v_TVAE ...
‚ö†Ô∏è  Skipping CTGAN TSTR/TRTS for this view (no CTGAN model).
   ‚Üí [tstr_tvae] model=LR
   ‚Üí [tstr_tvae] model=LinSVM
   ‚Üí [tstr_tvae] model=RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   ‚Üí [tstr_tvae] model=LGBM
[LightGBM] [Info] Number of positive: 2, number of negative: 14998
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088468 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51122
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000133 -> initscore=-8.922525
[LightGBM] [Info] Start training from score -8.922525


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   ‚Üí [trts_tvae] model=LR
   ‚Üí [trts_tvae] model=LinSVM
   ‚Üí [trts_tvae] model=RF
   ‚Üí [trts_tvae] model=LGBM
[LightGBM] [Info] Number of positive: 8985, number of negative: 20907
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.178032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21930
[LightGBM] [Info] Number of data points in the train set: 29892, number of used features: 252
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.300582 -> initscore=-0.844528
[LightGBM] [Info] Start training from score -0.844528

üßæ Saved summary ‚Üí /content/drive/MyDrive/dissertation/outputs/synthetic_eval_4d/breast_w2v/summary_tstr_trts.csv
‚úÖ [run_dataset_view] done in 4159.1s

‚è±Ô∏è  [run_dataset_view] start

 Dataset = BREAST | View = node2vec
Raw shape: (59784, 126)
Using target column: OS5yr_bin
   Loaded node2vec embeddings for breast ‚Üí (1374, 64)
View 'node2vec' num_cols: 260
Shapes (train / v



‚úÖ [breast_node2vec:fit_TVAE] done in 3528.6s

Sampling 15000 rows from breast_node2vec_TVAE ...
‚ö†Ô∏è  Skipping CTGAN TSTR/TRTS for this view (no CTGAN model).
   ‚Üí [tstr_tvae] model=LR
   ‚Üí [tstr_tvae] model=LinSVM
   ‚Üí [tstr_tvae] model=RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   ‚Üí [tstr_tvae] model=LGBM
[LightGBM] [Info] Number of positive: 1, number of negative: 14999
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094647 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51095
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 241
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000067 -> initscore=-9.615739
[LightGBM] [Info] Start training from score -9.615739


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   ‚Üí [trts_tvae] model=LR
   ‚Üí [trts_tvae] model=LinSVM
   ‚Üí [trts_tvae] model=RF
   ‚Üí [trts_tvae] model=LGBM
[LightGBM] [Info] Number of positive: 8985, number of negative: 20907
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.113561 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22097
[LightGBM] [Info] Number of data points in the train set: 29892, number of used features: 252
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.300582 -> initscore=-0.844528
[LightGBM] [Info] Start training from score -0.844528

üßæ Saved summary ‚Üí /content/drive/MyDrive/dissertation/outputs/synthetic_eval_4d/breast_node2vec/summary_tstr_trts.csv
‚úÖ [run_dataset_view] done in 3639.1s

Combined summary saved ‚Üí /content/drive/MyDrive/dissertation/outputs/synthetic_eval_4d/summary_all_datasets_views.csv


  combined = pd.concat(all_summaries, axis=0, ignore_index=True)


Unnamed: 0,Dataset,View,Tag,Model,AUC,F1,ACC,PREC,REC
0,ADULT,numeric_only,tstr_ctgan,LR,0.827569,0.58447,0.762919,0.503337,0.696783
1,ADULT,numeric_only,tstr_ctgan,LinSVM,0.827804,0.585604,0.763328,0.503949,0.698836
2,ADULT,numeric_only,tstr_ctgan,RF,0.802009,0.553022,0.723856,0.45132,0.713895
3,ADULT,numeric_only,tstr_ctgan,LGBM,0.808542,0.558088,0.738023,0.467918,0.691307
4,ADULT,numeric_only,trts_ctgan,LR,0.830725,0.533964,0.684867,0.8699,0.385206



Runlog:
 - adult_numeric_only:build_metadata: 0.0s
 - adult_numeric_only:fit_CTGAN: 282.1s
 - adult_numeric_only:fit_TVAE: 68.9s
 - run_dataset_view: 389.6s
 - adult_w2v:build_metadata: 0.5s
 - adult_w2v:fit_TVAE: 2075.0s
 - run_dataset_view: 2211.0s
 - adult_node2vec:build_metadata: 0.6s
 - adult_node2vec:fit_TVAE: 2210.3s
 - run_dataset_view: 2347.5s
 - petfinder_numeric_only:build_metadata: 0.0s
 - petfinder_numeric_only:fit_CTGAN: 62.6s
 - petfinder_numeric_only:fit_TVAE: 12.5s
 - run_dataset_view: 82.2s
 - petfinder_w2v:build_metadata: 0.4s
 - petfinder_w2v:fit_TVAE: 549.1s
 - run_dataset_view: 555.3s
 - petfinder_node2vec:build_metadata: 0.3s
 - petfinder_node2vec:fit_TVAE: 544.4s
 - run_dataset_view: 550.8s
 - breast_numeric_only:build_metadata: 0.2s
 - breast_numeric_only:fit_CTGAN: 990.7s
 - breast_numeric_only:fit_TVAE: 501.2s
 - run_dataset_view: 1578.7s
 - breast_w2v:build_metadata: 1.2s
 - breast_w2v:fit_TVAE: 4043.3s
 - run_dataset_view: 4159.1s
 - breast_node2vec:build_