In [None]:
# ----------------------------
# 0) Install dependencies
# ----------------------------
!pip install -q transformers datasets iterative-stratification sentencepiece --upgrade
!pip install -q torch torchvision torchaudio --upgrade
!pip install -q scikit-learn imbalanced-learn seaborn

In [None]:
# ----------------------------
# 1) Imports & config
# ----------------------------
import os, random, math, time, re, warnings, shutil, itertools, string
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from sklearn.metrics import f1_score, classification_report, multilabel_confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sp
import matplotlib.pyplot as plt
import seaborn as sns

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from torch.cuda.amp import autocast as amp_autocast
from torch.cuda.amp import autocast, GradScaler

In [None]:
# ----------------------------
# 2) Configuration & Grid
# ----------------------------
# Fixed settings shared by every run: encoder names, graph-construction knobs
# and the early-stopping patience.
CONFIG = {
    "BERT_MAIN": "bert-base-uncased",                # encoder passed as bert_name_main to the training cell
    "BERT_FOR_NODE_EMB": "distilbert-base-uncased",  # encoder used to embed vocabulary words (graph nodes)
    "VOCAB_SIZE": 5000,                              # max number of graph vocabulary words (feeds RUN["max_vocab"])
    "WINDOW": 20,                                    # co-occurrence window radius, in tokens
    "MIN_COOC": 3,                                   # minimum pair count for an edge to be considered
    "MAX_SEQ_LEN": 200,                              # not referenced in the visible cells; GRID["max_len"] is used instead
    "PATIENCE": 3                                    # epochs without improvement before early stopping
}

# Hyperparameter grid: the training cell takes the Cartesian product of these
# lists (2 * 1 * 2 * 1 * 1 * 2 * 3 = 24 configurations with the values below).
GRID = {
    "lr": [1e-5, 2e-5],
    "dropout": [0.2],
    "max_len": [200, 256],
    "npmi_th": [0.2],
    "gcn_hidden": [128],
    "gcn_layers": [1, 2],
    "graph_scale": [0.0, 0.5, 1.0]
}

# Run-level settings: vocabulary cap, cross-validation folds and training budget.
RUN = {"max_vocab": CONFIG["VOCAB_SIZE"], "num_folds": 5, "epochs": 8, "batch_size": 16}

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SEED = 42


def set_seed(s=SEED):
    """Seed the Python, NumPy and PyTorch random generators with `s`.

    When CUDA is available, every CUDA generator is seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(s)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(s)


set_seed()

# Report the device chosen above so the run log shows whether a GPU is in use.
print("Device:", DEVICE)

In [None]:
# ----------------------------
# 3) Cleaning
# ----------------------------
# Fetch the NLTK corpora needed by the stop-word list and the WordNet lemmatizer.
nltk.download("stopwords")
nltk.download("wordnet")

# Module-level resources read by clean_for_graph().
STOPWORDS = set(stopwords.words("english"))  # set for fast membership tests
LEMM = WordNetLemmatizer()

def clean_for_graph(text):
    """Normalise raw text into the token stream used to build the word graph.

    Steps, in order: lower-case, replace URLs with a space, drop non-ASCII
    characters, replace digit runs with a space, delete ASCII punctuation,
    remove stop words, lemmatize each remaining word, and join the result
    with single spaces.

    Note: punctuation is deleted rather than replaced by a space, so tokens
    separated only by punctuation are fused (e.g. "a,b" becomes "ab").
    """
    lowered = str(text).lower()
    no_urls = re.sub(r"http\S+|www\.\S+", " ", lowered)
    ascii_only = no_urls.encode("ascii", "ignore").decode()
    no_digits = re.sub(r"[0-9]+", " ", ascii_only)
    no_punct = no_digits.translate(str.maketrans("", "", string.punctuation))
    lemmas = [
        LEMM.lemmatize(token)
        for token in no_punct.split()
        if token not in STOPWORDS
    ]
    return re.sub(r"\s+", " ", " ".join(lemmas)).strip()

def clean_for_bert(text):
    """Lightly clean raw text while keeping casing, punctuation and stop words.

    URLs and backslashes are replaced by spaces, non-ASCII characters are
    dropped, and whitespace runs are collapsed to a single space.
    """
    without_urls = re.sub(r"http\S+|www\.\S+", " ", str(text))
    without_backslashes = " ".join(without_urls.split("\\"))
    ascii_only = without_backslashes.encode("ascii", "ignore").decode()
    collapsed = re.sub(r"\s+", " ", ascii_only)
    return collapsed.strip()

In [None]:
# ----------------------------
# 4) Load dataset
# ----------------------------
def load_dataset_hf_to_df():
    """Load the train split of "jingjietan/essays-big5" as a DataFrame.

    Keeps only the `text` column and the five trait columns, and converts
    each trait to a 0/1 integer by casting to float and thresholding at 0.5.

    Raises:
        ValueError: if any of the required columns is absent.
    """
    trait_cols = ['O','C','E','A','N']
    required = ['text'] + trait_cols

    frame = load_dataset("jingjietan/essays-big5")["train"].to_pandas()

    # Report the first missing column, in the order of `required`.
    missing = next((c for c in required if c not in frame.columns), None)
    if missing is not None:
        raise ValueError(f"Missing {missing}")

    frame = frame[required].copy()
    for col in trait_cols:
        frame[col] = (frame[col].astype(float) >= 0.5).astype(int)
    return frame.reset_index(drop=True)

# Load the binarised train split and show its size and first rows.
print("Loading dataset...")
df = load_dataset_hf_to_df()
print("Dataset shape:", df.shape)

print("\nDataset:")
display(df[["text", "O", "C", "E", "A", "N"]].head(5))

# Two cleaned views of each essay: an aggressive one for the word graph and a
# light one intended for the BERT tokenizer.
df["clean_text_graph"] = df["text"].apply(clean_for_graph)
df["clean_text_bert"]  = df["text"].apply(clean_for_bert)

# Show the first 200 characters of each view for three essays.
print("\nSample after cleaning:")
for i in range(3):
    print(f"\n--- Dataset {i} ---")
    print("Original :", df.loc[i, "text"][:200])
    print("Graph    :", df.loc[i, "clean_text_graph"][:200])
    print("BERT     :", df.loc[i, "clean_text_bert"][:200])

In [None]:
# ----------------------------
# 4b) Label-distribution plot helper
# ----------------------------
def plot_label_distribution(df, traits=['O','C','E','A','N']):
    """Draw a grouped bar chart of label-0 and label-1 counts for each trait.

    Counts are looked up under the integer keys 0 and 1, so each trait column
    of `df` is expected to hold integer 0/1 labels; a missing class is drawn
    as a zero-height bar. Every bar is annotated with its height.
    """
    label_counts = {t: df[t].value_counts().sort_index() for t in traits}
    zeros = [label_counts[t].get(0, 0) for t in traits]
    ones  = [label_counts[t].get(1, 0) for t in traits]

    fig, ax = plt.subplots(figsize=(8, 5))
    bar_width = 0.35
    half = bar_width/2
    x = range(len(traits))

    bars_zero = ax.bar([i - half for i in x], zeros, width=bar_width, label='Label 0')
    bars_one  = ax.bar([i + half for i in x], ones,  width=bar_width, label='Label 1')

    # Write the count just above each bar, label-0 bars first.
    for container in (bars_zero, bars_one):
        for bar in container:
            height = bar.get_height()
            ax.annotate(
                f'{height}',
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom', fontsize=9,
            )

    ax.set_xticks(x)
    ax.set_xticklabels(traits)
    ax.set_ylabel("Total Data")
    ax.set_title("Distribution Label for Each Big Five Trait")
    ax.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# ----------------------------
# 4c) Label distribution over all splits (train + validation + test)
# ----------------------------
# Reload the dataset and stack its three splits to plot the overall label balance.
ds = load_dataset("jingjietan/essays-big5")

df_full = pd.concat([
    ds['train'].to_pandas(),
    ds['validation'].to_pandas(),
    ds['test'].to_pandas(),
], ignore_index=True)

# NOTE(review): unlike load_dataset_hf_to_df(), the trait columns are not cast
# or binarised here, while plot_label_distribution() looks counts up under the
# integer keys 0 and 1 - confirm the raw columns already hold integer 0/1.
plot_label_distribution(df_full)

In [None]:
# ----------------------------
# 4d) Data validation
# ----------------------------
# Sanity checks on the training frame: the text column must exist and be
# non-empty, and every trait column must be strictly binary.
if "text" not in df.columns:
    raise ValueError("Column 'text' is missing from the dataframe!")

# Rows whose text is NaN or whitespace-only are dropped (this rebinds `df`).
invalid_text = df["text"].isna() | (df["text"].astype(str).str.strip() == "")
if invalid_text.any():
    print(f"Detected {invalid_text.sum()} rows with empty text. These rows will be removed!")
    df = df[~invalid_text].reset_index(drop=True)

traits = ['O', 'C', 'E', 'A', 'N']

for t in traits:
    if t not in df.columns:
        raise ValueError(f"Label column '{t}' is missing from the dataframe!")
    unique_vals = sorted(df[t].unique().tolist())

    # Any value other than 0 or 1 is treated as a data error.
    if not set(unique_vals).issubset({0, 1}):
        raise ValueError(f"Label '{t}' contains non-binary values: {unique_vals}")

print("Data validation completed: 'text' is valid and labels are binary.")

In [None]:
# ----------------------------
# 5) Simple tokenizer for vocab build
# ----------------------------
word_pattern = re.compile(r"\w+")


def tokenize_simple(text):
    """Return the lower-cased runs of word characters (`\\w+`) found in `text`."""
    return [match.group(0) for match in word_pattern.finditer(text.lower())]

# Preview: graph-cleaned text and its tokens for the first five essays.
print("Samples after tokenization:")
for i in range(5):
    txt = df["clean_text_graph"].iloc[i]
    tokens = tokenize_simple(txt)

    print(f"\n--- Sample {i+1} ---")
    print("Clean Graph Text :", txt)
    print("Tokens           :", tokens)

In [None]:
# ----------------------------
# 6) NPMI adjacency + helpers
# ----------------------------
def build_vocab_and_cooc(texts, max_vocab):
    """Build a frequency-ranked vocabulary from `texts`.

    Despite the name, no co-occurrence statistics are computed here.

    Returns:
        (vocab, counter): `vocab` maps each of the `max_vocab` most frequent
        tokens to its rank (0 = most frequent); `counter` holds the raw token
        counts for every token seen.
    """
    counter = Counter(
        token for doc in texts for token in tokenize_simple(doc)
    )
    vocab = {
        word: rank
        for rank, (word, _) in enumerate(counter.most_common(max_vocab))
    }
    return vocab, counter

def build_cooccurrence_counts(texts, vocab, window=20):
    """Count ordered word pairs that fall within `window` tokens of each other.

    Only tokens present in `vocab` are kept, so the window is measured over
    the filtered token sequence. Because the window extends to both sides of
    each token, every pair is recorded in both orders with the same count.

    Returns:
        (cooc, freq, total_windows): `cooc[(w, u)]` is the ordered pair count,
        `freq[w]` the number of occurrences of `w`, and `total_windows` the
        total number of pairs counted (forced to 1 when nothing was counted,
        so callers can divide by it).
    """
    cooc = defaultdict(int)
    freq = defaultdict(int)
    total_windows = 0

    for doc in texts:
        kept = [tok for tok in tokenize_simple(doc) if tok in vocab]
        length = len(kept)
        for centre, word in enumerate(kept):
            freq[word] += 1
            left = max(0, centre - window)
            right = min(length, centre + window + 1)
            # Neighbours on both sides, excluding the centre position itself.
            neighbours = kept[left:centre] + kept[centre + 1:right]
            for other in neighbours:
                cooc[(word, other)] += 1
            total_windows += len(neighbours)

    return cooc, freq, total_windows or 1

def build_npmi_adjacency(
    texts,
    max_vocab=5000,
    window=20,
    min_cooc=3,
    npmi_th=0.15
):
    """Build a symmetrically normalised word-word adjacency matrix from NPMI.

    Args:
        texts: iterable of (already cleaned) documents.
        max_vocab: number of most frequent words kept as graph nodes.
        window: co-occurrence window radius, in tokens.
        min_cooc: ordered pairs seen fewer times than this are ignored.
        npmi_th: only pairs with NPMI >= this value become edges.

    Returns:
        (A, vocab): `vocab` maps word -> node index. `A` is a SciPy sparse
        matrix equal to D^-1/2 (W + I) D^-1/2, where W holds the NPMI edge
        weights and D is the degree matrix of W + I. When no pair passes the
        filters `A` is the identity; when the vocabulary is empty `A` is None.
    """
    vocab, _ = build_vocab_and_cooc(texts, max_vocab)

    if len(vocab) == 0:
        return None, vocab
    cooc, freq, total_windows = build_cooccurrence_counts(
        texts, vocab, window
    )

    # Edge weights keyed by (row, col). A dict is used on purpose: the
    # co-occurrence table already contains both (w, u) and (u, w), and
    # coo_matrix SUMS duplicate coordinates. Appending both directions for
    # every ordered pair therefore stored 2 * NPMI on each edge. Assigning
    # into a dict keeps the matrix symmetric with each weight counted once.
    weights = {}

    for (w, u), c in cooc.items():
        if c < min_cooc:
            continue
        p_ij = c / total_windows
        p_i = freq[w] / total_windows
        p_j = freq[u] / total_windows
        if p_ij <= 0 or p_i <= 0 or p_j <= 0:
            continue
        pmi = math.log(p_ij / (p_i * p_j) + 1e-12)
        denom = -math.log(p_ij + 1e-12)
        if denom <= 0:
            continue
        npmi = pmi / denom
        if npmi >= npmi_th:
            i = vocab[w]
            j = vocab[u]
            weights[(i, j)] = npmi
            weights[(j, i)] = npmi

    V = len(vocab)

    if not weights:
        return sp.eye(V, format="csr"), vocab

    rows = [ij[0] for ij in weights]
    cols = [ij[1] for ij in weights]
    vals = list(weights.values())

    A = sp.coo_matrix((vals, (rows, cols)), shape=(V, V)).tocsr()
    A = A + sp.eye(V)  # self loops

    deg = np.asarray(A.sum(axis=1)).flatten()
    # Guard against non-positive degrees (possible with a negative npmi_th),
    # which would otherwise turn into inf/NaN under the -0.5 power.
    deg_inv_sqrt = np.zeros_like(deg, dtype=float)
    positive = deg > 0
    deg_inv_sqrt[positive] = np.power(deg[positive], -0.5)

    D = sp.diags(deg_inv_sqrt)
    A = D.dot(A).dot(D)

    return A, vocab

In [None]:
# ----------------------------
# 6b) Print the strongest graph edges
# ----------------------------
def extract_top_npmi_edges(
    texts,
    max_vocab=5000,
    window=20,
    min_cooc=3,
    npmi_th=0.15,
    top_k=20
):
    """Print the `top_k` heaviest edges of the graph built from `texts`.

    The adjacency matrix is rebuilt with build_npmi_adjacency(), so the
    printed weights are the entries of that (normalised) matrix, not raw NPMI
    scores. Only the upper triangle (i < j) is listed, which also skips self
    loops. Nothing is returned.
    """
    A, vocab = build_npmi_adjacency(
        texts,
        max_vocab=max_vocab,
        window=window,
        min_cooc=min_cooc,
        npmi_th=npmi_th
    )

    if A is None or len(vocab) == 0:
        print("No vocab found.")
        return

    index_to_word = {idx: word for word, idx in vocab.items()}
    coo = A.tocoo()

    edges = [
        (index_to_word[r], index_to_word[c], float(weight))
        for r, c, weight in zip(coo.row, coo.col, coo.data)
        if r < c
    ]
    edges.sort(key=lambda edge: edge[2], reverse=True)

    print(f"Vocab size : {len(vocab)}")
    print(f"Total edges: {len(edges)}")
    print("\nTop NPMI edges:")

    for w1, w2, val in edges[:top_k]:
        print(f"('{w1}', '{w2}') → weight = {val:.4f}")

In [None]:
# ----------------------------
# 6c) Build the graph on the full corpus and print example edges
# ----------------------------
# Demonstration on the whole corpus: build the graph once and inspect it.
texts = df["clean_text_graph"].tolist()
A, vocab = build_npmi_adjacency(texts, max_vocab=5000, window=20, min_cooc=3, npmi_th=0.15)

print("Vocab size:", len(vocab))
print("Adjacency shape:", A.shape)

print("\nSamples of word ID:")
print(list(vocab.items())[:10])

# extract_top_npmi_edges() calls build_npmi_adjacency() itself, so the graph is
# built a second time here with the same settings.
print("\nSamples after NPMI:")
extract_top_npmi_edges(
    texts,
    max_vocab=5000,
    window=20,
    min_cooc=3,
    npmi_th=0.15,
    top_k=10
)

In [None]:
# ----------------------------
# 7) Node embeddings from BERT
# ----------------------------
def build_node_embeddings_from_bert(
    vocab_words,
    bert_model_name,
    device=DEVICE,
    batch_size=128,
    shared_model=None,
    shared_tokenizer=None
):
    """Embed each vocabulary word with a transformer encoder.

    Every word is tokenized on its own and represented by the attention-mask
    weighted mean of the encoder's last hidden states (all unmasked positions
    take part in the mean).

    Args:
        vocab_words: sliceable sequence of words, in node-index order.
        bert_model_name: checkpoint to load when no shared model/tokenizer is
            supplied.
        device: device the batches (and a freshly loaded model) are moved to.
        batch_size: number of words encoded per forward pass.
        shared_model / shared_tokenizer: optional pre-loaded objects to reuse;
            a shared model is switched to eval mode and is not moved to
            `device` here.

    Returns:
        float32 array with one row per word; zero rows when `vocab_words` is
        empty.
    """
    tokenizer = shared_tokenizer or AutoTokenizer.from_pretrained(bert_model_name)
    model = shared_model or AutoModel.from_pretrained(bert_model_name).to(device)
    model.eval()

    chunks = []

    with torch.no_grad():
        for start in range(0, len(vocab_words), batch_size):
            encoded = tokenizer(
                vocab_words[start:start + batch_size],
                padding=True,
                return_tensors="pt"
            )
            ids = encoded["input_ids"].to(device)
            attn = encoded["attention_mask"].to(device)

            hidden = model(input_ids=ids, attention_mask=attn).last_hidden_state

            # Masked mean over the sequence dimension: padded positions get weight 0.
            weights = attn.unsqueeze(-1)
            mean_vec = (hidden * weights).sum(dim=1) / weights.sum(dim=1)

            chunks.append(mean_vec.cpu().numpy())

    if not chunks:
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)

    return np.vstack(chunks).astype(np.float32)

In [None]:
# ----------------------------
# 8) Dataset class
# ----------------------------
class PersonalityDataset(Dataset):
    """Torch dataset pairing tokenized essays with labels and graph features.

    Each item is a dict holding the tokenizer outputs (leading batch dimension
    removed), a float32 `graph` vector and a float32 `labels` vector.
    """

    def __init__(self, texts, labels, tokenizer, max_len=200, graph_feats=None):
        """Store the inputs; `labels` must offer `.astype` (e.g. a NumPy array)."""
        self.texts = texts
        self.labels = labels.astype(np.float32)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.graph_feats = graph_feats
        # Width of one graph-feature row, or None when no features were given.
        self.graph_dim = None if graph_feats is None else graph_feats.shape[1]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch dimension of 1.
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)

        if self.graph_feats is None:
            # Placeholder so every item has a 'graph' entry.
            width = self.graph_dim if self.graph_dim else 1
            item['graph'] = torch.zeros(width, dtype=torch.float32)
        else:
            item['graph'] = torch.tensor(self.graph_feats[idx], dtype=torch.float32)

        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)

        return item

In [None]:
# ----------------------------
# 9) GCN + VGCN_BERT model
# ----------------------------
class SimpleGCN(nn.Module):
    """Stack of graph-convolution layers computing h <- ReLU(Linear(A @ h)).

    At least one layer is always created, and ReLU is applied after every
    layer, including the last one.
    """

    def __init__(self, in_dim, hidden_dim, n_layers=1):
        super().__init__()
        # First layer maps in_dim -> hidden_dim; the rest are hidden -> hidden.
        widths = [in_dim] + [hidden_dim] * max(1, n_layers)
        self.layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )
        self.act = nn.ReLU()
        self.out_features = hidden_dim

    def forward(self, X, A):
        """Propagate node features `X` over adjacency `A` (dense or sparse tensor)."""
        # Autocast is disabled and inputs are cast to float32 for this block.
        with amp_autocast(enabled=False):
            h = X.float()
            adj = A.float() if isinstance(A, torch.Tensor) else A
            use_sparse = isinstance(adj, torch.Tensor) and adj.is_sparse
            for layer in self.layers:
                h = torch.sparse.mm(adj, h) if use_sparse else adj @ h
                h = self.act(layer(h))
            return h

class VGCN_BERT_Model(nn.Module):
    """Classifier that concatenates BERT's first-token state with a graph vector.

    The classifier input is `bert_hidden + hidden_dim` wide, so whatever is
    concatenated to the first-token vector must have `hidden_dim` features.
    """

    def __init__(self, bert_model, gcn, hidden_dim, num_classes, dropout=0.3):
        super().__init__()

        self.bert = bert_model
        self.gcn = gcn

        # Projects the pooled GCN output to the width expected by the classifier.
        self.graph_pool = nn.Linear(gcn.out_features, hidden_dim)

        fused_width = bert_model.config.hidden_size + hidden_dim
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(fused_width, num_classes)
        )

    def forward(
        self,
        input_ids,
        attention_mask,
        graph_node_emb=None,
        A=None,
        graph_feat=None,
        graph_scale=1.0
    ):
        """Return raw logits, one row per input sequence.

        The graph part is chosen in this order:
          1. `graph_node_emb` and `A` both given: run the GCN, multiply its
             output by `graph_scale`, average over nodes, project with
             `graph_pool`, and repeat that single vector for every sample
             (`graph_feat` is ignored in this case).
          2. otherwise, `graph_feat` given: use `graph_feat * graph_scale`
             directly; it must have `hidden_dim` columns.
          3. otherwise: a zero vector (text only).

        `graph_scale` of 0.0 zeroes the scaled quantity, 1.0 leaves it as is.
        """
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        cls = encoded.last_hidden_state[:, 0, :]
        batch = cls.size(0)

        if graph_node_emb is not None and A is not None:
            hidden = self.gcn(graph_node_emb.to(cls.device), A.to(cls.device)) * graph_scale
            summary = self.graph_pool(hidden.mean(dim=0))
            graph_part = summary.unsqueeze(0).expand(batch, -1)
        elif graph_feat is not None:
            graph_part = graph_feat.to(cls.device) * graph_scale
        else:
            graph_part = torch.zeros(batch, self.graph_pool.out_features).to(cls.device)

        return self.classifier(torch.cat([cls, graph_part], dim=1))

In [None]:
# ----------------------------
# 10) helpers: sparse convert, graph feat builder, metrics
# ----------------------------
def sparse_scipy_to_torch_sparse(A_csr, device=DEVICE):
    """Convert a SciPy sparse matrix into a float32 PyTorch sparse COO tensor.

    Used to feed the adjacency matrix to `torch.sparse.mm` in
    SimpleGCN.forward().

    Args:
        A_csr: any SciPy sparse matrix offering `.tocoo()` (CSR, COO, ...).
        device: device the resulting tensor is moved to.

    Returns:
        A sparse COO tensor with the same shape and non-zero entries.
    """
    A_coo = A_csr.tocoo()
    # Stack the coordinates into one (2, nnz) int64 array first: building a
    # tensor from a Python list of ndarrays is slow, and the legacy
    # torch.sparse.FloatTensor constructor is deprecated in favour of
    # torch.sparse_coo_tensor.
    indices = torch.from_numpy(np.vstack((A_coo.row, A_coo.col)).astype(np.int64))
    values = torch.from_numpy(A_coo.data.astype(np.float32))
    A_torch = torch.sparse_coo_tensor(indices, values, torch.Size(A_coo.shape))
    return A_torch.to(device)

def compute_sample_graph_feats(bow_matrix, node_embs_for_graph):
    """Build one graph feature vector per document from BoW counts and node embeddings.

    Each document vector is the count-weighted mean of the embeddings of the
    words it contains, scaled to unit L2 norm. Documents with no in-vocabulary
    word get an all-zero vector. The result feeds the optional `graph_feat`
    path of VGCN_BERT_Model.forward().

    Args:
        bow_matrix: SciPy sparse matrix of shape (n_docs, n_words) with counts.
        node_embs_for_graph: array of shape (n_words, dim).

    Returns:
        float32 array of shape (n_docs, dim); (0, dim) when there are no
        documents.
    """
    embs = np.asarray(node_embs_for_graph, dtype=np.float64)
    bow = sp.csr_matrix(bow_matrix, dtype=np.float64)

    # One sparse-dense product replaces the per-document, per-word Python loop.
    weighted_sum = np.asarray(bow @ embs)
    totals = np.asarray(bow.sum(axis=1)).reshape(-1, 1)

    # Count-weighted mean; the epsilon keeps empty rows at exactly zero.
    doc_emb = weighted_sum / (totals + 1e-9)

    # L2-normalise the non-zero rows only.
    norms = np.linalg.norm(doc_emb, axis=1, keepdims=True)
    scale = np.where(norms > 0, norms + 1e-9, 1.0)
    return (doc_emb / scale).astype(np.float32)

def tune_thresholds(y_true, y_probs, low=0.2, high=0.7, step=0.01):
    """Pick, for each label column, the threshold that maximises binary F1.

    Candidates are `np.arange(low, high, step)` (so `high` is excluded); on
    ties the smallest candidate wins. If there are no candidates the threshold
    stays at 0.5. Used to tune decision thresholds instead of relying on the
    default 0.5.

    Returns:
        list with one threshold per column of `y_true`.
    """
    candidates = np.arange(low, high, step)
    chosen = []
    for col in range(y_true.shape[1]):
        top_score, top_t = -1, 0.5
        for t in candidates:
            hard = (y_probs[:, col] >= t).astype(int)
            score = f1_score(y_true[:, col], hard, zero_division=0)
            if score > top_score:
                top_score, top_t = score, t
        chosen.append(top_t)
    return chosen

def compute_pos_weight(y):
    """Compute per-label `pos_weight` values for BCEWithLogitsLoss.

    Each weight is negatives / positives for that label, clipped to
    [1.0, 100.0], so rarer positive classes weigh more in the loss and no
    label is weighted below 1.

    Returns:
        float32 tensor on DEVICE with one entry per column of `y`.
    """
    n_samples = y.shape[0]
    positives = y.sum(axis=0)
    ratio = (n_samples - positives) / (positives + 1e-6)  # epsilon avoids division by zero
    clipped = np.clip(ratio, 1.0, 100.0)
    return torch.tensor(clipped, dtype=torch.float32, device=DEVICE)

def plot_f1_vs_thresholds(y_true, y_probs, trait_names=['O','C','E','A','N'], low=0.1, high=0.9, step=0.02):
    """Plot binary F1 against the decision threshold, one panel per trait.

    Thresholds are `np.arange(low, high, step)`. The curves show how F1 varies
    with the threshold and where it peaks for each trait.
    """
    thresholds = np.arange(low, high, step)
    n_panels = len(trait_names)
    plt.figure(figsize=(5*n_panels,4))
    for panel, trait in enumerate(trait_names, start=1):
        col = panel - 1
        scores = [
            f1_score(y_true[:,col], (y_probs[:,col] >= t).astype(int), zero_division=0)
            for t in thresholds
        ]
        ax = plt.subplot(1, n_panels, panel)
        ax.plot(thresholds, scores, marker='o')
        ax.set_title(trait)
        ax.set_xlabel('Threshold')
        ax.set_ylabel('F1')
        ax.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# ----------------------------
# 11) Grid search + Training (per-fold NPMI)
# ----------------------------
def _predict_probs(model, loader, node_emb, adj, graph_scale):
    """Run `model` over `loader` in eval mode; return (sigmoid probabilities, labels) as stacked arrays."""
    model.eval()
    probs, labels = [], []
    with torch.no_grad():
        for batch in loader:
            out = model(
                input_ids=batch['input_ids'].to(DEVICE),
                attention_mask=batch['attention_mask'].to(DEVICE),
                graph_node_emb=node_emb,
                A=adj,
                graph_feat=batch['graph'].to(DEVICE),
                graph_scale=graph_scale,
            )
            probs.append(torch.sigmoid(out).cpu().numpy())
            labels.append(batch['labels'].numpy())
    return np.vstack(probs), np.vstack(labels)


def run_grid_search_and_train(df, grid, bert_name_main, bert_name_node_emb, max_vocab, window, min_cooc, num_folds, epochs, batch_size, patience):
    """Grid-search VGCN-BERT hyperparameters with multilabel-stratified K-fold CV.

    For every configuration in the Cartesian product of `grid`, and for every
    fold, the word graph is rebuilt from the training documents only, a fresh
    model is trained with early stopping on validation macro-F1, and
    out-of-fold (OOF) probabilities are collected from the best checkpoint.
    Per-configuration OOF arrays and a text summary are written to the working
    directory as `vgcn_cfg_<id>_oof_preds.npy`, `vgcn_cfg_<id>_oof_trues.npy`
    and `vgcn_cfg_<id>_summary.txt`.

    Args:
        df: frame with `clean_text_graph` and the label columns O, C, E, A, N.
            `clean_text_bert` is used as the BERT input when present;
            otherwise the graph text is used.
        grid: dict with the keys lr, dropout, max_len, npmi_th, gcn_hidden,
            gcn_layers and graph_scale, each mapping to a list of values.
        bert_name_main: checkpoint name of the encoder that is fine-tuned.
        bert_name_node_emb: checkpoint name of the encoder used for node embeddings.
        max_vocab, window, min_cooc: graph construction settings.
        num_folds, epochs, batch_size, patience: CV and training settings.

    Returns:
        dict with `results` (one summary per configuration), `best_idx`
        (1-based), `best`, and the file names of the best configuration's OOF
        arrays.

    Notes:
        * Thresholds are tuned on the same validation/OOF data the F1 is then
          reported on, so the reported scores are optimistic.
        * Changes from the earlier version: `graph_scale` is now actually
          passed to the model; the best checkpoint is cloned rather than
          referenced (it used to track the live weights, so the last epoch was
          always restored); a fold with no improving epoch no longer crashes;
          BERT receives the lightly cleaned text.
    """
    X = df['clean_text_graph'].values
    # Text fed to BERT: the lightly cleaned view when available.
    X_bert = df['clean_text_bert'].values if 'clean_text_bert' in df.columns else X
    Y = df[['O','C','E','A','N']].values
    mskf = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED)

    tokenizer_cache = {}
    def get_tokenizer(name):
        if name not in tokenizer_cache:
            tokenizer_cache[name] = AutoTokenizer.from_pretrained(name)
        return tokenizer_cache[name]

    # Encoder used to embed the graph vocabulary (nodes); loaded once and shared.
    node_tokenizer_shared = AutoTokenizer.from_pretrained(bert_name_node_emb)
    node_model_shared = AutoModel.from_pretrained(bert_name_node_emb).to(DEVICE)
    node_model_shared.eval()

    # Global vocabulary, used only as a fallback when build_npmi_adjacency returns no graph.
    vocab_full, _ = build_vocab_and_cooc(X, max_vocab=max_vocab)
    vocab_words_full = sorted(vocab_full.keys(), key=lambda w: vocab_full[w])
    print("Global vocab candidates:", len(vocab_words_full))

    # All hyperparameter combinations.
    keys = list(grid.keys())
    combos = list(itertools.product(*[grid[k] for k in keys]))
    total_configs = len(combos)
    results = []

    # Graph artefacts depend only on (fold, npmi_th), so they are shared across configurations.
    graph_cache = {}

    for cfg_id, combo in enumerate(combos, start=1):
        cfg = dict(zip(keys, combo))
        lr = cfg['lr']; dropout = cfg['dropout']; max_len = cfg['max_len']; npmi_th = cfg['npmi_th']
        gcn_hidden = cfg['gcn_hidden']; gcn_layers = cfg['gcn_layers']; graph_scale = cfg['graph_scale']
        print("\n" + "="*160)
        print(f"Running Config {cfg_id}/{total_configs}: {cfg}")
        print("="*160)
        oof_preds = np.zeros_like(Y, dtype=float)
        oof_trues = np.zeros_like(Y, dtype=int)
        fold_scores = []
        fold_infos = []

        # Cross-validation folds (identical across configurations: fixed random_state).
        for fold_idx, (train_idx, val_idx) in enumerate(mskf.split(X, Y), start=1):
            print("\n" + "-"*160)
            print(f"--- Fold {fold_idx}/{num_folds} (cfg {cfg_id}) ---")
            start_fold = time.time()
            X_train, X_val = X[train_idx], X[val_idx]
            Y_train, Y_val = Y[train_idx], Y[val_idx]

            cache_key = (fold_idx, npmi_th)
            if cache_key in graph_cache:
                A_train, vocab_train_words, node_embs_for_graph, graph_feat_train, graph_feat_val = graph_cache[cache_key]
                print(" Loaded adjacency & node-embeds from cache.")
            else:
                # Graph built from the training documents only.
                A_train, vocab_train = build_npmi_adjacency(X_train, max_vocab=max_vocab, window=window, min_cooc=min_cooc, npmi_th=npmi_th)
                if A_train is None:
                    V = len(vocab_words_full)
                    A_train = sp.eye(V, format='csr')
                    vocab_train_words = vocab_words_full
                else:
                    vocab_train_words = sorted(vocab_train.keys(), key=lambda w: vocab_train[w])
                # Node embeddings for the graph vocabulary.
                node_embs_for_graph = build_node_embeddings_from_bert(vocab_train_words, bert_name_node_emb, device=DEVICE, batch_size=128, shared_model=node_model_shared, shared_tokenizer=node_tokenizer_shared)
                if node_embs_for_graph.shape[0] == 0:
                    node_embs_for_graph = np.random.normal(size=(len(vocab_train_words), node_model_shared.config.hidden_size)).astype(np.float32)

                # Per-document graph features (BoW-weighted node embeddings).
                vect = CountVectorizer(vocabulary=vocab_train_words)
                X_train_bow = vect.fit_transform(X_train)
                X_val_bow = vect.transform(X_val)
                graph_feat_train = compute_sample_graph_feats(X_train_bow, node_embs_for_graph)
                graph_feat_val = compute_sample_graph_feats(X_val_bow, node_embs_for_graph)

                graph_cache[cache_key] = (A_train, vocab_train_words, node_embs_for_graph, graph_feat_train, graph_feat_val)
                print(" Cache saved for (fold, npmi_th) =", cache_key)

            node_embs_for_graph_t = torch.tensor(node_embs_for_graph, dtype=torch.float32, device=DEVICE)
            A_train_torch = sparse_scipy_to_torch_sparse(A_train, device=DEVICE)

            # Main tokenizer, encoder and GCN for this fold.
            tokenizer_main = get_tokenizer(bert_name_main)
            bert_main_model = AutoModel.from_pretrained(bert_name_main).to(DEVICE)
            gcn_instance = SimpleGCN(in_dim=node_embs_for_graph.shape[1], hidden_dim=gcn_hidden, n_layers=gcn_layers)

            # Datasets and loaders. Graph features are passed unscaled:
            # graph_scale is applied once, inside the model.
            train_ds = PersonalityDataset(X_bert[train_idx], Y_train, tokenizer_main, max_len=max_len, graph_feats=graph_feat_train)
            val_ds   = PersonalityDataset(X_bert[val_idx], Y_val, tokenizer_main, max_len=max_len, graph_feats=graph_feat_val)
            train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=True)
            val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, pin_memory=True)

            # VGCN-BERT model and training components.
            model = VGCN_BERT_Model(
                bert_model=bert_main_model,
                gcn=gcn_instance,
                hidden_dim=gcn_hidden,
                num_classes=5,
                dropout=dropout
            ).to(DEVICE)
            pos_weight = compute_pos_weight(Y_train)
            criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)
            scaler = GradScaler()
            total_steps = max(1, len(train_loader) * epochs)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

            best_val = 0.0; no_imp = 0; best_state = None

            # Training loop with early stopping on validation macro-F1.
            for epoch in range(epochs):
                t0 = time.time()
                model.train()
                train_losses = []
                for batch in train_loader:
                    optimizer.zero_grad()
                    labels = batch.pop('labels').to(DEVICE)
                    graph_feats_batch = batch.pop('graph').to(DEVICE)
                    input_ids = batch['input_ids'].to(DEVICE); attention_mask = batch['attention_mask'].to(DEVICE)
                    with autocast():
                        logits = model(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            graph_node_emb=node_embs_for_graph_t,
                            A=A_train_torch,
                            graph_feat=graph_feats_batch,
                            graph_scale=graph_scale,
                        )
                        loss = criterion(logits, labels)
                    scaler.scale(loss).backward()
                    # Unscale before clipping so the norm is measured on true gradients.
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    scaler.step(optimizer); scaler.update()
                    scheduler.step()
                    train_losses.append(loss.item())

                # Validation with per-epoch threshold tuning.
                preds, trues = _predict_probs(model, val_loader, node_embs_for_graph_t, A_train_torch, graph_scale)
                thrs = tune_thresholds(trues, preds, low=0.2, high=0.7, step=0.01)
                pred_labels = (preds >= thrs).astype(int)
                val_macro = f1_score(trues, pred_labels, average='macro', zero_division=0)
                val_weighted = f1_score(trues, pred_labels, average='weighted', zero_division=0)
                epoch_time = time.time() - t0
                improved_marker = ""

                if val_macro > best_val + 1e-5:
                    best_val = val_macro
                    # state_dict() returns references to the live parameters, so the
                    # tensors must be cloned or the snapshot follows later updates.
                    best_state = {
                        'state_dict': {k: v.detach().cpu().clone() for k, v in model.state_dict().items()},
                        'thresholds': thrs,
                    }
                    no_imp = 0
                    improved_marker = "improved \u2713"
                else:
                    no_imp += 1

                print(f"Epoch {epoch+1} | train_loss {np.mean(train_losses):.4f} | val_macro {val_macro:.4f} | val_weighted {val_weighted:.4f} | time {epoch_time:.1f}s {improved_marker}")

                if no_imp >= patience:
                    print("Early stopping (patience=",patience,") at epoch", epoch+1)
                    break

            fold_time = time.time() - start_fold
            print(f"Best val (fold {fold_idx}) macro F1: {best_val:.4f} | time {fold_time:.1f}s")

            # Restore the best checkpoint; if no epoch improved, keep the final weights.
            if best_state is not None:
                model.load_state_dict(best_state['state_dict'])
            preds, _ = _predict_probs(model, val_loader, node_embs_for_graph_t, A_train_torch, graph_scale)

            oof_preds[val_idx] = preds
            oof_trues[val_idx] = Y_val

            fold_scores.append(best_val)
            fold_infos.append({"fold": fold_idx, "vocab_size": len(vocab_train_words), "best_val": best_val})

            # Drop every reference to the fold's model before emptying the CUDA cache.
            del model, bert_main_model, gcn_instance, optimizer, scheduler
            torch.cuda.empty_cache()

        # Aggregate OOF evaluation for this configuration.
        oof_preds_clipped = np.clip(oof_preds, 1e-6, 1-1e-6)
        best_thrs_cfg = tune_thresholds(oof_trues, oof_preds_clipped, low=0.2, high=0.7, step=0.01)
        oof_pred_labels_cfg = (oof_preds_clipped >= best_thrs_cfg).astype(int)
        macro_f1_cfg = f1_score(oof_trues, oof_pred_labels_cfg, average='macro', zero_division=0)
        weighted_f1_cfg = f1_score(oof_trues, oof_pred_labels_cfg, average='weighted', zero_division=0)
        print("\nCONFIG RESULT:", cfg)
        print("Mean fold val macro:", np.mean(fold_scores), "std:", np.std(fold_scores))
        print("Aggregated OOF Macro F1:", macro_f1_cfg, "Weighted F1:", weighted_f1_cfg)

        # Persist OOF arrays and a summary for this configuration.
        cfg_out = {
            "config": cfg,
            "fold_scores": fold_scores,
            "fold_infos": fold_infos,
            "mean_fold": float(np.mean(fold_scores)),
            "std_fold": float(np.std(fold_scores)),
            "oof_macro": float(macro_f1_cfg),
            "oof_weighted": float(weighted_f1_cfg),
            "thresholds": [float(t) for t in best_thrs_cfg]
        }
        np.save(f"vgcn_cfg_{cfg_id}_oof_preds.npy", oof_preds_clipped)
        np.save(f"vgcn_cfg_{cfg_id}_oof_trues.npy", oof_trues)
        results.append(cfg_out)
        with open(f"vgcn_cfg_{cfg_id}_summary.txt","w") as f:
            f.write(str(cfg_out))
        print("Saved OOF & summary for config", cfg_id)

    # Choose the configuration with the highest aggregated OOF macro-F1.
    best = max(results, key=lambda r: r['oof_macro'])
    best_idx = results.index(best) + 1
    print("\n" + "="*80)
    print("BEST CONFIG index:", best_idx, "config:", best['config'])
    print("Best OOF Macro F1:", best['oof_macro'], "Weighted:", best['oof_weighted'])
    print("="*80)

    return {"results": results, "best_idx": best_idx, "best": best, "best_oof_preds_file": f"vgcn_cfg_{best_idx}_oof_preds.npy", "best_oof_trues_file": f"vgcn_cfg_{best_idx}_oof_trues.npy"}

In [None]:
# ----------------------------
# 12) Run grid search + training
# ----------------------------
# Launch the full grid search with the settings from CONFIG / GRID / RUN.
# `out` holds the per-configuration summaries and the index of the best one.
out = run_grid_search_and_train(df,
                               grid=GRID,
                               bert_name_main=CONFIG["BERT_MAIN"],
                               bert_name_node_emb=CONFIG["BERT_FOR_NODE_EMB"],
                               max_vocab=RUN["max_vocab"],
                               window=CONFIG["WINDOW"],
                               min_cooc=CONFIG["MIN_COOC"],
                               num_folds=RUN["num_folds"],
                               epochs=RUN["epochs"],
                               batch_size=RUN["batch_size"],
                               patience=CONFIG["PATIENCE"])

In [None]:
# ----------------------------
# 13) Load best OOF, final error analysis & confusion plots
# ----------------------------
# Reload the out-of-fold arrays saved for the best configuration and turn the
# probabilities into hard labels with that configuration's tuned thresholds
# (the comparison broadcasts one threshold per trait column).
best_idx = out['best_idx']
oof_preds = np.load(f"vgcn_cfg_{best_idx}_oof_preds.npy")
oof_trues = np.load(f"vgcn_cfg_{best_idx}_oof_trues.npy")
best_thresholds = out['best']['thresholds']
oof_pred_labels = (oof_preds >= best_thresholds).astype(int)

print("\nFinal best config:", out['best']['config'])
print("Final thresholds:", best_thresholds)

In [None]:
# ----------------------------
# 14) Evaluation metrics per trait (best config)
# ----------------------------
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Ground truth and thresholded predictions of the best config (out-of-fold).
y_true = oof_trues
y_pred = oof_pred_labels
trait_names = ['O','C','E','A','N']

metrics = []   # one [trait, precision, recall, f1, accuracy] row per trait
accs = []      # per-trait accuracy, reused for the averaged rows below
supports = []  # per-trait number of positive ground-truth samples

for i, t in enumerate(trait_names):
    y_t = y_true[:, i]
    y_p = y_pred[:, i]

    prec = precision_score(y_t, y_p, zero_division=0)
    rec  = recall_score(y_t, y_p, zero_division=0)
    f1   = f1_score(y_t, y_p, zero_division=0)
    acc  = accuracy_score(y_t, y_p)

    metrics.append([t, prec, rec, f1, acc])
    accs.append(acc)
    supports.append(y_t.sum())

metrics_table = pd.DataFrame(
    metrics,
    columns=['Trait', 'Precision', 'Recall', 'F1', 'Accuracy']
)

macro_accuracy = np.mean(accs)

# Weight each trait's accuracy by its positive support, the same weighting
# sklearn applies for average='weighted' in the precision/recall/F1 columns.
# Weighting by the number of rows (identical for every trait) would simply
# reproduce the macro average.
if np.sum(supports) > 0:
    weighted_accuracy = np.average(accs, weights=supports)
else:
    # No positive labels at all: weights would sum to zero, fall back to macro.
    weighted_accuracy = macro_accuracy

macro_avg = [
    "Macro Average",
    precision_score(y_true, y_pred, average='macro', zero_division=0),
    recall_score(y_true, y_pred, average='macro', zero_division=0),
    f1_score(y_true, y_pred, average='macro', zero_division=0),
    macro_accuracy
]
weighted_avg = [
    "Weighted Average",
    precision_score(y_true, y_pred, average='weighted', zero_division=0),
    recall_score(y_true, y_pred, average='weighted', zero_division=0),
    f1_score(y_true, y_pred, average='weighted', zero_division=0),
    weighted_accuracy
]

# Append the two summary rows underneath the per-trait rows.
metrics_table = pd.concat(
    [metrics_table, pd.DataFrame([macro_avg, weighted_avg], columns=metrics_table.columns)],
    ignore_index=True
)

print("Evaluation Metrics per Trait (Best Config):")
display(metrics_table)

In [None]:
# ----------------------------
# 15) Plot F1 vs Threshold (per trait)
# ----------------------------
plot_f1_vs_thresholds(oof_trues, oof_preds)

# Compare the default 0.5 cut-off with the tuned threshold, one trait per line.
print("\nPer-trait F1 before (0.5) and after (tuned):")
for col, trait in enumerate(['O','C','E','A','N']):
    truth_col = oof_trues[:, col]
    score_col = oof_preds[:, col]

    labels_default = (score_col >= 0.5).astype(int)
    labels_tuned = (score_col >= best_thresholds[col]).astype(int)

    f1_default = f1_score(truth_col, labels_default, zero_division=0)
    f1_tuned = f1_score(truth_col, labels_tuned, zero_division=0)
    print(f"{trait}: before={f1_default:.4f} → after={f1_tuned:.4f}")

In [None]:
# ----------------------------
# 16) Error Analysis & Confusion Matrix (all traits)
# ----------------------------
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, multilabel_confusion_matrix

def error_analysis_all_traits(y_true, y_pred_labels, thresholds=None, trait_names=('O','C','E','A','N')):
    """Print precision, recall, F1, accuracy and confusion counts for each trait.

    Label 1 is the positive class for every trait. All reported numbers are
    derived from the same four counts (TP, TN, FP, FN), so the printed
    confusion matrix always agrees with the printed precision/recall/F1.

    Args:
        y_true: 2-D NumPy array indexed as [sample, trait] holding 0/1 ground truth.
        y_pred_labels: 2-D NumPy array with the same layout holding 0/1 predictions.
        thresholds: optional sequence; ``thresholds[i]`` is printed for trait ``i``
            when it exists.
        trait_names: one display name per evaluated column.

    Returns:
        None. The report is written to stdout.
    """
    def _ratio(numerator, denominator):
        # Mirrors zero_division=0: an undefined ratio is reported as 0.0.
        return numerator / denominator if denominator > 0 else 0.0

    print("Error analysis per trait:\n")

    for i, trait in enumerate(trait_names):
        y_t = y_true[:, i].astype(int)
        y_p = y_pred_labels[:, i].astype(int)

        # Count outcomes explicitly for positive label 1. Indexing [0] into
        # multilabel_confusion_matrix on 1-D input selects the matrix of
        # class 0 instead, which swaps TP<->TN and FP<->FN.
        tp = int(np.sum((y_t == 1) & (y_p == 1)))
        tn = int(np.sum((y_t == 0) & (y_p == 0)))
        fp = int(np.sum((y_t == 0) & (y_p == 1)))
        fn = int(np.sum((y_t == 1) & (y_p == 0)))

        prec = _ratio(tp, tp + fp)
        rec = _ratio(tp, tp + fn)
        f1 = _ratio(2 * tp, 2 * tp + fp + fn)

        total = tn + fp + fn + tp
        acc = (tp + tn) / total if total > 0 else 0.0
        support_pos = int(y_t.sum())
        thr = thresholds[i] if thresholds is not None and len(thresholds) > i else None

        print(f"Trait {trait}:")
        if thr is not None:
            print(f" Threshold={thr:.3f}")
        print(f" Precision={prec:.3f}, Recall={rec:.3f}, F1={f1:.3f}, Accuracy={acc:.3f}, Support(pos)={support_pos}")

        print(f" Confusion matrix (tp, fn, fp, tn) = {(int(tp), int(fn), int(fp), int(tn))}")
        print(f"  TP={int(tp)}, TN={int(tn)}, FP={int(fp)}, FN={int(fn)}\n")

# Error analysis
error_analysis_all_traits(
    oof_trues,
    oof_pred_labels,
    thresholds=best_thresholds,
    trait_names=('O','C','E','A','N'),
)

# Combined confusion-matrix plot for all traits
cms = multilabel_confusion_matrix(oof_trues, oof_pred_labels)
trait_names = ['O','C','E','A','N']
n = len(trait_names)

fig, axes = plt.subplots(1, n, figsize=(4*n, 4))
if n == 1:
    # plt.subplots returns a bare Axes for a single panel; make it indexable.
    axes = [axes]

class_ticks = [0, 1]
class_tick_labels = ["Neg(0)", "Pos(1)"]

for i, trait in enumerate(trait_names):
    ax = axes[i]
    cm = cms[i]  # [[TN, FP],[FN, TP]]

    ax.imshow(cm, interpolation='nearest')
    ax.set(title=trait, xlabel="Predicted", ylabel="Actual")
    ax.set_xticks(class_ticks)
    ax.set_yticks(class_ticks)
    ax.set_xticklabels(class_tick_labels)
    ax.set_yticklabels(class_tick_labels)

    # Write each count into its cell (x = predicted column, y = actual row).
    for r in range(cm.shape[0]):
        for c in range(cm.shape[1]):
            ax.text(c, r, int(cm[r, c]), ha="center", va="center")

plt.tight_layout()
plt.show()

In [None]:
# ----------------------------
# 17) Threshold Range Evaluation
# ----------------------------
from sklearn.metrics import accuracy_score, precision_score, recall_score

def evaluate_thresholds_all_metrics(oof_trues, oof_preds, low, high, step):
    """Tune thresholds with the given search settings and score the resulting labels.

    Threshold selection is delegated to ``tune_thresholds`` (defined elsewhere
    in this notebook) using ``low``/``high``/``step``. The scores in
    ``oof_preds`` are then binarised with those thresholds and every label
    column is evaluated independently.

    Returns:
        dict with the thresholds, per-label accuracy/precision/recall/F1 lists,
        their unweighted means (``avg_*``), the per-label confusion matrices and
        the binarised ``pred_labels``.
    """
    threshs = tune_thresholds(oof_trues, oof_preds, low=low, high=high, step=step)
    pred_labels = (oof_preds >= threshs).astype(int)

    label_columns = range(oof_trues.shape[1])

    def _score_each_label(scorer, **scorer_kwargs):
        # Apply one sklearn scorer column by column.
        return [
            scorer(oof_trues[:, col], pred_labels[:, col], **scorer_kwargs)
            for col in label_columns
        ]

    accs = _score_each_label(accuracy_score)
    precisions = _score_each_label(precision_score, zero_division=0)
    recalls = _score_each_label(recall_score, zero_division=0)
    f1s = _score_each_label(f1_score, zero_division=0)

    cms = multilabel_confusion_matrix(oof_trues, pred_labels)

    return {
        "thresholds": threshs,
        "accuracy_per_label": accs,
        "precision_per_label": precisions,
        "recall_per_label": recalls,
        "f1_per_label": f1s,
        "avg_acc": np.mean(accs),
        "avg_prec": np.mean(precisions),
        "avg_rec": np.mean(recalls),
        "avg_f1": np.mean(f1s),
        "confusion_matrices": cms,
        "pred_labels": pred_labels
    }

# Threshold ranges: (low, high, step) search settings passed to
# evaluate_thresholds_all_metrics, one evaluation per tuple.
threshold_ranges = [
    (0.2, 0.7, 0.01),
    (0.3, 0.8, 0.01),
    (0.4, 0.9, 0.01),
    (0.5, 1.0, 0.01),
]

trait_names = ['O','C','E','A','N']

# Track the range with the highest mean per-trait F1.
best_avg_f1_overall = -1
best_row = None


for low, high, step in threshold_ranges:
    print(f"\n=== Threshold Range {low}-{high} (step {step}) ===")

    metrics = evaluate_thresholds_all_metrics(oof_trues, oof_preds, low, high, step)

    rows = []
    supports = []

    for i, t in enumerate(trait_names):
        # Positive support of this trait, used for the weighted-average row.
        supports.append(oof_trues[:, i].sum())

        rows.append([
            t,
            metrics["precision_per_label"][i],
            metrics["recall_per_label"][i],
            metrics["f1_per_label"][i],
            metrics["accuracy_per_label"][i]
        ])

    # NOTE: deliberately not named `df`. `df` is the dataset passed to
    # run_grid_search_and_train, and rebinding it here would break a re-run
    # of the training cell.
    range_table = pd.DataFrame(rows, columns=["Trait", "Precision", "Recall", "F1", "Accuracy"])

    macro_row = [
        "Macro Average",
        range_table["Precision"].mean(),
        range_table["Recall"].mean(),
        range_table["F1"].mean(),
        range_table["Accuracy"].mean()
    ]

    total_support = np.sum(supports)
    if total_support > 0:
        weights = np.array(supports) / total_support
    else:
        # No positive labels at all: avoid 0/0 and fall back to equal weights.
        weights = None
    weighted_row = [
        "Weighted Average",
        np.average(range_table["Precision"], weights=weights),
        np.average(range_table["Recall"], weights=weights),
        np.average(range_table["F1"], weights=weights),
        np.average(range_table["Accuracy"], weights=weights)
    ]

    range_table = pd.concat(
        [range_table, pd.DataFrame([macro_row, weighted_row], columns=range_table.columns)],
        ignore_index=True
    )

    print("Thresholds per trait:", dict(zip(trait_names, metrics["thresholds"])))
    display(range_table)

    # Best range check
    if metrics["avg_f1"] > best_avg_f1_overall:
        best_avg_f1_overall = metrics["avg_f1"]
        best_row = {
            "range": f"{low:.2f}-{high:.2f}@{step}",
            "thresholds": dict(zip(trait_names, metrics["thresholds"])),
            "avg_acc": metrics["avg_acc"],
            "avg_prec": metrics["avg_prec"],
            "avg_rec": metrics["avg_rec"],
            "avg_f1": metrics["avg_f1"]
        }

    # Confusion matrix
    fig, axes = plt.subplots(1, len(trait_names), figsize=(4 * len(trait_names), 4))
    for i, ax in enumerate(axes):
        sns.heatmap(metrics["confusion_matrices"][i], annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title(f"{trait_names[i]} | thr={metrics['thresholds'][i]:.2f}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
    plt.tight_layout()
    plt.show()

print("\n--- Best Threshold Range Based on Avg F1 ---")
print(f"Range        : {best_row['range']}")
print(f"Avg F1       : {best_row['avg_f1']:.4f}")
print(f"Avg Accuracy : {best_row['avg_acc']:.4f}")
print(f"Avg Precision: {best_row['avg_prec']:.4f}")
print(f"Avg Recall   : {best_row['avg_rec']:.4f}")

print("Best Thresholds per trait:")
for t, v in best_row["thresholds"].items():
    print(f"  {t}: {v:.2f}")