<a href="https://colab.research.google.com/github/cheongyeechian/DLI/blob/main/Lim_Li_Vorn_TP073982.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [168]:
!pip -q install -U transformers accelerate scikit-learn matplotlib

import os, re, time, random, gc, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertConfig, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             roc_auc_score, confusion_matrix, ConfusionMatrixDisplay,
                             roc_curve)

In [169]:
# ------------------------------------------------------------
# [0] Colab: mount Google Drive
# ------------------------------------------------------------
# IN_COLAB is True only when google.colab could be imported AND the mount
# call returned without raising; in every other case it is False.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except ImportError:
    # google.colab is not importable -> not running on Colab; nothing to mount.
    IN_COLAB = False
except Exception as mount_err:
    # Still best-effort (no crash), but report why the mount failed instead of
    # silently treating it the same as "not on Colab".
    print(f"[warn] Google Drive mount failed: {mount_err!r}")
    IN_COLAB = False


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [170]:
# ------------------------------------------------------------
# [1] CONFIG — set your paths
#    Put these three files in Drive and update the paths below.
# ------------------------------------------------------------
# --- Input artefacts (paths under the mounted Google Drive) ---
CSV_PATH = "/content/drive/My Drive/DLI Assignment/group_dataset.csv"   # dataset CSV
VOCAB_TXT = "/content/drive/My Drive/DLI Assignment/vocab.txt"          # tokenizer vocabulary file
URLBERT_PT = "/content/drive/My Drive/DLI Assignment/urlBERT.pt"        # encoder weights file

# --- Training & model hyperparameters ---
SEED = 2025                 # default seed for set_seed()
MAX_LEN = 128               # default max token length in UrlDataset
BATCH_SIZE = 64             # batch size of the train/test DataLoaders
# NOTE(review): the constants below are not used in the cells visible here;
# presumably they are consumed by the model/training code further down.
EPOCHS = 3
LR = 2e-5
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
DROPOUT = 0.2
KERNEL_SIZES = (2, 3, 4)
NUM_FILTERS = 128

# --- Compute device: use CUDA when torch reports it, otherwise the CPU ---
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")

Device: cuda


In [171]:
# ------------------------------------------------------------
# [2] Reproducibility
# ------------------------------------------------------------
def set_seed(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (the
    default generator plus all CUDA devices), then puts cuDNN into
    deterministic mode and turns off its benchmark mode.

    Args:
        seed: Integer seed applied to all generators. Defaults to ``SEED``.
    """
    # Python / NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU, then every CUDA device)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Apply the configured seed once, right away.
set_seed(SEED)

In [175]:
# ------------------------------------------------------------
# [4] Tokenizer: use YOUR vocab.txt
# ------------------------------------------------------------
# Validate with an explicit exception rather than `assert`: assertions are
# stripped when Python runs with -O, and os.path.exists() would also accept a
# directory at this path.
if not os.path.isfile(VOCAB_TXT):
    raise FileNotFoundError(f"vocab.txt not found at {VOCAB_TXT}")

# Build the tokenizer from the custom vocabulary file: lower-casing is on,
# while Chinese-character splitting and accent stripping are switched off.
tokenizer = BertTokenizer(
    vocab_file=VOCAB_TXT,
    do_lower_case=True,
    tokenize_chinese_chars=False,
    strip_accents=False,
)
VOCAB_SIZE = len(tokenizer)
print("Vocab size:", VOCAB_SIZE)

class UrlDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Each item is encoded with the module-level ``tokenizer``, padded and
    truncated to ``max_len`` tokens, and returned as a dict of tensors.

    Args:
        txts: Iterable of input texts; copied into a list.
        labels: Optional iterable of labels aligned with ``txts``. When
            omitted, items carry no ``"labels"`` entry.
        max_len: Sequence length used for padding/truncation. Defaults to
            ``MAX_LEN``.
    """

    def __init__(self, txts, labels=None, max_len=MAX_LEN):
        self.txts = list(txts)
        if labels is None:
            self.labels = None
        else:
            self.labels = list(labels)
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.txts)

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx`` as a dict of tensors.

        The dict holds whatever fields the tokenizer returns, each with the
        leading batch dimension removed, plus a ``"labels"`` long tensor when
        labels were supplied.
        """
        encoded = tokenizer(
            self.txts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch dim; squeeze it per field.
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)

        if self.labels is None:
            return sample

        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap the train/test splits. X_train / y_train / X_test / y_test are not
# created in this cell and must already exist when it runs.
train_ds = UrlDataset(X_train, y_train)
test_ds  = UrlDataset(X_test, y_test)

# Pinned (page-locked) host memory only helps host-to-GPU copies, so it is
# requested only when DEVICE is "cuda"; on the CPU fallback it stays off.
# Training batches are shuffled; test batches keep their original order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2,
                          pin_memory=(DEVICE == "cuda"))
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2,
                          pin_memory=(DEVICE == "cuda"))


Vocab size: 5000
