# Importing lib 

In [4]:
import re
import ast
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models import KeyedVectors
from transformers import BertTokenizer, BertModel

# Config and model

In [5]:
# ======================
# CONFIG
# ======================

# TODO: change to the actual file path in your Kaggle dataset
CSV_PATH = r"/kaggle/input/arxiv-paper-abstracts/arxiv_data_210930-054931.csv"

EMBEDDING_INFO = {
    "word2vec": {
        "path": r"/kaggle/input/googlenewsvectors/GoogleNews-vectors-negative300.bin",
        "binary": True
    },
    "glove": {
        "path": r"/kaggle/input/glove6b300dtxt/glove.6B.300d.txt",
        "binary": False,
        "is_glove": True,
    },
    "fasttext": {
        "path": r"/kaggle/input/fasttext-wikinews/wiki-news-300d-1M.vec",
        "binary": False
    }
}

BERT_MODEL_NAME = "bert-base-uncased"
BATCH_SIZE_BERT = 16      # abstracts are long, keep this moderate
MAX_LEN_BERT = 256
BATCH_SIZE_TRAIN = 64
EPOCHS = 8
LR = 1e-3
SEED = 42

# device

In [6]:
# ======================
# DEVICE
# ======================

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)

torch.manual_seed(SEED)
np.random.seed(SEED)

Using device: cuda


# load dataset

In [7]:
# ======================
# LOAD DATA
# ======================

df = pd.read_csv(CSV_PATH)
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)

# Adjust these names if your CSV is slightly different
TITLE_COL = "titles"
ABSTRACT_COL = "abstracts"
TERMS_COL = "terms"   # e.g. "['cs.LG', 'cs.CR', 'stat.ML']"

# Combine title + abstract
df["text_full"] = df[TITLE_COL].fillna("").astype(str) + " " + df[ABSTRACT_COL].fillna("").astype(str)

# Drop rows with missing text or labels
df = df.dropna(subset=["text_full", TERMS_COL]).copy()

Columns: ['terms', 'titles', 'abstracts']
Shape: (56181, 3)


# label preprocessing

In [8]:
# =====================================
# LABEL PROCESSING (MULTI-LABEL)
# =====================================

# Convert string -> Python list using ast.literal_eval
df["parsed_terms"] = df[TERMS_COL].apply(lambda x: ast.literal_eval(str(x)))

# MultiLabelBinarizer: list of labels -> multi-hot vector
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df["parsed_terms"]).astype("float32")

LABELS = list(mlb.classes_)
num_labels = len(LABELS)

texts = df["text_full"].astype(str).tolist()

print("Number of unique labels:", num_labels)
print("First 20 labels:", LABELS[:20])
print("Labels shape:", labels.shape)
print("Example text:", texts[0][:200], "...")
print("Example label vector sum (how many labels):", labels[0].sum())

Number of unique labels: 1177
First 20 labels: ['00', '00-02', '00B25', '00Bxx', '03B52, 94A08', '03B70, 03-04, 03D10, 11Y16', '05B45, 62H30, 54E05, 68T10', '05C20, 14T10, 62G32, 62H22, 05C99, 62R01, 65S05', '05C21, 68T07, 76D07, 76M10, 76S05', '05C50', '05C50, 05C70, 65M55', '05C50, 68T07', '05C60', '05C62, 41A46, 41A63, 62J02', '05C85, 05C80', '05C99, 62M45', '06A15, 06B99, 68T05, 91A80', '10010147.10010257.10010258.10010259.10010263', '11Z05', '14J60']
Labels shape: (56181, 1177)
Example text: Multi-Level Attention Pooling for Graph Neural Networks: Unifying Graph Representations with Multiple Localities Graph neural networks (GNNs) have been widely used to learn vector
representation of gr ...
Example label vector sum (how many labels): 1.0


# train and test split 

In [9]:
# ======================
# TRAIN / VAL / TEST SPLIT
# ======================

indices = np.arange(len(texts))
idx_train, idx_temp, y_train, y_temp = train_test_split(
    indices, labels, test_size=0.25, random_state=SEED
)
idx_val, idx_test, y_val, y_test = train_test_split(
    idx_temp, y_temp, test_size=0.5, random_state=SEED
)

texts = np.array(texts)
X_train_text = texts[idx_train]
X_val_text = texts[idx_val]
X_test_text = texts[idx_test]

print(f"Train: {len(X_train_text)}, Val: {len(X_val_text)}, Test: {len(X_test_text)}")

Train: 42135, Val: 7023, Test: 7023


# text cleaning and tokenization

In [10]:
# ======================
# TEXT CLEANING / TOKENIZATION
# ======================

import html
import emoji

CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "wasn't": "was not",
    "i'm": "i am",
    "you're": "you are",
    "we're": "we are",
    "they're": "they are",
    "it's": "it is",
    "that's": "that is",
    "there's": "there is",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'll": "i will",
    "you'll": "you will",
    "we'll": "we will",
    "they'll": "they will",
    "i'd": "i would",
    "you'd": "you would",
    "we'd": "we would",
    "they'd": "they would",
}

STOPWORDS = {
    "the", "a", "an", "of", "and", "to", "in", "on", "for", "with", "at", "by",
    "this", "that", "these", "those",
    "is", "am", "are", "was", "were", "be", "been",
    "it", "its", "as", "or", "so",
    "do", "does", "did",
    "you", "i", "we", "they", "he", "she", "him", "her", "them", "our", "your",
    "from", "about", "into", "out", "up", "down",
    "just", "very", "too"
}

def expand_contractions(text: str) -> str:
    for contr, full in CONTRACTIONS.items():
        text = re.sub(r"\b" + re.escape(contr) + r"\b", full, text)
    return text

def clean_text(text: str) -> str:
    text = str(text)
    text = html.unescape(text)
    text = text.lower()
    text = expand_contractions(text)
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#(\w+)", r"\1", text)
    text = emoji.replace_emoji(text, replace=" ")
    text = re.sub(r"[^a-z0-9\s']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
def simple_tokenize(text: str):
    text = clean_text(text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in STOPWORDS]
    return tokens


# static embedding

In [11]:
# ======================
# STATIC EMBEDDINGS
# ======================

def load_keyed_vectors(info):
    path = info["path"]
    binary = info.get("binary", False)
    is_glove = info.get("is_glove", False)

    if is_glove:
        print(f"Converting GloVe from {path} to word2vec format...")
        converted_path = "/kaggle/working/glove_converted.txt"
        from gensim.scripts.glove2word2vec import glove2word2vec
        glove2word2vec(path, converted_path)
        path = converted_path
        binary = False

    print(f"Loading embeddings from {path} (binary={binary}) ...")
    kv = KeyedVectors.load_word2vec_format(path, binary=binary)
    print("Embedding dim:", kv.vector_size)
    return kv
def compute_doc_embeddings(text_list, kv):
    dim = kv.vector_size
    embs = []
    for text in tqdm(text_list, desc="Computing doc embeddings"):
        toks = simple_tokenize(text)
        vecs = [kv[w] for w in toks if w in kv]
        if len(vecs) == 0:
            embs.append(np.zeros(dim, dtype="float32"))
        else:
            embs.append(np.mean(vecs, axis=0).astype("float32"))
    return np.vstack(embs)


# bert embedding

In [12]:
# ======================
# BERT EMBEDDINGS (GPU)
# ======================

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME).to(device)
bert_model.eval()
for p in bert_model.parameters():
    p.requires_grad = False

@torch.no_grad()
def compute_bert_embeddings(text_list):
    embs = []
    for i in tqdm(range(0, len(text_list), BATCH_SIZE_BERT), desc="BERT embeddings"):
        raw_batch = text_list[i : i + BATCH_SIZE_BERT]
        batch_texts = [clean_text(t) for t in raw_batch]
        enc = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=MAX_LEN_BERT,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        out = bert_model(**enc)
        cls_emb = out.last_hidden_state[:, 0, :]  # [CLS]
        embs.append(cls_emb.cpu().numpy())
    return np.vstack(embs).astype("float32")
# ======================
# MODEL & TRAINING (MULTI-LABEL)
# ======================

class MLPClassifier(nn.Module):
    def __init__(self, input_dim, num_labels):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_labels)
        )

    def forward(self, x):
        return self.net(x)

def train_and_eval(X_train, y_train, X_val, y_val, X_test, y_test, name):
    X_train_t = torch.tensor(X_train, dtype=torch.float32)
    y_train_t = torch.tensor(y_train, dtype=torch.float32)
    X_val_t = torch.tensor(X_val, dtype=torch.float32)
    y_val_t = torch.tensor(y_val, dtype=torch.float32)
    X_test_t = torch.tensor(X_test, dtype=torch.float32)
    y_test_t = torch.tensor(y_test, dtype=torch.float32)

    train_ds = TensorDataset(X_train_t, y_train_t)
    val_ds = TensorDataset(X_val_t, y_val_t)
    test_ds = TensorDataset(X_test_t, y_test_t)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE_TRAIN, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE_TRAIN, shuffle=False)

    input_dim = X_train.shape[1]
    num_labels = y_train.shape[1]
    model = MLPClassifier(input_dim, num_labels).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    best_val_micro = 0.0
    best_state = None

    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * xb.size(0)

        avg_loss = total_loss / len(train_ds)
    # Validation
        model.eval()
        all_y = []
        all_pred = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device)
                yb = yb.to(device)
                logits = model(xb)
                probs = torch.sigmoid(logits)
                preds = (probs >= 0.5).float()
                all_y.append(yb.cpu().numpy())
                all_pred.append(preds.cpu().numpy())
        y_true = np.vstack(all_y)
        y_pred = np.vstack(all_pred)

        micro_f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
        macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
        h_loss = hamming_loss(y_true, y_pred)

        if micro_f1 > best_val_micro:
            best_val_micro = micro_f1
            best_state = model.state_dict()

        print(f"[{name}] Epoch {epoch}/{EPOCHS} | "
              f"Train loss {avg_loss:.4f} | "
              f"Val micro-F1 {micro_f1:.4f} | "
              f"Val macro-F1 {macro_f1:.4f} | "
              f"Val Hamming {h_loss:.4f}")
    # Load best model and evaluate on test
    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    all_y = []
    all_pred = []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            yb = yb.to(device)
            logits = model(xb)
            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()
            all_y.append(yb.cpu().numpy())
            all_pred.append(preds.cpu().numpy())
    y_true = np.vstack(all_y)
    y_pred = np.vstack(all_pred)

    micro_f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    h_loss = hamming_loss(y_true, y_pred)

    print(f"\n==== TEST RESULTS: {name} ====")
    print(f"Micro-F1 : {micro_f1:.4f}")
    print(f"Macro-F1 : {macro_f1:.4f}")
    print(f"Hamming  : {h_loss:.4f}\n")

    return micro_f1, macro_f1, h_loss
    

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

# run models

In [13]:
# ======================
# RUN 4 MODELS
# ======================

results = {}

# model - Word2Vec

In [15]:
# 1) Word2Vec
kv_w2v = load_keyed_vectors(EMBEDDING_INFO["word2vec"])
X_train_w2v = compute_doc_embeddings(X_train_text, kv_w2v)
X_val_w2v = compute_doc_embeddings(X_val_text, kv_w2v)
X_test_w2v = compute_doc_embeddings(X_test_text, kv_w2v)
results["Word2Vec"] = train_and_eval(X_train_w2v, y_train, X_val_w2v, y_val, X_test_w2v, y_test, "Word2Vec")

Loading embeddings from /kaggle/input/googlenewsvectors/GoogleNews-vectors-negative300.bin (binary=True) ...
Embedding dim: 300


Computing doc embeddings:   0%|          | 0/42135 [00:00<?, ?it/s]

Computing doc embeddings:   0%|          | 0/7023 [00:00<?, ?it/s]

Computing doc embeddings:   0%|          | 0/7023 [00:00<?, ?it/s]

[Word2Vec] Epoch 1/8 | Train loss 0.0271 | Val micro-F1 0.5871 | Val macro-F1 0.0013 | Val Hamming 0.0013
[Word2Vec] Epoch 2/8 | Train loss 0.0039 | Val micro-F1 0.6686 | Val macro-F1 0.0020 | Val Hamming 0.0010
[Word2Vec] Epoch 3/8 | Train loss 0.0037 | Val micro-F1 0.6760 | Val macro-F1 0.0020 | Val Hamming 0.0010
[Word2Vec] Epoch 4/8 | Train loss 0.0036 | Val micro-F1 0.6819 | Val macro-F1 0.0020 | Val Hamming 0.0009
[Word2Vec] Epoch 5/8 | Train loss 0.0036 | Val micro-F1 0.6798 | Val macro-F1 0.0020 | Val Hamming 0.0009
[Word2Vec] Epoch 6/8 | Train loss 0.0035 | Val micro-F1 0.6826 | Val macro-F1 0.0020 | Val Hamming 0.0009
[Word2Vec] Epoch 7/8 | Train loss 0.0034 | Val micro-F1 0.6826 | Val macro-F1 0.0020 | Val Hamming 0.0009
[Word2Vec] Epoch 8/8 | Train loss 0.0034 | Val micro-F1 0.6840 | Val macro-F1 0.0021 | Val Hamming 0.0009

==== TEST RESULTS: Word2Vec ====
Micro-F1 : 0.6809
Macro-F1 : 0.0020
Hamming  : 0.0009



# model - GloVe

In [16]:
# 2) GloVe
kv_glove = load_keyed_vectors(EMBEDDING_INFO["glove"])
X_train_glove = compute_doc_embeddings(X_train_text, kv_glove)
X_val_glove = compute_doc_embeddings(X_val_text, kv_glove)
X_test_glove = compute_doc_embeddings(X_test_text, kv_glove)
results["GloVe"] = train_and_eval(X_train_glove, y_train, X_val_glove, y_val, X_test_glove, y_test, "GloVe")

Converting GloVe from /kaggle/input/glove6b300dtxt/glove.6B.300d.txt to word2vec format...


  glove2word2vec(path, converted_path)


Loading embeddings from /kaggle/working/glove_converted.txt (binary=False) ...
Embedding dim: 300


Computing doc embeddings:   0%|          | 0/42135 [00:00<?, ?it/s]

Computing doc embeddings:   0%|          | 0/7023 [00:00<?, ?it/s]

Computing doc embeddings:   0%|          | 0/7023 [00:00<?, ?it/s]

[GloVe] Epoch 1/8 | Train loss 0.0210 | Val micro-F1 0.6646 | Val macro-F1 0.0019 | Val Hamming 0.0010
[GloVe] Epoch 2/8 | Train loss 0.0036 | Val micro-F1 0.6797 | Val macro-F1 0.0020 | Val Hamming 0.0009
[GloVe] Epoch 3/8 | Train loss 0.0035 | Val micro-F1 0.6746 | Val macro-F1 0.0020 | Val Hamming 0.0009
[GloVe] Epoch 4/8 | Train loss 0.0034 | Val micro-F1 0.6839 | Val macro-F1 0.0021 | Val Hamming 0.0009
[GloVe] Epoch 5/8 | Train loss 0.0033 | Val micro-F1 0.6805 | Val macro-F1 0.0021 | Val Hamming 0.0009
[GloVe] Epoch 6/8 | Train loss 0.0033 | Val micro-F1 0.6868 | Val macro-F1 0.0021 | Val Hamming 0.0009
[GloVe] Epoch 7/8 | Train loss 0.0032 | Val micro-F1 0.6881 | Val macro-F1 0.0022 | Val Hamming 0.0009
[GloVe] Epoch 8/8 | Train loss 0.0032 | Val micro-F1 0.6838 | Val macro-F1 0.0022 | Val Hamming 0.0009

==== TEST RESULTS: GloVe ====
Micro-F1 : 0.6850
Macro-F1 : 0.0022
Hamming  : 0.0009



# model - FastText

In [17]:
# 3) FastText
kv_ft = load_keyed_vectors(EMBEDDING_INFO["fasttext"])
X_train_ft = compute_doc_embeddings(X_train_text, kv_ft)
X_val_ft = compute_doc_embeddings(X_val_text, kv_ft)
X_test_ft = compute_doc_embeddings(X_test_text, kv_ft)
results["FastText"] = train_and_eval(X_train_ft, y_train, X_val_ft, y_val, X_test_ft, y_test, "FastText")

Loading embeddings from /kaggle/input/fasttext-wikinews/wiki-news-300d-1M.vec (binary=False) ...
Embedding dim: 300


Computing doc embeddings:   0%|          | 0/42135 [00:00<?, ?it/s]

Computing doc embeddings:   0%|          | 0/7023 [00:00<?, ?it/s]

Computing doc embeddings:   0%|          | 0/7023 [00:00<?, ?it/s]

[FastText] Epoch 1/8 | Train loss 0.0300 | Val micro-F1 0.6289 | Val macro-F1 0.0015 | Val Hamming 0.0010
[FastText] Epoch 2/8 | Train loss 0.0038 | Val micro-F1 0.6730 | Val macro-F1 0.0019 | Val Hamming 0.0009
[FastText] Epoch 3/8 | Train loss 0.0036 | Val micro-F1 0.6807 | Val macro-F1 0.0020 | Val Hamming 0.0009
[FastText] Epoch 4/8 | Train loss 0.0036 | Val micro-F1 0.6887 | Val macro-F1 0.0020 | Val Hamming 0.0009
[FastText] Epoch 5/8 | Train loss 0.0036 | Val micro-F1 0.6866 | Val macro-F1 0.0020 | Val Hamming 0.0009
[FastText] Epoch 6/8 | Train loss 0.0035 | Val micro-F1 0.6897 | Val macro-F1 0.0020 | Val Hamming 0.0009
[FastText] Epoch 7/8 | Train loss 0.0035 | Val micro-F1 0.6770 | Val macro-F1 0.0020 | Val Hamming 0.0009
[FastText] Epoch 8/8 | Train loss 0.0034 | Val micro-F1 0.6837 | Val macro-F1 0.0020 | Val Hamming 0.0009

==== TEST RESULTS: FastText ====
Micro-F1 : 0.6847
Macro-F1 : 0.0020
Hamming  : 0.0009



# model - Bert

In [18]:
# 4) BERT
X_train_bert = compute_bert_embeddings(list(X_train_text))
X_val_bert = compute_bert_embeddings(list(X_val_text))
X_test_bert = compute_bert_embeddings(list(X_test_text))
results["BERT"] = train_and_eval(X_train_bert, y_train, X_val_bert, y_val, X_test_bert, y_test, "BERT")

BERT embeddings:   0%|          | 0/2634 [00:00<?, ?it/s]

BERT embeddings:   0%|          | 0/439 [00:00<?, ?it/s]

BERT embeddings:   0%|          | 0/439 [00:00<?, ?it/s]

[BERT] Epoch 1/8 | Train loss 0.0113 | Val micro-F1 0.6552 | Val macro-F1 0.0019 | Val Hamming 0.0010
[BERT] Epoch 2/8 | Train loss 0.0037 | Val micro-F1 0.6695 | Val macro-F1 0.0019 | Val Hamming 0.0009
[BERT] Epoch 3/8 | Train loss 0.0035 | Val micro-F1 0.6840 | Val macro-F1 0.0020 | Val Hamming 0.0009
[BERT] Epoch 4/8 | Train loss 0.0034 | Val micro-F1 0.6734 | Val macro-F1 0.0020 | Val Hamming 0.0009
[BERT] Epoch 5/8 | Train loss 0.0033 | Val micro-F1 0.6842 | Val macro-F1 0.0021 | Val Hamming 0.0009
[BERT] Epoch 6/8 | Train loss 0.0032 | Val micro-F1 0.6862 | Val macro-F1 0.0022 | Val Hamming 0.0009
[BERT] Epoch 7/8 | Train loss 0.0031 | Val micro-F1 0.6929 | Val macro-F1 0.0025 | Val Hamming 0.0009
[BERT] Epoch 8/8 | Train loss 0.0031 | Val micro-F1 0.6934 | Val macro-F1 0.0028 | Val Hamming 0.0009

==== TEST RESULTS: BERT ====
Micro-F1 : 0.6939
Macro-F1 : 0.0028
Hamming  : 0.0009



In [20]:
print("==== SUMMARY (Micro-F1, Macro-F1, Hamming) ====")
for name, (mi, ma, h) in results.items():
    print(f"{name:8s} -> Micro-F1 {mi:.4f} | Macro-F1 {ma:.4f} | Hamming {h:.4f}")



==== SUMMARY (Micro-F1, Macro-F1, Hamming) ====
Word2Vec -> Micro-F1 0.6809 | Macro-F1 0.0020 | Hamming 0.0009
GloVe    -> Micro-F1 0.6850 | Macro-F1 0.0022 | Hamming 0.0009
FastText -> Micro-F1 0.6847 | Macro-F1 0.0020 | Hamming 0.0009
BERT     -> Micro-F1 0.6939 | Macro-F1 0.0028 | Hamming 0.0009
