In [None]:
import pandas as pd
import numpy as np
import torch
import re
import tqdm
import xgboost as xgb

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import matthews_corrcoef,accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizer

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Load the Model from Hugging Face

In [None]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the pretrained 'roberta-large' tokenizer and encoder; the encoder is
# moved onto `device` and echoed as the cell's output.
tokenizer = AutoTokenizer.from_pretrained('roberta-large')
model = RobertaModel.from_pretrained('roberta-large').to(device)
model

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  

# Read Dataset

In [None]:
# Single place to repoint the notebook at another copy of the data files.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Gammafest 2025'

# df       : combined paper table (paper_id, title, content, type, authors,
#            concepts are the columns read from it further down)
# train_df : labelled (paper, referenced_paper, is_referenced) rows
# test_df  : loaded here; not used by the cells visible in this notebook section
df = pd.read_csv(f'{DATA_DIR}/combined_with_nan.csv')
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so any column assigned through `dt_train` is also visible through `df`.
dt_train = df

# Embedding Process

In [None]:
# Abstract Extraction from TXT file
# The word "abstract" (any letter case) must NOT be followed by a lowercase
# letter, so words such as "abstraction" or "abstracts" in running text are not
# mistaken for the section heading, while a fused heading such as
# "AbstractThis paper ..." is still recognised. The captured body stops at the
# first numbered heading ("\n1."), "keywords" line, "introduction" line, or
# the end of the text.
_ABSTRACT_SECTION_RE = re.compile(
    r'(?i:abstract)(?![a-z])\s*[:\-]?\s*(.+?)(?i:\n\d+\.|\n\s*keywords|\n\s*introduction|\Z)',
    re.DOTALL,
)


def extract_abstract(text):
    """Return the abstract section of a paper's raw text.

    Args:
        text: Full text of the paper. Any non-string value (e.g. NaN for a
            missing file) is treated as "no text".

    Returns:
        The stripped text following an "abstract" heading when one is found
        and is non-empty; otherwise the first line longer than 50 characters
        (after stripping); otherwise the empty string.
    """
    if not isinstance(text, str):
        return ""

    match = _ABSTRACT_SECTION_RE.search(text)
    if match:
        body = match.group(1).strip()
        # A heading followed only by whitespace carries no abstract; fall
        # through to the paragraph heuristic instead of returning "".
        if body:
            return body

    paragraphs = [p.strip() for p in text.split('\n') if len(p.strip()) > 50]
    return paragraphs[0] if paragraphs else ""

# Run the extractor over every row's raw 'content' text and store the result
# in a new 'abstract' column.
dt_train['abstract'] = dt_train['content'].map(extract_abstract)

In [None]:
# Get Embedding Data from Title and Abstract
def get_roberta_embedding(title, abstract):
    """Embed a paper's title and abstract with the loaded RoBERTa encoder.

    The two strings are joined by a single space, tokenized with truncation at
    512 tokens, and passed through `model` without gradient tracking. The
    last-layer hidden state at token position 0 is used as the embedding.

    Returns:
        NumPy array on the CPU with size-1 dimensions squeezed out.
    """
    encoded = tokenizer(
        f"{title} {abstract}",
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Move every input tensor to the device the model lives on.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        first_token_state = model(**encoded).last_hidden_state[:, 0, :]
    return first_token_state.squeeze().cpu().numpy()

In [None]:
# paper_id -> embedding of "<title> <abstract>"
roberta_embeddings = {}

for _, record in tqdm(df.iterrows(), total=len(df)):
    # Missing titles/abstracts are embedded as empty strings.
    title_text = "" if pd.isna(record['title']) else str(record['title'])
    abstract_text = "" if pd.isna(record['abstract']) else str(record['abstract'])
    roberta_embeddings[record['paper_id']] = get_roberta_embedding(title_text, abstract_text)

100%|██████████| 4354/4354 [09:26<00:00,  7.68it/s]


In [None]:
# One row per dictionary key, with the key exposed as a 'paper_id' column and
# the vector components named roberta_0, roberta_1, ...
embedding_df = (
    pd.DataFrame.from_dict(roberta_embeddings, orient='index')
    .rename(columns=lambda dim: f'roberta_{dim}')
    .rename_axis('paper_id')
    .reset_index()
)

print(embedding_df.head())

  paper_id  roberta_0  roberta_1  roberta_2  roberta_3  roberta_4  roberta_5  \
0    p0000  -0.198911  -0.239042  -0.332162   0.520644   0.835766   0.100233   
1    p0001  -0.264632  -0.278502  -0.227407   0.496248   0.905239   0.096506   
2    p0002  -0.389902  -0.048036   0.022978   0.434674   0.761605   0.194084   
3    p0003  -0.315486  -0.201297  -0.262078   0.744971   1.030764   0.111761   
4    p0004  -0.295814  -0.076650  -0.086424   0.495549   0.747309   0.152950   

   roberta_6  roberta_7  roberta_8  ...  roberta_1014  roberta_1015  \
0  -0.074599  -0.399116   0.173804  ...      0.318039     -0.216533   
1   0.022995  -0.416488   0.180287  ...      0.391272     -0.223277   
2   0.125249  -0.415155   0.204972  ...      0.474594      0.100791   
3   0.088340  -0.425793   0.175049  ...      0.549783     -0.286534   
4   0.082874  -0.363475   0.139220  ...      0.284624     -0.167147   

   roberta_1016  roberta_1017  roberta_1018  roberta_1019  roberta_1020  \
0      0.009761  

In [None]:
# Get Embedding Type, Author, Concept
def get_roberta_embedding_custom(type_, authors, concept):
    """Embed a paper's metadata rendered as one templated sentence.

    The three values are formatted as
    "Type: <type_>. Authors: <authors>. Concept: <concept>." and tokenized with
    truncation at 512 tokens. The encoder runs without gradient tracking and
    the last-layer hidden state at token position 0 is used as the embedding.

    Returns:
        NumPy array on the CPU with size-1 dimensions squeezed out.
    """
    prompt = f"Type: {type_}. Authors: {authors}. Concept: {concept}."
    batch = tokenizer(prompt, max_length=512, truncation=True, padding=True, return_tensors="pt")
    batch = {key: value.to(device) for key, value in batch.items()}
    with torch.no_grad():
        hidden_states = model(**batch).last_hidden_state
    first_token_state = hidden_states[:, 0, :]
    return first_token_state.squeeze().cpu().numpy()

In [None]:
# paper_id -> embedding of the templated type/authors/concepts sentence
roberta_embedding_custom = {}

for _, paper in tqdm(df.iterrows(), total=len(df)):
    # Missing metadata fields are embedded as empty strings.
    fields = [
        "" if pd.isna(paper[name]) else str(paper[name])
        for name in ('type', 'authors', 'concepts')
    ]
    roberta_embedding_custom[paper['paper_id']] = get_roberta_embedding_custom(*fields)

100%|██████████| 4354/4354 [01:22<00:00, 52.55it/s]


In [None]:
# One row per dictionary key, with the key exposed as a 'paper_id' column and
# the vector components named roberta_meta_0, roberta_meta_1, ...
metadata_embedding = (
    pd.DataFrame.from_dict(roberta_embedding_custom, orient='index')
    .rename(columns=lambda dim: f'roberta_meta_{dim}')
    .rename_axis('paper_id')
    .reset_index()
)

print(metadata_embedding.head())

  paper_id  roberta_meta_0  roberta_meta_1  roberta_meta_2  roberta_meta_3  \
0    p0000       -0.123791       -0.062504        0.174604        0.317131   
1    p0001       -0.281361       -0.161858       -0.034952        0.289069   
2    p0002       -0.182121       -0.108555        0.044391        0.271855   
3    p0003       -0.152071       -0.044412        0.120586        0.270049   
4    p0004       -0.191615       -0.070320        0.149044        0.296296   

   roberta_meta_4  roberta_meta_5  roberta_meta_6  roberta_meta_7  \
0        0.266121       -0.054181       -0.025784       -0.232852   
1        0.444555       -0.145143       -0.092889       -0.176969   
2        0.296554       -0.030231       -0.064106       -0.124288   
3        0.234698       -0.055356       -0.065282       -0.155767   
4        0.236460       -0.056183       -0.013509       -0.167272   

   roberta_meta_8  ...  roberta_meta_1014  roberta_meta_1015  \
0        0.187552  ...           0.282766          -

In [None]:
# Metadata embedding values without the 'paper_id' identifier column.
mtdt_embedding = metadata_embedding.drop('paper_id', axis='columns')
mtdt_embedding

Unnamed: 0,roberta_meta_0,roberta_meta_1,roberta_meta_2,roberta_meta_3,roberta_meta_4,roberta_meta_5,roberta_meta_6,roberta_meta_7,roberta_meta_8,roberta_meta_9,...,roberta_meta_1014,roberta_meta_1015,roberta_meta_1016,roberta_meta_1017,roberta_meta_1018,roberta_meta_1019,roberta_meta_1020,roberta_meta_1021,roberta_meta_1022,roberta_meta_1023
0,-0.123791,-0.062504,0.174604,0.317131,0.266121,-0.054181,-0.025784,-0.232852,0.187552,0.083050,...,0.282766,-0.120605,-0.011692,0.162504,-0.232093,-0.550375,0.316255,0.311641,-0.035459,0.208121
1,-0.281361,-0.161858,-0.034952,0.289069,0.444555,-0.145143,-0.092889,-0.176969,0.311420,0.075188,...,0.369540,-0.143938,-0.012730,0.034646,-0.205579,-0.902193,0.250876,0.235347,-0.196356,0.232218
2,-0.182121,-0.108555,0.044391,0.271855,0.296554,-0.030231,-0.064106,-0.124288,0.231718,0.159166,...,0.268616,-0.090507,0.012862,0.133698,-0.251225,-0.535957,0.225831,0.167864,-0.058249,0.175415
3,-0.152071,-0.044412,0.120586,0.270049,0.234698,-0.055356,-0.065282,-0.155767,0.199209,0.002252,...,0.281487,-0.126851,0.054430,0.157074,-0.245361,-0.487764,0.262894,0.242901,-0.061621,0.188948
4,-0.191615,-0.070320,0.149044,0.296296,0.236460,-0.056183,-0.013509,-0.167272,0.223204,0.030754,...,0.278855,-0.057932,0.036742,0.149284,-0.238188,-0.536759,0.352157,0.219300,-0.081147,0.234436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4349,-0.182792,-0.108176,0.136905,0.289689,0.265995,-0.092831,-0.063594,-0.130943,0.261936,0.010762,...,0.276138,-0.106549,-0.044210,0.133733,-0.213321,-0.556138,0.234066,0.191057,-0.096436,0.237257
4350,-0.213528,-0.063091,0.126680,0.313834,0.318762,-0.092210,0.027071,-0.190223,0.281947,0.007061,...,0.323063,-0.081877,0.075340,0.089309,-0.307869,-0.653533,0.376783,0.253062,-0.124920,0.220098
4351,-0.158843,-0.069204,0.117266,0.227560,0.239914,-0.042831,-0.067930,-0.156569,0.268037,0.081789,...,0.317908,-0.060024,0.038407,0.145882,-0.235879,-0.539327,0.271752,0.205113,-0.051973,0.221113
4352,-0.187793,-0.061267,0.125135,0.234504,0.229067,-0.058890,-0.055534,-0.140506,0.215683,0.071193,...,0.252184,-0.042942,0.012321,0.157790,-0.212187,-0.462574,0.236328,0.186402,-0.061936,0.275273


# Concat Embedding Data

In [None]:
# Join the text and metadata embeddings on their shared key rather than by row
# position, so the result stays correct even if either frame is ever
# reordered or filtered. `validate` raises if a paper_id is duplicated on
# either side; the assert catches ids present in only one of the frames.
roberta_embedding_df = embedding_df.merge(
    metadata_embedding, on='paper_id', how='inner', validate='one_to_one'
)
assert len(roberta_embedding_df) == len(embedding_df) == len(metadata_embedding), \
    "text and metadata embeddings do not cover the same paper_id values"
roberta_embedding_df

Unnamed: 0,paper_id,roberta_0,roberta_1,roberta_2,roberta_3,roberta_4,roberta_5,roberta_6,roberta_7,roberta_8,...,roberta_meta_1014,roberta_meta_1015,roberta_meta_1016,roberta_meta_1017,roberta_meta_1018,roberta_meta_1019,roberta_meta_1020,roberta_meta_1021,roberta_meta_1022,roberta_meta_1023
0,p0000,-0.198911,-0.239042,-0.332162,0.520644,0.835766,0.100233,-0.074599,-0.399116,0.173804,...,0.282766,-0.120605,-0.011692,0.162504,-0.232093,-0.550375,0.316255,0.311641,-0.035459,0.208121
1,p0001,-0.264632,-0.278502,-0.227407,0.496248,0.905239,0.096506,0.022995,-0.416488,0.180287,...,0.369540,-0.143938,-0.012730,0.034646,-0.205579,-0.902193,0.250876,0.235347,-0.196356,0.232218
2,p0002,-0.389902,-0.048036,0.022978,0.434674,0.761605,0.194084,0.125249,-0.415155,0.204972,...,0.268616,-0.090507,0.012862,0.133698,-0.251225,-0.535957,0.225831,0.167864,-0.058249,0.175415
3,p0003,-0.315486,-0.201297,-0.262078,0.744971,1.030764,0.111761,0.088340,-0.425793,0.175049,...,0.281487,-0.126851,0.054430,0.157074,-0.245361,-0.487764,0.262894,0.242901,-0.061621,0.188948
4,p0004,-0.295814,-0.076650,-0.086424,0.495549,0.747309,0.152950,0.082874,-0.363475,0.139220,...,0.278855,-0.057932,0.036742,0.149284,-0.238188,-0.536759,0.352157,0.219300,-0.081147,0.234436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4349,p4349,-0.297915,-0.128169,-0.093964,0.400182,0.838429,0.130937,0.008233,-0.419577,0.244656,...,0.276138,-0.106549,-0.044210,0.133733,-0.213321,-0.556138,0.234066,0.191057,-0.096436,0.237257
4350,p4350,-0.271245,-0.225761,0.001943,0.350183,0.767254,0.238450,0.138374,-0.410727,0.087037,...,0.323063,-0.081877,0.075340,0.089309,-0.307869,-0.653533,0.376783,0.253062,-0.124920,0.220098
4351,p4351,-0.238299,-0.102118,-0.058286,0.379276,0.731270,0.041905,0.173798,-0.304061,0.355552,...,0.317908,-0.060024,0.038407,0.145882,-0.235879,-0.539327,0.271752,0.205113,-0.051973,0.221113
4352,p4352,-0.290375,-0.233935,-0.222981,0.551081,0.843405,0.203125,0.035137,-0.245871,0.335742,...,0.252184,-0.042942,0.012321,0.157790,-0.212187,-0.462574,0.236328,0.186402,-0.061936,0.275273


# Undersample the Training Data, then Merge with the RoBERTa Embeddings

In [None]:
# Reduce class imbalance: keep at most N_MAJORITY_SAMPLES rows with
# is_referenced == 0 and every row with is_referenced == 1.
N_MAJORITY_SAMPLES = 6500
SAMPLING_SEED = 2025

df_majority = train_df[train_df['is_referenced'] == 0]
df_minority = train_df[train_df['is_referenced'] == 1]

# Clamp the request: sample() without replacement raises when n exceeds the
# number of available rows.
df_majority_sampled = df_majority.sample(
    n=min(N_MAJORITY_SAMPLES, len(df_majority)),
    random_state=SAMPLING_SEED,
)

# Stack both classes, shuffle the rows (frac=1), and renumber the index.
df_balanced = (
    pd.concat([df_majority_sampled, df_minority], axis=0)
    .sample(frac=1, random_state=SAMPLING_SEED)
    .reset_index(drop=True)
)

In [None]:
# Disabled draft: the code below is wrapped in a bare string literal, so it is
# never executed. The next cell performs the same batched merge and
# additionally derives the pairwise features.
"""batch_size = 5000
num_batches = (len(df_balanced) + batch_size - 1) // batch_size

# Siapkan list hasil batch
merged_batches = []

# Siapkan embedding versi untuk paper dan referenced_paper
embedding_cols = [col for col in roberta_embedding_df.columns if col != 'paper_id']

# Embedding untuk paper
embedding_paper = roberta_embedding_df.rename(columns={'paper_id': 'paper'})

# Embedding untuk referenced_paper (rename kolom embedding biar gak tabrakan)
embedding_ref = roberta_embedding_df.rename(columns={'paper_id': 'referenced_paper'})
embedding_ref = embedding_ref.rename(columns={col: f"{col}_ref" for col in embedding_cols})

# Loop per batch
for i in range(num_batches):
    start = i * batch_size
    end = min((i + 1) * batch_size, len(df_balanced))

    batch = df_balanced.iloc[start:end].copy()

    # Merge ke embedding paper
    batch = batch.merge(embedding_paper, on='paper', how='left')

    # Merge ke embedding referenced_paper
    batch = batch.merge(embedding_ref, on='referenced_paper', how='left')

    merged_batches.append(batch)
    print(f"Batch {i+1}/{num_batches} selesai digabung.")

# Gabungkan semua batch jadi satu
train_df_with_embed = pd.concat(merged_batches, ignore_index=True)"""

Batch 1/3 selesai digabung.
Batch 2/3 selesai digabung.
Batch 3/3 selesai digabung.


In [None]:
# Attach the embeddings of both papers of every (paper, referenced_paper) row
# and derive element-wise pair features, processing the rows in batches.
batch_size = 5000
num_batches = (len(df_balanced) + batch_size - 1) // batch_size  # ceiling division

merged_batches = []

embedding_cols = [col for col in roberta_embedding_df.columns if col != 'paper_id']
ref_cols = [f"{col}_ref" for col in embedding_cols]
abs_diff_cols = [f"{col}_abs_diff" for col in embedding_cols]
mul_cols = [f"{col}_mul" for col in embedding_cols]
# Interleave as <col>_abs_diff, <col>_mul per embedding column, i.e. the same
# column order that inserting the features one at a time would produce.
pair_feature_order = [name for pair in zip(abs_diff_cols, mul_cols) for name in pair]

# Two views of the embedding table: one keyed by 'paper', one keyed by
# 'referenced_paper' with every embedding column suffixed by '_ref'.
embedding_paper = roberta_embedding_df.rename(columns={'paper_id': 'paper'})
embedding_ref = roberta_embedding_df.rename(columns={'paper_id': 'referenced_paper'})
embedding_ref = embedding_ref.rename(columns=dict(zip(embedding_cols, ref_cols)))

for i in range(num_batches):
    start = i * batch_size
    end = min((i + 1) * batch_size, len(df_balanced))

    batch = df_balanced.iloc[start:end].copy()

    # Merge embeddings (left joins keep every pair, in its original order)
    batch = batch.merge(embedding_paper, on='paper', how='left')
    batch = batch.merge(embedding_ref, on='referenced_paper', how='left')

    # Build the new features with whole-array arithmetic and a single concat.
    # Inserting thousands of columns one by one fragments the DataFrame, which
    # is slow and triggers pandas' PerformanceWarning.
    paper_values = batch[embedding_cols].to_numpy()
    ref_values = batch[ref_cols].to_numpy()
    abs_diff = pd.DataFrame(
        np.abs(paper_values - ref_values), columns=abs_diff_cols, index=batch.index
    )
    product = pd.DataFrame(
        paper_values * ref_values, columns=mul_cols, index=batch.index
    )
    pair_features = pd.concat([abs_diff, product], axis=1)[pair_feature_order]
    batch = pd.concat([batch, pair_features], axis=1)

    merged_batches.append(batch)
    print(f"✅ Batch {i+1}/{num_batches} selesai digabung & dihitung.")

# Combine all batches
train_df_with_embed = pd.concat(merged_batches, ignore_index=True)

✅ Batch 1/3 selesai digabung & dihitung.
✅ Batch 2/3 selesai digabung & dihitung.
✅ Batch 3/3 selesai digabung & dihitung.


# Modeling

In [None]:
# Target is the 'is_referenced' label; features are every remaining column
# except the two paper identifiers.
y = train_df_with_embed['is_referenced']
X = train_df_with_embed.drop(['paper', 'referenced_paper', 'is_referenced'], axis='columns')

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2025
)

tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((8633, 8192), (2159, 8192), (8633,), (2159,))

In [None]:
# Decision Tree
# Fit on the training split (fit() returns the estimator itself), then
# predict on the held-out split and print the per-class report.
dc_model = DecisionTreeClassifier(random_state=2025).fit(X_train, y_train)
y_pred_dc = dc_model.predict(X_test)

print("📊 Decision Tree", classification_report(y_test, y_pred_dc), sep="\n")

📊 Decision Tree
              precision    recall  f1-score   support

           0       0.67      0.67      0.67      1300
           1       0.50      0.51      0.50       859

    accuracy                           0.60      2159
   macro avg       0.59      0.59      0.59      2159
weighted avg       0.60      0.60      0.60      2159



In [None]:
# Random Forest
# 100 trees, all available cores (n_jobs=-1), fixed seed. Fit on the training
# split, then predict on the held-out split and print the per-class report.
rf_model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=2025,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("🌲 Random Forest", classification_report(y_test, y_pred_rf), sep="\n")

🌲 Random Forest
              precision    recall  f1-score   support

           0       0.69      0.94      0.80      1300
           1       0.81      0.37      0.51       859

    accuracy                           0.72      2159
   macro avg       0.75      0.66      0.66      2159
weighted avg       0.74      0.72      0.69      2159



In [None]:
# XGBoost
# `use_label_encoder` is intentionally not passed: the option is deprecated in
# XGBoost, and recent releases only warn that the parameter is unused.
# NOTE(review): confirm against the installed xgboost version.
xgb_model = XGBClassifier(
    n_estimators=100,
    eval_metric='logloss',
    random_state=2025,
    tree_method='hist',
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("⚡ XGBoost")
print(classification_report(y_test, y_pred_xgb))

⚡ XGBoost
              precision    recall  f1-score   support

           0       0.76      0.87      0.81      1300
           1       0.75      0.59      0.66       859

    accuracy                           0.76      2159
   macro avg       0.76      0.73      0.74      2159
weighted avg       0.76      0.76      0.75      2159



In [None]:
# Matthews correlation coefficient on the held-out split, one per model
# (decision tree, random forest, XGBoost).
mcc_dc, mcc_rf, mcc_xgb = (
    matthews_corrcoef(y_test, predictions)
    for predictions in (y_pred_dc, y_pred_rf, y_pred_xgb)
)

In [None]:
# Side-by-side summary of the three models on the held-out split: accuracy
# first, then the Matthews correlation coefficients computed earlier.
accuracy_lines = [
    "Accuracy:",
    f"DC  : {accuracy_score(y_test, y_pred_dc):.4f}",
    f"RF  : {accuracy_score(y_test, y_pred_rf):.4f}",
    f"XGB : {accuracy_score(y_test, y_pred_xgb):.4f}",
]
mcc_lines = [
    "📈 Matthews Correlation Coefficient",
    f"DC  : {mcc_dc:.4f}",
    f"RF  : {mcc_rf:.4f}",
    f"XGB : {mcc_xgb:.4f}",
]

# A single print call; the empty entry reproduces the blank separator line.
print("\n".join(accuracy_lines + [""] + mcc_lines))

Accuracy:
DC  : 0.6044
RF  : 0.7161
XGB : 0.7578

📈 Matthews Correlation Coefficient
DC  : 0.1751
RF  : 0.3996
XGB : 0.4837
