In [None]:
import pandas as pd
import numpy as np
import torch
import re
import tqdm
import xgboost as xgb

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.optimizers import Adam

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import matthews_corrcoef,accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split

from tqdm import tqdm
from tqdm.keras import TqdmCallback
from transformers import AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizer

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# All input CSVs are read from one Drive folder; keeping the folder in a single
# constant means the notebook can be pointed at another location by editing
# one line instead of six.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Gammafest 2025'

# Per-paper metadata table (joined on `paper_id` further down).
metadata = pd.read_csv(f'{DATA_DIR}/combined_with_nan.csv')

# Pre-computed embedding tables (presumably one row per paper -- confirm against the files).
mtdt = pd.read_csv(f'{DATA_DIR}/mtdt_embedding.csv')
txt = pd.read_csv(f'{DATA_DIR}/txt_embedding.csv')

# Competition files: labelled pairs, unlabelled pairs and the submission template.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved row index) from both
# frames.  The metadata embedding also loses its paper_id, so only the text
# embedding carries the paper key forward.
txt_embed = txt.drop('Unnamed: 0', axis=1)
mtdt_embed = mtdt.drop(['Unnamed: 0', 'paper_id'], axis=1)

In [None]:
# pd.concat(axis=1) pairs rows by index label, not by paper id.  Verify that
# both embedding files list the same papers in the same order before gluing
# them side by side; otherwise text and metadata vectors of different papers
# would be silently combined into one row.
if not np.array_equal(txt['paper_id'].to_numpy(), mtdt['paper_id'].to_numpy()):
    raise ValueError('txt_embedding.csv and mtdt_embedding.csv are not aligned on paper_id')

data_embed = pd.concat([txt_embed, mtdt_embed], axis=1)
data_embed

Unnamed: 0,paper_id,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,mtdt_758,mtdt_759,mtdt_760,mtdt_761,mtdt_762,mtdt_763,mtdt_764,mtdt_765,mtdt_766,mtdt_767
0,p0000,-0.307896,0.233469,-0.795748,-0.020367,-0.135363,0.108508,1.538086,1.053744,0.992476,...,0.361256,0.309300,0.940870,0.438250,-0.097464,-0.177169,0.318410,0.378203,-0.130136,0.547858
1,p0001,0.084872,0.392815,-0.504959,-0.021132,-0.114442,0.422108,0.588093,0.723649,0.348245,...,0.345426,-0.708377,0.869000,0.223402,-0.416547,0.202308,-0.492604,0.643108,-0.323697,0.918231
2,p0002,-0.028009,0.680236,-0.821320,0.205567,0.157409,1.311699,-0.855878,0.208994,1.789679,...,-0.080286,-0.225180,0.869219,0.469897,0.418345,0.023076,0.550739,0.508122,-0.765913,1.141435
3,p0003,-0.462517,1.123400,0.482200,0.050586,0.252007,-0.154191,0.874468,1.153361,0.787142,...,0.327962,-1.480308,0.990188,1.004630,-1.131652,0.046745,0.652956,0.394007,0.403567,0.913617
4,p0004,-0.686830,1.164063,0.139771,-0.061362,-0.311246,0.477096,1.062190,0.358880,0.112960,...,0.267060,-0.313841,0.427698,0.400070,-0.242398,0.235142,0.353829,0.140394,0.488866,0.461436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4349,p4349,-0.228524,0.531688,-0.480547,0.038501,-0.109922,-0.126173,0.954068,0.801017,0.071041,...,-0.106525,-1.055142,0.591994,-0.062940,-0.504965,0.337807,0.331830,0.070886,0.197972,0.950505
4350,p4350,-0.760740,0.784616,-0.522071,-0.182086,-0.432741,0.345590,-0.227202,0.635391,0.787055,...,0.678842,-0.207159,1.074868,0.143668,-0.590506,0.170782,0.200454,-0.447116,-0.931619,0.611930
4351,p4351,0.003420,0.617399,-0.119199,-0.603085,0.375625,-0.284949,0.108157,-0.025868,0.097006,...,0.406650,0.062788,0.820162,0.283859,-0.373997,-0.174518,0.225618,-0.155076,-0.131684,0.226827
4352,p4352,-0.822705,0.531300,0.687420,-0.998790,0.647437,-0.258114,0.966562,0.221419,0.495811,...,0.824986,-0.652105,0.342011,1.322424,-0.120471,0.523878,0.667718,0.203245,0.513689,0.717953


In [None]:
# Store every float column as float16 (a smaller dtype than the one read from
# CSV).  The column list is taken once, then converted in a single astype call.
float_cols = data_embed.select_dtypes(include='float').columns
data_embed = data_embed.astype({name: 'float16' for name in float_cols})

In [None]:
# Alias only (no copy is made): `embedding` and `data_embed` name the same DataFrame object.
embedding = data_embed

# Prepare Train and Test Data

In [None]:
# Step 1: look up the metadata of the `paper` side of each pair and keep only
# its publication year/date (renamed with a `paper_` prefix).
train_df_paper = (
    train_df
    .merge(metadata, how='left', left_on='paper', right_on='paper_id')
    .rename(columns={'publication_year': 'paper_year', 'publication_date': 'paper_date'})
    .drop(columns=['paper_id', 'content', 'doi', 'title', 'cited_by_count', 'type', 'authors', 'concepts'])
)

# Step 2: same lookup for the `referenced_paper` side (renamed with a `ref_paper_` prefix).
train_df_new = (
    train_df_paper
    .merge(metadata, how='left', left_on='referenced_paper', right_on='paper_id')
    .rename(columns={'publication_year': 'ref_paper_year', 'publication_date': 'ref_paper_date'})
    .drop(columns=['paper_id', 'content', 'doi', 'title', 'cited_by_count', 'type', 'authors', 'concepts'])
)

In [None]:
# Parse both date columns; values that cannot be parsed become NaT because of
# errors='coerce'.
train_df_new = train_df_new.assign(
    paper_date=pd.to_datetime(train_df_new['paper_date'], format='mixed', errors='coerce'),
    ref_paper_date=pd.to_datetime(train_df_new['ref_paper_date'], format='mixed', errors='coerce'),
)

In [None]:
# Keep only the valid rows: paper_date must be strictly later than ref_paper_date.
train_df_cleaned = (
    train_df_new
    .loc[lambda frame: frame['paper_date'] > frame['ref_paper_date']]
    .reset_index(drop=True)
)

In [None]:
# Keep just the two pair identifiers and the label, as an independent copy.
dt_train = train_df_cleaned[['paper', 'referenced_paper', 'is_referenced']].copy()

In [None]:
# Step 1: look up the metadata of the `paper` side of each test pair and keep
# only its publication year/date (renamed with a `paper_` prefix).
test_df_paper = (
    test_df
    .merge(metadata, how='left', left_on='paper', right_on='paper_id')
    .rename(columns={'publication_year': 'paper_year', 'publication_date': 'paper_date'})
    .drop(columns=['paper_id', 'content', 'doi', 'title', 'cited_by_count', 'type', 'authors', 'concepts'])
)

# Step 2: same lookup for the `referenced_paper` side (renamed with a `ref_paper_` prefix).
test_df_new = (
    test_df_paper
    .merge(metadata, how='left', left_on='referenced_paper', right_on='paper_id')
    .rename(columns={'publication_year': 'ref_paper_year', 'publication_date': 'ref_paper_date'})
    .drop(columns=['paper_id', 'content', 'doi', 'title', 'cited_by_count', 'type', 'authors', 'concepts'])
)

In [None]:
# Parse both date columns; values that cannot be parsed become NaT because of
# errors='coerce'.
test_df_new = test_df_new.assign(
    paper_date=pd.to_datetime(test_df_new['paper_date'], format='mixed', errors='coerce'),
    ref_paper_date=pd.to_datetime(test_df_new['ref_paper_date'], format='mixed', errors='coerce'),
)

In [None]:
# A pair is "valid" only when paper_date is strictly later than ref_paper_date.
# Any comparison against NaT evaluates to False, so a row with a missing or
# unparseable date satisfies neither `>` nor `<=`.  Using the complement of the
# mask (instead of a separate `<=` test) keeps such rows in test_df_0 rather
# than silently dropping them from both frames.
valid_order = test_df_new['paper_date'] > test_df_new['ref_paper_date']

test_df_cleaned = test_df_new[valid_order].reset_index(drop=True)
test_df_0 = test_df_new[~valid_order].reset_index(drop=True)

# Every test row must end up in exactly one of the two frames.
assert len(test_df_cleaned) + len(test_df_0) == len(test_df_new)

# Merging Embeddings with the Training Data

In [None]:
# dt_train holds only the two id columns and the label, so after removing all
# three, X keeps the row index but has no feature columns.
y = dt_train['is_referenced']
X = dt_train.drop(['paper', 'referenced_paper', 'is_referenced'], axis=1)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2025,
)

In [None]:
def process_batches(df_subset, embedding, batch_size=5000):
    """Build a feature matrix and label vector for (paper, referenced_paper) pairs.

    Each row of ``df_subset`` is joined twice against ``embedding``: once on
    ``paper`` and once on ``referenced_paper`` (whose columns get a ``_ref``
    suffix).  The work is done ``batch_size`` rows at a time so the wide merged
    frame never has to exist for the whole input at once.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Must contain the columns ``paper``, ``referenced_paper`` and
        ``is_referenced``.
    embedding : pandas.DataFrame
        Must contain ``paper_id``; every other column is treated as a
        candidate feature.
    batch_size : int, default 5000
        Number of pairs merged per step.

    Returns
    -------
    (X_all, y_all) : tuple of numpy.ndarray
        ``X_all`` is float32 with the paper features followed by the ``_ref``
        features; ``y_all`` is the integer label.  Rows where any feature or
        the label is missing (for example, a paper without an embedding) are
        dropped, so the output can be shorter than ``df_subset``.

    Raises
    ------
    ValueError
        If ``df_subset`` is empty or ``batch_size`` is not positive.
    KeyError
        If a required column is missing from either frame.
    """
    if len(df_subset) == 0:
        raise ValueError("❌ Tidak ada batch yang berhasil diproses. Cek data input.")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    missing = [col for col in ('paper', 'referenced_paper', 'is_referenced')
               if col not in df_subset.columns]
    if missing:
        raise KeyError(f"df_subset is missing required column(s): {missing}")
    if 'paper_id' not in embedding.columns:
        raise KeyError("embedding must contain a 'paper_id' column")

    num_batches = (len(df_subset) + batch_size - 1) // batch_size

    # Two renamed views of the embedding table, one per side of the pair.
    embedding_cols = [col for col in embedding.columns if col != 'paper_id']
    embedding_paper = embedding.rename(columns={'paper_id': 'paper'})
    embedding_ref = embedding.rename(
        columns={'paper_id': 'referenced_paper',
                 **{col: f"{col}_ref" for col in embedding_cols}}
    )
    candidate_cols = embedding_cols + [f"{col}_ref" for col in embedding_cols]

    # Decided on the first merged batch and then reused, so every batch yields
    # the same columns and the final concatenate cannot hit a width mismatch.
    feature_cols = None
    X_list, y_list = [], []

    for i, start in enumerate(range(0, len(df_subset), batch_size)):
        batch = df_subset.iloc[start:start + batch_size]

        # Left joins keep every pair; pairs without an embedding get NaNs.
        batch = batch.merge(embedding_paper, on='paper', how='left')
        batch = batch.merge(embedding_ref, on='referenced_paper', how='left')

        if feature_cols is None:
            # Keep only candidates that survived the merge under their own
            # name and hold numeric data.
            feature_cols = [
                col for col in candidate_cols
                if col in batch.columns and pd.api.types.is_numeric_dtype(batch[col])
            ]

        # Discard pairs with a missing feature or label.
        batch = batch.dropna(subset=feature_cols + ['is_referenced'])

        X_list.append(batch[feature_cols].astype('float32').to_numpy())
        y_list.append(batch['is_referenced'].astype('int').to_numpy())

        print(f"✅ Batch {i+1}/{num_batches} selesai. Fitur: {len(feature_cols)} kolom.")

    X_all = np.concatenate(X_list, axis=0)
    y_all = np.concatenate(y_list, axis=0)

    print(f"🎉 Selesai semua batch! Total data: {X_all.shape}")
    return X_all, y_all

In [None]:
# Hold out 20% of the labelled pairs *before* building features.  Passing the
# full dt_train to process_batches for both sets would make the test set an
# exact copy of the training set, so any score computed on it would say nothing
# about generalisation (and the feature matrix would be held in memory twice).
train_pairs, holdout_pairs = train_test_split(
    dt_train, test_size=0.2, random_state=2025, stratify=dt_train['is_referenced']
)

X_train, y_train = process_batches(train_pairs, embedding)
X_test, y_test = process_batches(holdout_pairs, embedding)

✅ Batch 1/54 selesai. Fitur: 3072 kolom.
✅ Batch 2/54 selesai. Fitur: 3072 kolom.
✅ Batch 3/54 selesai. Fitur: 3072 kolom.
✅ Batch 4/54 selesai. Fitur: 3072 kolom.
✅ Batch 5/54 selesai. Fitur: 3072 kolom.
✅ Batch 6/54 selesai. Fitur: 3072 kolom.
✅ Batch 7/54 selesai. Fitur: 3072 kolom.
✅ Batch 8/54 selesai. Fitur: 3072 kolom.
✅ Batch 9/54 selesai. Fitur: 3072 kolom.
✅ Batch 10/54 selesai. Fitur: 3072 kolom.
✅ Batch 11/54 selesai. Fitur: 3072 kolom.
✅ Batch 12/54 selesai. Fitur: 3072 kolom.
✅ Batch 13/54 selesai. Fitur: 3072 kolom.
✅ Batch 14/54 selesai. Fitur: 3072 kolom.
✅ Batch 15/54 selesai. Fitur: 3072 kolom.
✅ Batch 16/54 selesai. Fitur: 3072 kolom.
✅ Batch 17/54 selesai. Fitur: 3072 kolom.
✅ Batch 18/54 selesai. Fitur: 3072 kolom.
✅ Batch 19/54 selesai. Fitur: 3072 kolom.
✅ Batch 20/54 selesai. Fitur: 3072 kolom.
✅ Batch 21/54 selesai. Fitur: 3072 kolom.
✅ Batch 22/54 selesai. Fitur: 3072 kolom.
✅ Batch 23/54 selesai. Fitur: 3072 kolom.
✅ Batch 24/54 selesai. Fitur: 3072 kolom.
✅

In [None]:
# Custom wrapper for LSTM classifier (sklearn-style)
class LSTMClassifier:
    """Small Keras LSTM binary classifier with fit/predict/predict_proba methods.

    Every sample is presented to the network as a sequence of length one, i.e.
    the input is reshaped to ``(n_samples, 1, input_shape)``.

    Parameters
    ----------
    input_shape : int
        Number of features per sample.
    epochs : int, default 5
        Training epochs passed to ``model.fit``.
    batch_size : int, default 32
        Mini-batch size passed to ``model.fit``.
    """

    def __init__(self, input_shape, epochs=5, batch_size=32):
        self.input_shape = input_shape
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None  # built in fit()

    def _as_sequence(self, X):
        """Return X as an array of shape (n_samples, 1, input_shape).

        Raises ValueError when the last axis does not match ``input_shape``;
        a bare reshape would otherwise silently re-chunk the values into a
        different number of "samples".
        """
        X = np.asarray(X)
        if X.ndim == 0 or X.shape[-1] != self.input_shape:
            raise ValueError(
                f"Expected {self.input_shape} features per sample, got array of shape {X.shape}."
            )
        return X.reshape((-1, 1, self.input_shape))

    def _fitted_model(self):
        """Return the trained Keras model or raise RuntimeError if fit() was never called."""
        if self.model is None:
            raise RuntimeError("LSTMClassifier is not fitted yet; call fit() first.")
        return self.model

    def fit(self, X, y):
        """Build a fresh network, train it on (X, y) and return ``self``."""
        X = self._as_sequence(X)  # Reshape for LSTM
        self.model = Sequential([
            # Time steps whose features are all exactly 0 are masked out.
            Masking(mask_value=0., input_shape=(1, self.input_shape)),
            LSTM(16),
            Dense(1, activation='sigmoid')
        ])
        self.model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        """Return hard 0/1 labels (1 when the sigmoid output exceeds 0.5) as a flat int array."""
        model = self._fitted_model()
        proba = model.predict(self._as_sequence(X), verbose=0)
        return (proba > 0.5).astype(int).flatten()

    def predict_proba(self, X):
        """Return the sigmoid output as a flat 1-D array.

        Note this is one value per sample, not the two-column layout used by
        scikit-learn estimators.
        """
        model = self._fitted_model()
        return model.predict(self._as_sequence(X), verbose=0).flatten()

In [None]:
# TSSE-BIM Boosting-style ensemble using custom base model (e.g., LSTM)
class TSSEBIMBoost:
    """Under-sampling ensemble for imbalanced binary (0/1) classification.

    Each round trains a new base model on all positive samples plus a sample
    of negatives.  In the first round the negatives are drawn uniformly; in
    later rounds the negatives are grouped into equal-width bins of the
    current ensemble probability and one negative is drawn from every
    non-empty bin.  Each model receives a weight derived from its false
    positives and false negatives on the full training set.

    Parameters
    ----------
    n_estimators : int, default 5
        Number of base models to train.
    base_model_class : callable
        Called as ``base_model_class(input_shape)``; the result must provide
        ``fit(X, y)``, ``predict(X)`` (0/1 labels) and ``predict_proba(X)``
        (one positive-class probability per sample, 1-D).
    input_shape : any
        Forwarded unchanged to ``base_model_class``.
    random_state : int or None, default None
        Seed for the negative sampling.  ``None`` uses NumPy's global random
        state.
    """

    def __init__(self, n_estimators=5, base_model_class=None, input_shape=None, random_state=None):
        self.n_estimators = n_estimators
        self.base_model_class = base_model_class
        self.input_shape = input_shape
        self.random_state = random_state
        self.classifiers = []
        self.weights = []

    def _bin_samples(self, probs, bins):
        """Map probabilities to 0-based bin indices for the edges in ``bins``.

        ``np.digitize`` places a value equal to the last edge (probability
        exactly 1.0) one past the final bin, so the result is clipped to the
        valid range ``0 .. len(bins) - 2``.
        """
        last_bin = max(len(bins) - 2, 0)
        return np.clip(np.digitize(probs, bins) - 1, 0, last_bin)

    def _assign_weight(self, fp, fn, fp_max, fn_max):
        """Weight in [0, 1]: close to 1 for few errors, close to 0 for a model
        whose fp or fn equals the largest value seen so far."""
        return (1 - fp / (fp_max + 1e-10)) * (1 - fn / (fn_max + 1e-10))

    @staticmethod
    def _count_errors(y_true, y_pred):
        """Return (false positives, false negatives) for 0/1 labels."""
        y_pred = np.asarray(y_pred).ravel()
        fp = int(np.sum((y_true == 0) & (y_pred == 1)))
        fn = int(np.sum((y_true == 1) & (y_pred == 0)))
        return fp, fn

    def _weighted_vote(self, method_name, X):
        """Weighted average of ``method_name(X)`` over all trained models.

        Falls back to a plain average when the weights do not sum to a
        positive finite number (every weight can be ~0, because the model with
        the largest error count gets a weight of about zero).
        """
        if not self.classifiers:
            raise RuntimeError("TSSEBIMBoost is not fitted yet; call fit() first.")
        outputs = np.stack([
            np.asarray(getattr(clf, method_name)(X), dtype=float).ravel()
            for clf in self.classifiers
        ])
        weights = np.asarray(self.weights, dtype=float)
        total = weights.sum()
        if not np.isfinite(total) or total <= 0:
            weights = np.ones(len(self.classifiers))
            total = weights.sum()
        return weights @ outputs / total

    def fit(self, X, y):
        """Train ``n_estimators`` base models on (X, y) and return ``self``.

        Any models from a previous call are discarded.  Raises ValueError when
        ``base_model_class`` is missing, when X and y differ in length, or when
        y does not contain both classes.
        """
        if self.base_model_class is None:
            raise ValueError("base_model_class must be provided.")

        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples.")

        # Fixed-seed shuffle.  `shuffle` is not imported anywhere in the visible
        # notebook cells; permuting arange(n) with RandomState(42) is what
        # sklearn.utils.shuffle(..., random_state=42) does internally.
        order = np.random.RandomState(42).permutation(len(y))
        X, y = X[order], y[order]

        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        pos_idx = np.where(y == 1)[0]
        neg_idx = np.where(y == 0)[0]
        if len(pos_idx) == 0 or len(neg_idx) == 0:
            raise ValueError("y must contain both positive (1) and negative (0) samples.")

        k = len(pos_idx)
        bins = np.linspace(0, 1, k + 1)

        self.classifiers = []
        self.weights = []
        fp_seen, fn_seen = [], []  # per-model error counts, cached to avoid re-predicting

        for t in range(self.n_estimators):
            if t == 0:
                # No model yet, so every negative would share one probability
                # bin and only a single negative would be sampled.  Draw a
                # balanced uniform sample instead.
                neg_sampled_idx = rng.choice(neg_idx, size=min(k, len(neg_idx)), replace=False)
            else:
                neg_probs = self._weighted_vote('predict_proba', X)[neg_idx]
                bin_assignments = self._bin_samples(neg_probs, bins)
                # One uniformly random negative per non-empty bin: after a
                # random permutation, the first occurrence of each bin id is a
                # uniform draw from that bin.
                shuffled = rng.permutation(len(neg_idx))
                _, first_pos = np.unique(bin_assignments[shuffled], return_index=True)
                neg_sampled_idx = neg_idx[shuffled[first_pos]]

            train_idx = np.concatenate([pos_idx, neg_sampled_idx]).astype(int)
            clf = self.base_model_class(self.input_shape)
            clf.fit(X[train_idx], y[train_idx])

            fp, fn = self._count_errors(y, clf.predict(X))
            fp_seen.append(fp)
            fn_seen.append(fn)

            # The maxima include the current model, as in a running "worst so far".
            weight = self._assign_weight(fp, fn, max(fp_seen), max(fn_seen))
            self.classifiers.append(clf)
            self.weights.append(weight)

        return self

    def predict(self, X):
        """Boolean array: True where the weighted vote of hard labels exceeds 0.5."""
        return self._weighted_vote('predict', X) > 0.5

    def predict_proba(self, X):
        """Weighted average of the base models' positive-class probabilities (1-D)."""
        return self._weighted_vote('predict_proba', X)