In [1]:
import os
import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [2]:
# Read the raw table from disk; the path is relative to the notebook's
# working directory.
RAW_CSV_PATH = '../data/raw.csv'
df = pd.read_csv(RAW_CSV_PATH)

In [3]:
# Load the precomputed embedding matrix; it is used directly as the feature
# matrix `X`.
data = np.load("embeddings.npy")
X = data

# Features and labels are paired purely by row position (they are passed
# together to the train/validation split), so a row-count mismatch means they
# no longer line up. Fail here with a clear message instead of deeper inside a
# later cell.
if len(X) != len(df):
    raise ValueError(
        f"embeddings.npy has {len(X)} rows but the dataframe has {len(df)} rows; "
        "features and labels must be aligned one-to-one"
    )

# Encode the values of the `author` column as integer class ids. `le` is kept
# so the ids can be mapped back to the original values via `le.classes_`.
le = LabelEncoder()
y = le.fit_transform(df["author"])


In [4]:
from sklearn.model_selection import train_test_split

# Stratified hold-out: 20% of the samples go to validation, `stratify=y` keeps
# the class proportions in both parts, and the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [11]:
from catboost import CatBoostClassifier
from catboost.utils import get_gpu_device_count
from sklearn.metrics import classification_report, log_loss

# Train on GPU 0 when CatBoost can see a GPU; otherwise fall back to the CPU
# so the cell still runs (more slowly) on machines without one instead of
# raising. `devices` is only passed together with the GPU task type.
if get_gpu_device_count() > 0:
    device_params = {"task_type": "GPU", "devices": "0"}
else:
    device_params = {"task_type": "CPU"}

# Multiclass gradient-boosted trees; the same MultiClass loss is used both for
# optimisation and as the metric reported on the eval set.
cb = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="MultiClass",
    iterations=500,
    learning_rate=0.3,
    depth=8,
    l2_leaf_reg=3.0,
    random_seed=42,
    verbose=200,  # print a progress line every 200 iterations
    **device_params,
)

# The validation split doubles as the eval set: with use_best_model=True the
# fitted model is shrunk to the iteration with the best eval metric.
# NOTE(review): because X_val drives that selection, metrics computed on X_val
# afterwards are optimistically biased; consider a separate test split.
cb.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True
)

0:	learn: 0.9802287	test: 0.9935525	best: 0.9935525 (0)	total: 89.5ms	remaining: 44.6s
200:	learn: 0.1010463	test: 0.4953530	best: 0.4953530 (200)	total: 15.5s	remaining: 23.1s
400:	learn: 0.0361603	test: 0.4769990	best: 0.4768949 (397)	total: 30.4s	remaining: 7.51s
499:	learn: 0.0240661	test: 0.4752586	best: 0.4744445 (467)	total: 37.8s	remaining: 0us
bestTest = 0.4744444741
bestIteration = 467
Shrink model to first 468 iterations.


<catboost.core.CatBoostClassifier at 0x7ff852c8fc50>

In [12]:
# Class-probability matrix for the validation set (one column per class).
proba = cb.predict_proba(X_val)
print("proba shape:", proba.shape)

# Hard predictions: column index of the most probable class in each row.
pred = np.argmax(proba, axis=1)

# Per-class precision / recall / F1 on the validation set.
val_report = classification_report(y_val, pred)
print(val_report)

# Multiclass log loss computed from the full probability matrix.
val_logloss = log_loss(y_val, proba)
print("logloss:", val_logloss)

proba shape: (3916, 3)
              precision    recall  f1-score   support

           0       0.79      0.83      0.81      1580
           1       0.85      0.81      0.83      1127
           2       0.81      0.79      0.80      1209

    accuracy                           0.81      3916
   macro avg       0.82      0.81      0.81      3916
weighted avg       0.81      0.81      0.81      3916

logloss: 0.4744444376654691


In [13]:
from sklearn.metrics import f1_score

# Recompute hard predictions from the (N, 3) validation probability matrix.
proba = cb.predict_proba(X_val)
y_pred = np.argmax(proba, axis=1)

# One F1 score per averaging scheme, evaluated in the order
# macro, micro, weighted.
f1_macro, f1_micro, f1_weighted = (
    f1_score(y_val, y_pred, average=scheme)
    for scheme in ("macro", "micro", "weighted")
)

print("F1 macro   :", f1_macro)
print("F1 micro   :", f1_micro)
print("F1 weighted:", f1_weighted)


F1 macro   : 0.8140475216406012
F1 micro   : 0.8133299284984679
F1 weighted: 0.8134195529376034
