# BiberPlus Encodings + Random Forest

In [1]:
import os
import logging
import numpy as np
import pandas as pd

from tqdm import tqdm
from typing import List
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report

## Data Setup

**Load the data in**

In [3]:
input_directory = '/shared/3/datasets/PAN/pan20-av-training-small/tagged/'

train = pd.read_json(input_directory + 'train_tagged.jsonl', lines=True)
test = pd.read_json(input_directory + 'test_tagged.jsonl', lines=True)
print(len(train), len(test))

42098 5262


**Convert the tagged data into a training dataframe**

In [5]:
def get_training_row(row) -> List:
    try:
        encodings_a, encodings_b = row.neural_biber_1, row.neural_biber_2
        vals = [row.same]
        training_row = vals + encodings_a + encodings_b
    except Exception as e:
        logging.error(f'Error processing row {row.Index}: {e}')
        return None
    return training_row

def generate_training_data(df) -> pd.DataFrame:
    training_rows = []
    
    # Using itertuples for better speed than iterrows
    for row in tqdm(df.itertuples(), total=df.shape[0]):
        training_row = get_training_row(row)
        if training_row is not None:
            training_rows.append(training_row)

    label_column = ['same']
    cols_a = [f'A{i}' for i in range(96)]
    cols_b = [f'B{i}' for i in range(96)]
    training_cols = label_column + cols_a + cols_b

    return pd.DataFrame(training_rows, columns=training_cols)
    
training_df = generate_training_data(train)
testing_df = generate_training_data(test)
print(f'Generated {len(training_df)} training rows.')
print(f'Generated {len(testing_df)} training rows.')
training_df.head()

100%|██████████| 42098/42098 [00:00<00:00, 122993.59it/s]
100%|██████████| 5262/5262 [00:00<00:00, 160549.57it/s]

Generated 42098 training rows.
Generated 5262 training rows.





Unnamed: 0,same,A0,A1,A2,A3,A4,A5,A6,A7,A8,...,B86,B87,B88,B89,B90,B91,B92,B93,B94,B95
0,True,0.603041,0.209754,0.026219,0.183534,0.445726,0.996329,0.786576,0.0,0.0,...,0.0,1.0,0.0,0.841647,0.894431,0.709686,0.973608,0.23753,0.079177,0.026392
1,True,0.617589,0.296443,0.074111,0.321146,0.395257,1.0,0.901186,0.123518,0.098814,...,0.0,1.0,0.0,0.87712,0.852544,0.950848,0.852544,0.221185,0.245761,0.073728
2,True,0.494193,0.271806,0.172968,0.469484,0.197677,1.0,0.691871,0.02471,0.172968,...,0.0,1.0,0.0,0.848092,0.673485,0.947867,0.900225,0.32427,0.124719,0.074832
3,True,0.699482,0.440415,0.15544,0.284974,0.310881,1.0,0.725389,0.207254,0.025907,...,0.0,1.0,0.0,0.765197,0.947822,0.921732,1.0,0.443517,0.260892,0.130446
4,True,0.687723,0.229241,0.050942,0.229241,0.305655,1.0,0.891493,0.050942,0.050942,...,0.0,1.0,0.0,0.653424,1.0,0.764767,0.921589,0.444328,0.26137,0.0


## Model Training

In [9]:
y_train = training_df['same']
X_train = training_df.drop('same', axis=1)

y_test = testing_df['same']
X_test = testing_df.drop('same', axis=1)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))


In [10]:
y_train = training_df['same']
X_train = training_df.drop('same', axis=1)

y_test = testing_df['same']
X_test = testing_df.drop('same', axis=1)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))


Accuracy: 0.7590269859369061


In [11]:
y_train = training_df['same']
X_train = training_df.drop('same', axis=1)

y_test = testing_df['same']
X_test = testing_df.drop('same', axis=1)

clf = RandomForestClassifier(n_estimators=500, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.79      0.68      0.73      2498
        True       0.74      0.83      0.78      2764

    accuracy                           0.76      5262
   macro avg       0.76      0.76      0.76      5262
weighted avg       0.76      0.76      0.76      5262

0.7550578813267691
