In [None]:
import warnings
import helpers
from transformers import AutoTokenizer, AutoConfig, AutoModel
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
# Install a blanket "ignore" filter so no warning of any category is shown
# for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries imported above — consider narrowing it with `category=` or
# `module=` if those messages matter.
warnings.filterwarnings("ignore")

In [None]:
# --- Configuration -----------------------------------------------------------
# Checkpoint name shared by the tokenizer, config and model loaded below.
model_ckpt = "distilbert-base-uncased"

# Official SubtaskA monolingual train / dev files.
train_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_train_monolingual.jsonl"
val_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_dev_monolingual.jsonl"

# Extra training files that are appended to the official training split.
train_addon1_path = "./backtranslation_data_mono/mono_addon1.jsonl"
train_addon2_path = "./backtranslation_data_mono/mono_addon2.jsonl"
train_addon3_path = "./backtranslation_data_mono/mono_addon3.jsonl"

# --- Runtime objects ---------------------------------------------------------
# Device is chosen by the project helper; the model is moved onto it below.
device = helpers.get_device()

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
config = AutoConfig.from_pretrained(model_ckpt)

# Load the checkpoint, then place it on the selected device
# (`.to()` returns the module itself, so rebinding keeps the same object).
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# Official train / validation frames.
train_df, val_df = helpers.get_pandas_dfs(train_path, val_path)

# Add-on frames, loaded in the same order as their paths are declared.
train_addon1_df, train_addon2_df, train_addon3_df = (
    helpers.get_pandas_atomic_dfs(addon_path)
    for addon_path in (train_addon1_path, train_addon2_path, train_addon3_path)
)

# Stack the official split and the add-ons into one frame with a fresh 0..n-1 index.
train_df = pd.concat(
    [train_df, train_addon1_df, train_addon2_df, train_addon3_df],
    axis=0,
    ignore_index=True,
)
print(train_df.shape)


def _chunk(text):
    """Delegate to ``helpers.chunk_text`` using the shared tokenizer."""
    return helpers.chunk_text(text, tokenizer)


def _embed_batch(batch):
    """Delegate to ``helpers.extract_hidden_states`` with the shared tokenizer, model and device."""
    return helpers.extract_hidden_states(batch, tokenizer, model, device)


# Replace each training text by its chunks, then give every chunk its own row.
# Only the training frame is chunked; the validation frame is passed on unchanged.
# NOTE(review): presumably chunk_text returns a list of strings — confirm in helpers.
train_df["text"] = train_df["text"].apply(_chunk)
train_df = train_df.explode("text").reset_index(drop=True)
train_ds, val_ds = helpers.prepare_datasets(train_df, val_df)

# Run the embedding helper over both datasets in batches of 128.
# `test_ds_embeddings` is built from the validation dataset despite its name.
train_ds_embeddings = train_ds.map(_embed_batch, batched=True, batch_size=128)
test_ds_embeddings = val_ds.map(_embed_batch, batched=True, batch_size=128)

In [None]:
# Pull the "embeddings" and "label" columns out of each mapped dataset as NumPy arrays.
X_train, y_train = (
    np.array(train_ds_embeddings[column]) for column in ("embeddings", "label")
)
# `test_ds_embeddings` was built from the validation dataset.
X_val, y_val = (
    np.array(test_ds_embeddings[column]) for column in ("embeddings", "label")
)

# Cell output: shapes of the four arrays, for a quick visual check.
tuple(arr.shape for arr in (X_train, X_val, y_train, y_val))

In [None]:
# Classifier 1: logistic regression fitted on the training arrays
# (`fit` returns the estimator itself, so construction and fitting are chained).
lr_clf = LogisticRegression(max_iter=3000).fit(X_train, y_train)
y_pred = lr_clf.predict(X_val)
lr_metrics = helpers.calculate_metrics(y_val, y_pred)
print(f"Logistic Regression metrics: {lr_metrics}")

# Classifier 2: XGBoost with its default settings, same train / validation arrays.
xgb = XGBClassifier().fit(X_train, y_train)
y_pred = xgb.predict(X_val)  # overwrites the logistic-regression predictions
xgb_metrics = helpers.calculate_metrics(y_val, y_pred)
print(f"XGB metrics: {xgb_metrics}")