In [1]:
import os
import warnings

import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModel
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

import helpers
# Suppress every warning category for the rest of the session. The filter is
# installed after the imports above, so warnings raised while those libraries
# are being imported are still shown.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Decide the tokenizers thread-pool setting explicitly, before any tokenizer is
# created. Without it, the run logs "The current process just got forked, after
# parallelism has already been used. Disabling parallelism to avoid
# deadlocks..." and the library turns parallelism off on its own mid-run.
# setdefault keeps any value the user already exported in the environment.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Compute device chosen by the project helper.
device = helpers.get_device()

# Pretrained checkpoint used for the tokenizer, the config and the encoder.
model_ckpt = "distilbert-base-multilingual-cased"

# SemEval-2024 Task 8, Subtask A (multilingual) train / dev splits.
train_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_train_multilingual.jsonl"
val_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_dev_multilingual.jsonl"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
config = AutoConfig.from_pretrained(model_ckpt)
# Bare encoder (AutoModel, no task head), moved to the selected device; it is
# used below only to extract hidden states as features.
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [3]:
def _embed_batch(batch):
    """Pass one batch to helpers.extract_hidden_states with the shared tokenizer, model and device."""
    return helpers.extract_hidden_states(batch, tokenizer, model, device)


# Options shared by both Dataset.map calls below.
_MAP_OPTIONS = {"batched": True, "batch_size": 128}

train_df, val_df = helpers.get_pandas_dfs(train_path, val_path)

# Only the training texts go through chunk_text; each element of its result
# becomes its own row via explode (presumably a list of chunks per document --
# confirm in helpers). val_df is handed to prepare_datasets unchanged.
train_df = (
    train_df
    .assign(text=train_df["text"].apply(helpers.chunk_text, args=(tokenizer,)))
    .explode("text")
    .reset_index(drop=True)
)
train_ds, val_ds = helpers.prepare_datasets(train_df, val_df)

# NOTE: `test_ds_embeddings` is computed from the validation split (val_ds).
train_ds_embeddings = train_ds.map(_embed_batch, **_MAP_OPTIONS)
test_ds_embeddings = val_ds.map(_embed_batch, **_MAP_OPTIONS)

Token indices sequence length is longer than the specified maximum sequence length for this model (1958 > 512). Running this sequence through the model will result in indexing errors
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
Map:   0%|          | 0/15 [00:00<?, ? examples/s]TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 15/15 [00:00<00:00, 27.92 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00,  8.63 examples/s]


In [5]:
# Pull the feature matrix and the label vector out of each embedded dataset as
# NumPy arrays. `test_ds_embeddings` is the embedded validation split.
_COLUMNS = ("embeddings", "label")
X_train, y_train = (np.array(train_ds_embeddings[column]) for column in _COLUMNS)
X_val, y_val = (np.array(test_ds_embeddings[column]) for column in _COLUMNS)

# Cell output: shapes in the order (X_train, X_val, y_train, y_val).
tuple(array.shape for array in (X_train, X_val, y_train, y_val))

((15, 2304), (2, 2304), (15,), (2,))

In [6]:
# Baseline 1: logistic regression on the extracted embeddings.
# fit() returns the fitted estimator, so prediction can be chained onto it.
lr_clf = LogisticRegression(max_iter=3000)
y_pred = lr_clf.fit(X_train, y_train).predict(X_val)
print(f"Logistic Regression metrics: {helpers.calculate_metrics(y_val, y_pred)}")

# Baseline 2: gradient-boosted trees with XGBoost's default hyperparameters.
# `y_pred` is reused, so after this cell it holds the XGBoost predictions.
xgb = XGBClassifier()
y_pred = xgb.fit(X_train, y_train).predict(X_val)
print(f"XGB metrics: {helpers.calculate_metrics(y_val, y_pred)}")

Logistic Regression metrics: {'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1_score': 0.6666666666666666}
XGB metrics: {'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1_score': 0.6666666666666666}
