# BETO baselines

In [None]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1
%env NCCL_P2P_DISABLE=1
%env NCCL_IB_DISABLE=1

import random

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

We create a validation partition, which we also use as an example for the evaluation

In [None]:
# Load the provided files; the full labelled training set stays available as `df`.
test = pd.read_csv("data/test.csv")
df = pd.read_csv("data/train.csv")

# Seeded 80/20 split: `train` is the sampled 80%, `validation` is every remaining row.
train = df.sample(frac=0.8, random_state=42)
validation = df.drop(index=train.index)

# Write both partitions to disk without the pandas index column.
validation.to_csv("data/validation.csv", index=False)
train.to_csv("data/train_val.csv", index=False)

In [None]:
# Identifier of the pretrained checkpoint; the same id is passed to
# `AutoModelForSequenceClassification.from_pretrained` for each model in this notebook.
model_id = "dccuchile/bert-base-spanish-wwm-cased"
# Tokenizer loaded from the same checkpoint id; `tokenize` reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
def tokenize(examples):
    """Encode the ``text`` entry of a batch with the notebook-level ``tokenizer``.

    Sequences are truncated and padded so that each one is exactly 128 tokens long.

    Args:
        examples: Mapping with a ``"text"`` entry holding the string(s) to encode.

    Returns:
        The object returned by the ``tokenizer(...)`` call.
    """
    return tokenizer(
        examples["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )


def preprocessing_data(data):
    """Turn a DataFrame with a ``text`` column into a tokenized, torch-formatted Dataset.

    The ``text`` column is replaced by the tokenizer outputs produced by
    ``tokenize``; any other column (e.g. ``labels``) is kept as is.

    Args:
        data: pandas DataFrame containing at least a ``text`` column.

    Returns:
        The mapped ``Dataset`` with its output format set to ``"torch"``.
    """
    # The frames passed in are row subsets (sample / drop / boolean masks), so their
    # index is not a default RangeIndex. Without preserve_index=False that index is
    # carried into the dataset as an extra ``__index_level_0__`` column.
    dt = Dataset.from_pandas(data, preserve_index=False)
    tokenized_dt = dt.map(tokenize, remove_columns=["text"], batched=True)
    return tokenized_dt.with_format("torch")


def set_deterministic(seed=42):
    """Seed every random number generator used here and pin the cuDNN flags.

    Args:
        seed: Integer seed applied to ``random``, NumPy, and torch (CPU and CUDA).

    Returns:
        None. The function only mutates global RNG and backend state.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Request deterministic kernels, turn off autotuning, and finally disable cuDNN.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = False


# Trainer configuration; the same object is passed to every Trainer in this notebook.
training_args = TrainingArguments(
    output_dir="baselines",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    # Same per-device batch size for training and evaluation.
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
    # Evaluate and checkpoint once per epoch, and load the best model when training ends.
    # NOTE(review): newer transformers releases name this argument `eval_strategy` —
    # confirm against the installed version before renaming.
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
)

## Hard labels

### Task 1

In [None]:
# Hard-label Task 1: expose the `stereotype` column under the name `labels`.
tok_train = preprocessing_data(
    train.loc[:, ["text", "stereotype"]].rename(columns={"stereotype": "labels"})
)
tok_val = preprocessing_data(
    validation.loc[:, ["text", "stereotype"]].rename(columns={"stereotype": "labels"})
)
# The test split is tokenized from its text alone; no label column is passed.
tok_test = preprocessing_data(test.loc[:, ["text"]])

In [None]:
# Re-seed all RNGs before building the model and trainer.
set_deterministic()
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tok_val,
    train_dataset=tok_train,
)
trainer.train()

# Predicted class = index of the larger of the two outputs per row.
predictions = trainer.predict(tok_test)
results = test[["id"]].assign(stereotype=predictions.predictions.argmax(axis=1))
results.to_csv("baselines/beto_t1_hard.csv", index=False)

Save the result to the test DataFrame to filter it in task 2

In [None]:
# Keep the predicted Task 1 class on the test frame so Task 2 can filter on it.
test["stereotype_pred"] = predictions.predictions.argmax(axis=1)

### Task 2

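This baseline only predicts implicitness for the cases that have stereotype == 1 in Task 1 (gold labels for the training and validation sets, predicted labels for the test set); all other test rows are assigned implicit = 0.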

In [None]:
# Hard-label Task 2: train and validate only on rows whose gold `stereotype` is 1,
# with `implicit` exposed under the name `labels`.
tok_train = preprocessing_data(
    train.loc[train["stereotype"] == 1, ["text", "implicit"]].rename(columns={"implicit": "labels"})
)
tok_val = preprocessing_data(
    validation.loc[validation["stereotype"] == 1, ["text", "implicit"]].rename(columns={"implicit": "labels"})
)
# For the test split the filter uses the Task 1 prediction stored in `stereotype_pred`.
tok_test = preprocessing_data(test.loc[test["stereotype_pred"] == 1, ["text"]])

In [None]:
# Re-seed all RNGs before building the model and trainer.
set_deterministic()
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
trainer = Trainer(model=model, args=training_args, train_dataset=tok_train, eval_dataset=tok_val)
trainer.train()

# Test rows that Task 1 labelled as stereotype; only these were tokenized into tok_test.
positive_mask = test["stereotype_pred"] == 1

# Start from the Task 1 results and default every row to implicit = 0.
results2 = results.copy()
results2["implicit"] = 0

# Guard: if Task 1 predicted no positives, tok_test is empty and there is nothing to
# score, so every row keeps the default instead of failing on an empty prediction.
if positive_mask.any():
    predictions = trainer.predict(tok_test)
    results2.loc[positive_mask, "implicit"] = np.argmax(predictions[0], axis=1)

results2.to_csv("baselines/beto_t2_hard.csv", index=False)

## Soft labels

We use the softmax normalization of the 3 annotators to train the models, namely `stereotype_soft` and `implicit_soft` as provided in the training set.

We clip the predicted values to the interval [0,1]

### Task 1

In [None]:
# Soft-label Task 1: expose the `stereotype_soft` column under the name `labels`.
tok_train = preprocessing_data(
    train.loc[:, ["text", "stereotype_soft"]].rename(columns={"stereotype_soft": "labels"})
)
tok_val = preprocessing_data(
    validation.loc[:, ["text", "stereotype_soft"]].rename(columns={"stereotype_soft": "labels"})
)
# The test split is tokenized from its text alone; no label column is passed.
tok_test = preprocessing_data(test.loc[:, ["text"]])

In [None]:
# Re-seed all RNGs before building the model and trainer.
set_deterministic()
# A single output unit: the model predicts one score per text.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tok_val,
    train_dataset=tok_train,
)
trainer.train()

# Clip the raw scores to the interval [0, 1] before writing them out.
predictions = trainer.predict(tok_test)
results = test[["id"]].assign(stereotype=np.clip(predictions.predictions, 0, 1))
results.to_csv("baselines/beto_t1_soft.csv", index=False)

In [None]:
# Keep the raw (unclipped) Task 1 score on the test frame so Task 2 can threshold it.
test["stereotype_pred"] = predictions.predictions

### Task 2

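In this simple baseline, we only predict implicitness for the cases with stereotype_soft > 0.5 (for the test set, the cases whose predicted Task 1 score is > 0.5); all other test rows are assigned implicit = 0.0.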

In [None]:
# Soft-label Task 2: train and validate only on rows whose gold `stereotype_soft`
# exceeds 0.5, with `implicit_soft` exposed under the name `labels`.
tok_train = preprocessing_data(
    train.loc[train["stereotype_soft"] > 0.5, ["text", "implicit_soft"]].rename(
        columns={"implicit_soft": "labels"}
    )
)
tok_val = preprocessing_data(
    validation.loc[validation["stereotype_soft"] > 0.5, ["text", "implicit_soft"]].rename(
        columns={"implicit_soft": "labels"}
    )
)
# For the test split the threshold is applied to the predicted Task 1 score.
tok_test = preprocessing_data(test.loc[test["stereotype_pred"] > 0.5, ["text"]])

In [None]:
# Re-seed all RNGs before building the model and trainer.
set_deterministic()
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
trainer = Trainer(model=model, args=training_args, train_dataset=tok_train, eval_dataset=tok_val)
trainer.train()

# Test rows whose Task 1 score exceeds 0.5; only these were tokenized into tok_test.
positive_mask = test["stereotype_pred"] > 0.5

# Start from the Task 1 results and default every row to implicit = 0.0.
results2 = results.copy()
results2["implicit"] = 0.0

# Guard: if no test row passes the threshold, tok_test is empty and there is nothing
# to score, so every row keeps the default instead of failing on an empty prediction.
if positive_mask.any():
    predictions = trainer.predict(tok_test)
    # Flatten the per-row scores to 1-D so the masked assignment receives exactly
    # one value per selected row, then clip them to [0, 1].
    results2.loc[positive_mask, "implicit"] = np.clip(predictions[0], 0, 1).reshape(-1)

results2.to_csv("baselines/beto_t2_soft.csv", index=False)