In [10]:
import warnings
import helpers
from transformers import AutoTokenizer, AutoConfig, DataCollatorWithPadding, Trainer, TrainingArguments, AutoModelForSequenceClassification
# Suppress every Python warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation notices raised by the
# libraries used in this notebook are hidden too — consider narrowing it.
warnings.filterwarnings("ignore")

In [11]:
# --- Run configuration ------------------------------------------------------
# Pretrained checkpoint to fine-tune, plus the SemEval-2024 Task 8 Subtask A
# multilingual train/dev JSONL files.
model_ckpt = "distilbert-base-multilingual-cased"
train_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_train_multilingual.jsonl"
val_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_dev_multilingual.jsonl"

# Compute device, as chosen by the project `helpers` module.
device = helpers.get_device()

# Tokenizer, config and sequence-classification model all come from the same
# checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
config = AutoConfig.from_pretrained(model_ckpt)

# NOTE(review): `config` is loaded above but not handed to the model here, so
# the model is built from the checkpoint's own defaults.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
model = model.to(device)

In [6]:
# Load the train and validation splits from the two JSONL paths via the project
# helper. `train_df` is later indexed by a "text" column; presumably `val_df`
# has the same layout — confirm in helpers.get_pandas_dfs.
train_df, val_df = helpers.get_pandas_dfs(train_path, val_path)

In [7]:
# Run every training text through the project chunking helper, then give each
# resulting chunk its own row (`explode` repeats the other columns) and rebuild
# a clean 0..n-1 index.
# Presumably chunk_text returns a list of pieces sized for the tokenizer —
# confirm in helpers.chunk_text.
train_df = (
    train_df
    .assign(
        text=lambda frame: frame["text"].apply(
            lambda doc: helpers.chunk_text(doc, tokenizer)
        )
    )
    .explode("text")
    .reset_index(drop=True)
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1958 > 512). Running this sequence through the model will result in indexing errors


In [8]:
# Convert both frames into dataset objects via the project helper.
# Presumably these are Hugging Face `datasets.Dataset` instances (they are
# `.map`-ed with `batched=True` further down) — confirm in helpers.prepare_datasets.
train_ds, val_ds = helpers.prepare_datasets(train_df, val_df)

In [9]:
# Collator that pads each batch when it is assembled and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

def tokenize(batch):
    """Tokenize the ``"text"`` field of a batch for use with ``Dataset.map``.

    Only truncation is applied here. Padding is deliberately left to the
    ``DataCollatorWithPadding`` passed to the ``Trainer``: it pads each
    training batch to that batch's longest sequence, whereas padding inside
    ``map`` would pad every example to the longest sequence of its (much
    larger) map batch and store those pad tokens in the dataset. Returning
    framework tensors is also unnecessary at this stage, since the collator
    builds the tensors when batches are assembled.

    Args:
        batch: Mapping with a ``"text"`` entry holding the string(s) to
            encode (a list of strings when ``map`` is called with
            ``batched=True``).

    Returns:
        The tokenizer's encoding of ``batch["text"]``, truncated to the
        model's maximum length and left unpadded.
    """
    return tokenizer(batch["text"], truncation=True)

# Apply `tokenize` batch-wise to both splits (train first, then validation).
train_ds_encoded, val_ds_encoded = [
    split.map(tokenize, batched=True) for split in (train_ds, val_ds)
]

# Training configuration: outputs go to the "SemEval-Trainer" directory, and
# evaluation and checkpointing both happen once per epoch (the two strategies
# must match for best-model reloading to be allowed).
training_args = TrainingArguments(
    "SemEval-Trainer",
    num_train_epochs=15,
    save_strategy="epoch",
    save_total_limit=20,
    evaluation_strategy="epoch",
    metric_for_best_model="eval_loss",
    # `metric_for_best_model` only takes effect on the final model when this
    # flag is set; without it the Trainer finishes holding the last epoch's
    # weights instead of the checkpoint with the lowest eval loss.
    load_best_model_at_end=True,
)


# Assemble the Trainer, passing every component by keyword.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_ds_encoded,
    eval_dataset=val_ds_encoded,
    tokenizer=tokenizer,
    compute_metrics=helpers.compute_metrics,
)

# Fine-tune, then write the model currently held by the trainer to disk.
trainer.train()
# NOTE(review): save_model() treats its argument as an output directory, and
# the name says "monolingual" although this notebook trains on the
# multilingual files — confirm the intended path before relying on it.
trainer.save_model("fine_tuned_distilbert_for_monolingual.pt")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 789/789 [00:00<00:00, 2914.30 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 2872.24 examples/s]
  0%|          | 0/1485 [00:00<?, ?it/s]

KeyboardInterrupt: 