In [27]:
import warnings
import helpers
from transformers import AutoTokenizer, AutoConfig, DataCollatorWithPadding, Trainer, TrainingArguments, AutoModelForSequenceClassification
warnings.filterwarnings("ignore")

In [28]:
# Checkpoint name and dataset locations used by the cells below.
model_ckpt = "distilbert-base-uncased"
train_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_train_monolingual.jsonl"
val_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_dev_monolingual.jsonl"

# Device selection is delegated to the project helper.
device = helpers.get_device()

# Tokenizer, config and sequence-classification model are all loaded from the same checkpoint.
# Note: `config` is loaded here but is not passed to the model constructor below.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
config = AutoConfig.from_pretrained(model_ckpt)

# The classification head is not part of the checkpoint and is newly initialized
# (see the message printed by this cell), so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Load the train and dev .jsonl files through the project helper; the two results are
# used as pandas DataFrames by the following cells.
train_df, val_df = helpers.get_pandas_dfs(
    train_path,
    val_path,
)

In [30]:
# Replace each training text with whatever helpers.chunk_text(text, tokenizer) returns,
# then give every resulting element its own row (the other columns are repeated per row)
# and renumber the index from 0. Only the training frame is chunked; val_df is left as-is.
train_df["text"] = train_df["text"].apply(helpers.chunk_text, args=(tokenizer,))
train_df = train_df.explode("text", ignore_index=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (1376 > 512). Running this sequence through the model will result in indexing errors


In [31]:
# Turn the two DataFrames into the train/validation dataset objects that are tokenized
# with .map(...) and handed to the Trainer further down.
train_ds, val_ds = helpers.prepare_datasets(
    train_df,
    val_df,
)

In [32]:
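# NOTE(review): disabled code kept for reference. It presumably computed hidden-state
# features with helpers.extract_hidden_states instead of fine-tuning the model (TODO:
# confirm, or delete this cell). The second variable is named `test_...` but is built
# from the validation split `val_ds`.
# train_ds_embeddings = train_ds.map(lambda x: helpers.extract_hidden_states(x, tokenizer, model, device), batched=True, batch_size=128)
# test_ds_embeddings = val_ds.map(lambda x: helpers.extract_hidden_states(x, tokenizer, model, device), batched=True, batch_size=128)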

In [33]:
# Padding is done here, per training/evaluation batch, rather than at tokenization time,
# so each batch is only padded to the length of its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")


def tokenize(batch):
    """Tokenize the ``text`` column of a batch of examples.

    Sequences longer than the tokenizer's maximum length are truncated. No padding is
    applied and no tensors are requested here: the data collator passed to the Trainer
    pads each batch dynamically and converts it to PyTorch tensors.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to tokenize
            (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch (variable-length token sequences).
    """
    return tokenizer(batch["text"], truncation=True)


train_ds_encoded = train_ds.map(tokenize, batched=True)
val_ds_encoded = val_ds.map(tokenize, batched=True)

training_args = TrainingArguments(
    "SemEval-Trainer",
    num_train_epochs=15,
    # Save and evaluate on the same schedule; required for load_best_model_at_end.
    save_strategy="epoch",
    save_total_limit=20,
    evaluation_strategy="epoch",
    metric_for_best_model="eval_loss",
    # Without this flag the best checkpoint is never reloaded, so the model saved
    # after training would be the last-epoch weights rather than the best ones.
    load_best_model_at_end=True,
)


trainer = Trainer(
    model,
    training_args,
    train_dataset=train_ds_encoded,
    eval_dataset=val_ds_encoded,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=helpers.compute_metrics,
)

trainer.train()
# With load_best_model_at_end=True the trainer holds the checkpoint with the lowest
# eval_loss at this point. Note that save_model treats its argument as an output
# directory, despite the ".pt" suffix in the name.
trainer.save_model("fine_tuned_distilbert_for_monolingual.pt")

Map: 100%|██████████| 782/782 [00:00<00:00, 2720.10 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 2453.02 examples/s]
  0%|          | 0/1470 [15:40<?, ?it/s]
  3%|▎         | 38/1470 [00:30<16:50,  1.42it/s] 

KeyboardInterrupt: 