In [None]:
pip install transformers datasets sentencepiece

In [None]:
!apt-get install git-lfs

In [None]:
import os
# Export TOKENIZERS_PARALLELISM=false for this process.
# NOTE(review): presumably meant to silence the tokenizers fork warning — confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset, DatasetDict
import torch

Constants

In [None]:
ls

In [None]:
# Read the train and test CSV files from the working directory; each file
# becomes the split named by its keyword.
dataset = load_dataset(
    path="csv",
    data_files=dict(
        train="./review_references_title_abstracts_sample_train.csv",
        test="./review_references_title_abstracts_sample_test.csv",
    ),
)

## Defining the model

In [None]:
# Load the tokenizer for the same checkpoint name that the classifier uses.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="microsoft/deberta-v3-xsmall"
)

In [None]:
# Show the loaded dataset object (train/test splits) as the cell output.
dataset

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of records and attach their classification labels.

    Args:
        examples: Batch mapping column names to lists of values. The
            "title_abstract" column supplies the text and the "review"
            column supplies the label.

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding the values of the "review" column.
    """
    # Texts longer than 1024 tokens are truncated.
    encoded = tokenizer(examples["title_abstract"], truncation=True, max_length=1024)
    # Put the labels on the returned mapping instead of mutating the input
    # batch, so the output is complete on its own and the caller's batch is
    # left untouched.
    encoded["labels"] = examples["review"]
    return encoded

# Apply preprocess_function to every split in batches; the original columns
# (names taken from the train split) are removed from the result.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    remove_columns=dataset["train"].column_names,
    batched=True,
)

In [None]:
# Show the tokenized dataset object as the cell output.
tokenized_dataset

In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer; it is handed to the Trainer to assemble
# batches (padding behaviour comes from DataCollatorWithPadding's defaults).
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model with a two-class output, loaded from the
# pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="microsoft/deberta-v3-xsmall",
    num_labels=2,
)

In [None]:
from datasets import load_metric

def compute_metrics(eval_preds):
    """Compute accuracy and binary F1 from model logits and reference labels.

    The result uses the same two keys ("accuracy" and "f1") that the
    GLUE/MRPC metric reports, but is computed directly with NumPy so that no
    metric script has to be loaded on every evaluation call.

    Args:
        eval_preds: A pair ``(logits, labels)``. ``logits`` has one score per
            class along its last axis; ``labels`` holds the reference class
            ids.

    Returns:
        dict with float entries "accuracy" (fraction of exact matches) and
        "f1" (F1 score of class 1; 0.0 when class 1 is neither predicted nor
        present). Both are 0.0 for empty input.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard against an empty evaluation set (mean of nothing would be NaN).
    if labels.size == 0:
        return {"accuracy": 0.0, "f1": 0.0}

    accuracy = float(np.mean(predictions == labels))

    # Binary F1 with class 1 as the positive class: 2TP / (2TP + FP + FN).
    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {"accuracy": accuracy, "f1": float(f1)}

In [None]:
# Hyper-parameters for the fine-tuning run; the first argument is the output
# directory.
# NOTE(review): push_to_hub=True presumably needs prior Hugging Face Hub
# authentication, which this notebook does not perform — confirm.
training_args = TrainingArguments(
    "deberta-v3-xsmall-finetuned-review_classifier",
    learning_rate=4.5e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=2,
    weight_decay=0.01,
    warmup_steps=1000,
    evaluation_strategy="epoch",
    save_total_limit=2,
    push_to_hub=True,
    # Enable mixed precision only when a CUDA device is available, so the
    # cell also runs on CPU-only machines without a manual edit.
    fp16=torch.cuda.is_available(),
)

# Wire the model, training arguments, data splits, collator and metric
# callback into a single Trainer. The "test" split is used for evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [None]:
# Evaluate on the test split before any training has run, giving a baseline
# to compare the fine-tuned scores against.
trainer.evaluate()

In [None]:
# Run fine-tuning with the Trainer configured earlier in the notebook.
trainer.train()

In [None]:
# Push the trainer's results to the Hub with the given commit message and tag.
# NOTE(review): presumably requires Hub authentication beforehand — confirm.
trainer.push_to_hub(commit_message="Training complete", tags="text-classification")