In [1]:
# Install the libraries used by this notebook into the kernel's environment (-q = quiet).
# NOTE(review): no versions are pinned, so the installed releases depend on when this cell is run.
%pip install -q transformers datasets evaluate accelerate scikit-learn torch

In [2]:
# Colab-only setup: expose Google Drive inside the runtime's file system.
from google.colab import drive

drive.mount("/content/drive")

# Folder on the mounted Drive that receives training checkpoints and the final model.
# Change this to your local project folder.
model_save_path = "/content/drive/MyDrive/Colab Notebooks/sose2023"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [3]:
import math
from typing import Dict, Any

import torch
from datasets import (
    load_dataset,
    DatasetDict,
)
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    pipeline
)

In [4]:
# Identifier of the pretrained checkpoint; both the tokenizer and the model are loaded from it below.
model_checkpoint = "distilroberta-base"

## Load Dataset

In [5]:
# Task 1 - Load the train and test splits from ag_news. Randomly select 10% of the training set as validation.

SEED = 42

# Load every split of ag_news and shuffle each one with the same fixed seed.
dataset = load_dataset("ag_news").shuffle(seed=SEED)

# dataset["train"] = load_dataset("ag_news", split="train[:4000]") # Note: This is useful for sanity checking the training process. Comment out/Uncomment as necessary

# Carve 10% out of the training split; train_test_split calls that part "test",
# but it is used as the validation set here.
train_val_dataset = dataset["train"].train_test_split(test_size=0.1, seed=SEED)

# Re-assemble the three splits: the original test split is kept untouched,
# train/val come from the split performed above.
dataset = DatasetDict(
    {
        "train": train_val_dataset["train"],
        "test": dataset["test"],
        "val": train_val_dataset["test"],
    }
)



  0%|          | 0/2 [00:00<?, ?it/s]



## Preprocessing Function

In [6]:
# Task 3
# Load the tokenizer that belongs to the pretrained checkpoint selected in `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(example, seq_len: int):
    """Tokenize the ``text`` field to a fixed length and attach a ``labels`` column.

    Args:
        example: Mapping with a ``"text"`` entry. It is called with ``batched=True``
            below, so ``example["text"]`` holds the texts of a whole batch.
        seq_len: Target length; longer inputs are truncated and shorter ones are
            padded up to exactly this many tokens.

    Returns:
        The tokenizer output (including the special-tokens mask that was
        requested) extended with ``labels``, a copy of ``input_ids``.
    """
    encoding = tokenizer(
        example["text"],
        max_length=seq_len,
        padding="max_length",
        truncation=True,
        return_special_tokens_mask=True,
    )
    # Keep an independent copy of the token ids as the training target column.
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

# Tokenize every split in batches with a sequence length of 256 and drop the
# raw columns so that only the tokenizer outputs (plus `labels`) remain.
encoded_ds = dataset.map(
    preprocess_function,
    fn_kwargs={"seq_len": 256},
    remove_columns=["text", "label"],
    batched=True,
)



Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [7]:
# Display the tokenized splits (columns and row counts) as a sanity check.
encoded_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 108000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 7600
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 12000
    })
})

## Data Collator

In [8]:
# Task 4
# Collator for masked language modelling, configured with a masking probability of 10%.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.10,
)

## Load Model

In [9]:
# Task 5
# Load the pretrained checkpoint with a masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [10]:
# Task 6
# Display the module structure of the loaded model.
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

## Define TrainingArguments

In [11]:
# Task 7

num_epochs = 12
batch_size = 32
# The model starts from pretrained weights, so the step size must stay small:
# the previous 3e-4 is roughly an order of magnitude above the usual fine-tuning
# range (1e-5 .. 5e-5) and is a likely reason the training loss did not decrease.
# 5e-5 is also the TrainingArguments default.
lr = 5e-5
weight_decay = 0.001

# ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup', 'inverse_sqrt', 'reduce_lr_on_plateau']
lr_scheduler_type = "cosine"

training_args = TrainingArguments(
    output_dir=model_save_path,
    do_train=True,
    do_eval=True,

    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    lr_scheduler_type=lr_scheduler_type,
    weight_decay=weight_decay,
    # Use the notebook-wide seed so data split and training share one source of randomness.
    seed=SEED,

    # Save, log and evaluate once per epoch; save and evaluation strategy must
    # match for load_best_model_at_end to work.
    save_strategy="epoch",
    logging_strategy="epoch",
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`.
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    # One checkpoint per epoch would otherwise pile up in the Drive folder;
    # the best checkpoint is still retained when load_best_model_at_end is set.
    save_total_limit=2,
)

## Define Trainer

In [12]:
# Task 8
# Wire model, training configuration, data splits and the MLM collator together.
# The validation split is the one evaluated during training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["val"],
)

## Train Model

In [None]:
# Task 9

# TODO: Hyper parameter tuning
#   - batch size
#   - number of epochs
#   - weight decay
#   - learning rate
# Note: Should be executed in Google Colab
# Note: Does not yet work as intended... Training loss does not seem to go down...
# NOTE(review): if the loss stalls, check the learning rate configured in the
#   TrainingArguments cell first; presumably a large value is the cause when
#   fine-tuning a pretrained checkpoint. Confirm by comparing runs.

# Run the training loop of the Trainer built above (train split for training,
# validation split for the per-epoch evaluation).
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# Store the final model in a sub-folder named after the run's epochs and batch size.
final_model_dir = f"{model_save_path}/{num_epochs}epochs_{batch_size}batchsize/"
trainer.save_model(final_model_dir)

# The Trainer above was created without a tokenizer, so depending on the
# transformers version save_model may write the model files only. Saving the
# tokenizer next to them makes the folder loadable on its own (e.g. by a pipeline).
tokenizer.save_pretrained(final_model_dir)

## Evaluation on Validation and Test Splits with Perplexity

In [None]:
# Task 10
# Perplexity is the exponential of the mean cross-entropy loss, so it can be
# derived directly from the loss that `trainer.evaluate` reports for a split.
# Background: https://huggingface.co/docs/transformers/perplexity
# (The `evaluate` "perplexity" metric scores raw text with a causal LM and is
# therefore not applicable to this masked-LM setup.)
# The collator draws the masked positions at random on every pass, so the
# numbers can differ slightly between runs.

val_metrics = trainer.evaluate(eval_dataset=encoded_ds["val"], metric_key_prefix="val")
test_metrics = trainer.evaluate(eval_dataset=encoded_ds["test"], metric_key_prefix="test")

# The metric keys are prefixed with the `metric_key_prefix` passed above.
val_perplexity = math.exp(val_metrics["val_loss"])
test_perplexity = math.exp(test_metrics["test_loss"])

print(f"Validation perplexity: {val_perplexity:.2f}")
print(f"Test perplexity:       {test_perplexity:.2f}")

## Inference

In [None]:
# Task 11
# Ask the fine-tuned model for the five most likely fillers of the <mask> slot.

text = "E-mail scam targets police chief Wiltshire Police warns about <mask> after its fraud squad chief was targeted."

# The pipeline task is called "fill-mask"; it needs the model itself (not the
# Trainer wrapper) and the matching tokenizer.
mask_filler = pipeline("fill-mask", model=trainer.model, tokenizer=tokenizer)
mask_filler(text, top_k=5)