In [1]:
# Install the notebook's third-party dependencies into the running kernel's environment.
# NOTE(review): no versions are pinned. This notebook relies on names such as
# `transformers.AdamW` and the `evaluation_strategy` argument, which are presumably tied to the
# 2023-era releases it was run with — confirm the versions and pin them (pkg==x.y.z).
%pip install -q transformers datasets evaluate accelerate scikit-learn torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Imports

In [2]:
from datasets import (
    load_dataset, 
    DatasetDict, 
)
import torch
from typing import Dict, Any
from transformers import (
    AdamW,
    AdamWeightDecay,
    AutoConfig,
    AutoModel,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    EvalPrediction,
    TrainingArguments, 
    Trainer
)

2023-07-02 16:52:16.573867: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-02 16:52:16.697858: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-02 16:52:16.698760: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load Dataset

In [3]:
# Task 1 - Load the train and test splits from ag_news. Randomly select 10% of the training set as validation.

SEED = 42

# Fetch the corpus and shuffle every split with the shared seed in one step.
dataset = load_dataset("ag_news").shuffle(SEED)

# Hold out 10% of the training split; train_test_split exposes the held-out part under "test".
train_val_dataset = dataset["train"].train_test_split(seed=SEED, test_size=0.1)

# Reassemble the splits under explicit names: reduced train, original test, new validation.
dataset = DatasetDict(
    train=train_val_dataset["train"],
    test=dataset["test"],
    val=train_val_dataset["test"],
)

Found cached dataset ag_news (/home/nano/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/nano/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-dd0ff9596fea92b0.arrow
Loading cached shuffled indices for dataset at /home/nano/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-12f3c4e4bf422cce.arrow
Loading cached split indices for dataset at /home/nano/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-8f4a15d7afc8e025.arrow and /home/nano/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-c359d8246d1cf46f.arrow


## Preprocessing Function

In [4]:
# Task 3

# TODO: Check if EOS Token is correctly inserted
# The checkpoint's tokenizer is used with its own special tokens. The pad token is deliberately
# NOT re-pointed at EOS: the model's embeddings are built with padding_idx=1 (see the printed
# architecture), and padding with the EOS id instead would make the padded positions disagree
# with the padding index the model expects.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

def preprocess_function(sample: Dict[str, Any], seq_len: int):
    """Tokenize the ``"text"`` field of ``sample`` to a fixed length.

    Args:
        sample: Mapping that holds the raw text under the ``"text"`` key.
        seq_len: Length every encoding is truncated or padded to.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the text.
    """
    fixed_length_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": seq_len,
    }
    return tokenizer(sample["text"], **fixed_length_options)

# Tokenize every split to 256 tokens and drop the classification label, which the
# masked-language-modelling objective does not use.
# batched=True passes lists of texts to the tokenizer instead of one example per call; with
# fixed-length padding the resulting columns are the same, only the mapping is faster.
encoded_ds = dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={"seq_len": 256},
    remove_columns=["label"],
)

Loading cached processed dataset at /home/nano/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-40aaf69effc6bbaa.arrow


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/nano/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-7aa1ef56f85ed704.arrow


In [5]:
# Task 4
# Language-modelling collator built around the shared tokenizer, with a masking probability of 10%.
collator_options = dict(tokenizer=tokenizer, mlm_probability=0.10)
data_collator = DataCollatorForLanguageModeling(**collator_options)

In [6]:
# Task 5
# Load the pretrained checkpoint with its masked-language-modelling head.
checkpoint_name = "distilroberta-base"
model = AutoModelForMaskedLM.from_pretrained(checkpoint_name)

In [7]:
# Task 6
# Print the module tree of the loaded model to inspect its architecture.
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [10]:
# Task 7
# TODO: Learning Rate Scheduler, Weight Decay

# Candidate scheduler names:
# ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup', 'inverse_sqrt', 'reduce_lr_on_plateau']
lr_scheduler_type = "linear"

training_args = TrainingArguments(
    output_dir="./checkpoints/",
    # Run modes.
    do_train=True,
    do_eval=True,
    # Batching and training length.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    # Optimisation.
    learning_rate=2e-5,
    weight_decay=0.1,
    lr_scheduler_type=lr_scheduler_type,
    # Evaluate and checkpoint once per epoch so the best checkpoint can be restored at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [24]:
# Task 8
# Wire the model, training configuration, masking collator and the train/validation splits together.
trainer_options = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["val"],
)
trainer = Trainer(**trainer_options)

In [25]:
# Task 9
# Start fine-tuning with the trainer configured above.
# NOTE(review): the saved output stops at 0/16875 steps — presumably the run did not finish; confirm.
trainer.train()



  0%|          | 0/16875 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


: 

: 

In [None]:
# Task 10

In [None]:
# Task 11
example = "E-mail scam targets police chief Wiltshire Police warns about <mask> after its fraud squad chief was targeted."

# Trainer.predict expects a tokenized dataset, not a raw string, so the sentence is encoded and
# passed through the fine-tuned model directly.
TOP_K = 5  # number of candidate tokens reported per masked position

mlm_model = trainer.model
mlm_model.eval()  # disable dropout for inference

inputs = tokenizer(example, return_tensors="pt").to(mlm_model.device)
with torch.no_grad():
    logits = mlm_model(**inputs).logits

# Locate every <mask> position in the (single) encoded sentence.
mask_positions = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]

# For each masked position, decode the TOP_K highest-scoring vocabulary entries.
top_token_ids = logits[0, mask_positions].topk(TOP_K, dim=-1).indices
predictions = [
    [tokenizer.decode([token_id]).strip() for token_id in candidate_ids]
    for candidate_ids in top_token_ids.tolist()
]
print(predictions)