Train an IMDb negative/positive sentiment classifier with DistilBERT

https://stackoverflow.com/questions/69087044/early-stopping-in-bert-trainer-instances

In [3]:
import os
# Show the working directory: the relative paths used in this notebook
# ('../', 'large_data/...') are resolved against it.
print('os.getcwd()', os.getcwd())
import sys
# Make the parent directory importable by inserting it at position 1 of sys.path.
sys.path.insert(1, '../')
# Record the interpreter version alongside the library versions printed below.
print(sys.version)
import time

#plotting tools
from matplotlib import pyplot as plt 
from tqdm.notebook import tqdm as tqdm

import numpy as np

#torch libs
import torch
print('torch.__version__', torch.__version__)
# Number of CUDA devices visible to this process (0 on a CPU-only machine).
NUM_GPUS = torch.cuda.device_count()

# Handle to the highest-indexed GPU. Fall back to the CPU so that `last_gpu`
# is always defined; without the fallback the print below raised NameError
# whenever no GPU was available.
if NUM_GPUS > 0:
    last_gpu = torch.device(f'cuda:{NUM_GPUS - 1}')
else:
    last_gpu = torch.device('cpu')

print('NUM_GPUS', NUM_GPUS, 'last_gpu', last_gpu)

# `device` is the default torch device; `pipe_device` is the matching value
# passed as `device=` to the transformers pipeline later in the notebook
# (0 when CUDA is available, -1 otherwise).
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
pipe_device = 0 if cuda_available else -1
print(pipe_device, device)

#huggingface transformers
import transformers
print('transformers.__version__',transformers.__version__)
from transformers import AutoTokenizer, pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2PreTrainedModel

from datasets import load_dataset

#jupyter stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

os.getcwd() /home/carson/rlhf/notebooks
3.8.10 (default, Nov 14 2022, 12:59:47) 
[GCC 9.4.0]
torch.__version__ 1.11.0+cu113
NUM_GPUS 2 last_gpu cuda:1
0 cuda
transformers.__version__ 4.25.1
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Option 1 (disabled): load the IMDb dataset without selecting a single split.
#imdb_dataset = load_dataset("imdb")

# Option 2 (active): load only the "train" split and hold out 10% of it as a
# "test" split, giving a smaller train/test pair.
# NOTE(review): no seed is passed to train_test_split, so the split is
# presumably not reproducible across runs -- confirm, and pass `seed=` if a
# fixed split is needed.
imdb_dataset = load_dataset("imdb", split="train")
imdb_dataset = imdb_dataset.train_test_split(test_size=0.1)

Found cached dataset imdb (/home/carson/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


In [3]:
print(imdb_dataset)
print('')
print(imdb_dataset['train'].features)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 22500
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2500
    })
})

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [4]:
imdb_dataset['train']['text'][0][:100]

"For long time I haven't seen such a good fantasy movie, magic fights here are even better than in LO"

In [4]:
# Mean of the 0/1 labels = fraction of label 1 ('pos') in the training split;
# a value close to 0.5 means the two classes are balanced.
print(np.mean(imdb_dataset['train']['label']))
# Every 1000th label, as a quick look at how the classes are ordered.
print(imdb_dataset['train']['label'][::1000])

0.4988
[0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1]


Load a pretrained DistilBERT model and tokenizer

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Base checkpoint used for both the model weights and the tokenizer.
model_name = "distilbert-base-uncased"

# Sequence-classification model built from the base checkpoint. The loader's
# log output reports that the classifier / pre_classifier weights are newly
# initialized, so the model needs fine-tuning before its predictions are useful.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

Preprocess Data

In [7]:
def tokenize(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook-level tokenizer.

    Truncation is enabled and no padding is requested here. The tokenizer's
    output is returned unchanged.
    """
    return tokenizer(examples['text'], truncation=True)

tokenized_dataset = imdb_dataset.map(tokenize, batched=True)

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [11]:
print(tokenized_dataset)
print("")
print(tokenized_dataset['train']['text'][0][:100])
print("")
print(tokenized_dataset['train']['input_ids'][0][-10:])

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 22500
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2500
    })
})

For long time I haven't seen such a good fantasy movie, magic fights here are even better than in LO

[5875, 4784, 1012, 16755, 7955, 2000, 2156, 2009, 1012, 102]


Prepare Trainer and Evaluation Metric

In [23]:
from datasets import load_metric
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

def compute_accuracy_metrics(eval_preds):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    The accuracy is computed directly with NumPy instead of calling
    ``load_metric("accuracy")`` on every evaluation: that call is deprecated
    and re-loaded the metric each time this function ran.

    Args:
        eval_preds: Two-item sequence ``(logits, labels)``. The predicted
            class of each example is the argmax of ``logits`` over the last
            axis; ``labels`` holds the reference class ids.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose predicted class equals the label, as a Python float.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

def compute_metrics(p):
    """Score predictions with accuracy, precision, recall and F1.

    ``p`` unpacks into ``(scores, labels)``; the predicted class of each
    example is the argmax of ``scores`` along axis 1. The four values are
    computed with the scikit-learn scorers imported by this notebook and
    returned in a dict keyed "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    acc = accuracy_score(y_true=labels, y_pred=predicted)
    rec = recall_score(y_true=labels, y_pred=predicted)
    prec = precision_score(y_true=labels, y_pred=predicted)
    f1_value = f1_score(y_true=labels, y_pred=predicted)

    return dict(accuracy=acc, precision=prec, recall=rec, f1=f1_value)

In [24]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import EarlyStoppingCallback, IntervalStrategy

from huggingface_hub import Repository
repo = Repository(local_dir="large_data/distilbert-imdb", clone_from="clam004/distilbert-imdb")
repo.git_pull()

os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Training configuration.
# One batch size is shared by training and evaluation. The eval setting used
# to reference a `batch_size` name that was never defined in this notebook,
# which raised NameError when the cell was run on a fresh kernel.
batch_size = 16

training_args = TrainingArguments(
    output_dir="large_data/distilbert-imdb",
    evaluation_strategy=IntervalStrategy.STEPS,  # evaluate on a step interval
    eval_steps=10,  # run evaluation every 10 training steps
    save_total_limit=1,  # cap on retained checkpoints; older ones are deleted
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
    # F1 (from compute_metrics) selects the best checkpoint, which is
    # reloaded once training finishes.
    metric_for_best_model='f1',
    load_best_model_at_end=True,
)

# Collator built from the tokenizer; the tokenize step above does not pad.
data_collator = DataCollatorWithPadding(tokenizer)

# NOTE(review): with eval_steps=10 and early_stopping_patience=2, training
# presumably stops after two consecutive evaluations (20 steps) without an
# F1 improvement -- confirm this is the intended sensitivity.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

/home/carson/rlhf/notebooks/large_data/distilbert-imdb is already a clone of https://huggingface.co/clam004/distilbert-imdb. Make sure you pull the latest changes with `repo.git_pull()`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
/home/carson/rlhf/notebooks/large_data/distilbert-imdb is already a clone of https://huggingface.co/clam004/distilbert-imdb. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
trainer.train()

In [26]:
trainer.push_to_hub()

Saving model checkpoint to large_data/distilbert-imdb
Configuration saved in large_data/distilbert-imdb/config.json
Model weights saved in large_data/distilbert-imdb/pytorch_model.bin
tokenizer config file saved in large_data/distilbert-imdb/tokenizer_config.json
Special tokens file saved in large_data/distilbert-imdb/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 4.00k/255M [00:00<?, ?B/s]

Upload file training_args.bin: 100%|##########| 3.30k/3.30k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/clam004/distilbert-imdb
   e844da1..84b7f41  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'dataset': {'name': 'imdb', 'type': 'imdb', 'config': 'plain_text', 'split': 'train', 'args': 'plain_text'}}
To https://huggingface.co/clam004/distilbert-imdb
   84b7f41..dc2f4fb  main -> main



'https://huggingface.co/clam004/distilbert-imdb/commit/84b7f419d59564c4f55897b2f235d9bb1f3c7c6c'

expected output

```
To https://huggingface.co/clam004/distilbert-imdb

'https://huggingface.co/clam004/distilbert-imdb/commit/29940673ff3a7a22ab3607152dbaa821f1d10a4e'
```

In [27]:
# Load the model that was just pushed to the Hub repo "clam004/distilbert-imdb"
# and wrap it in a sentiment-analysis pipeline placed on `pipe_device`.
sentiment_pipe = pipeline("sentiment-analysis","clam004/distilbert-imdb", device=pipe_device)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

loading configuration file config.json from cache at /home/carson/.cache/huggingface/hub/models--clam004--distilbert-imdb/snapshots/dc2f4fbd94e39ed21acb9551f92981f1e0354738/config.json
Model config DistilBertConfig {
  "_name_or_path": "clam004/distilbert-imdb",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading configuration file config.json from cache at /home/carson/.cache/huggingface/hub/models--clam004--distilbert-imdb/snapshots/dc2f4fbd94e39ed21acb9551f92981f1e0354

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /home/carson/.cache/huggingface/hub/models--clam004--distilbert-imdb/snapshots/dc2f4fbd94e39ed21acb9551f92981f1e0354738/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at clam004/distilbert-imdb.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


Downloading:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /home/carson/.cache/huggingface/hub/models--clam004--distilbert-imdb/snapshots/dc2f4fbd94e39ed21acb9551f92981f1e0354738/vocab.txt
loading file tokenizer.json from cache at /home/carson/.cache/huggingface/hub/models--clam004--distilbert-imdb/snapshots/dc2f4fbd94e39ed21acb9551f92981f1e0354738/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/carson/.cache/huggingface/hub/models--clam004--distilbert-imdb/snapshots/dc2f4fbd94e39ed21acb9551f92981f1e0354738/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/carson/.cache/huggingface/hub/models--clam004--distilbert-imdb/snapshots/dc2f4fbd94e39ed21acb9551f92981f1e0354738/tokenizer_config.json
