Train an IMDb neg/pos classifier with DistilBERT

https://stackoverflow.com/questions/69087044/early-stopping-in-bert-trainer-instances

In [1]:
import os
print('os.getcwd()', os.getcwd())
import sys
sys.path.insert(1, '../')
print(sys.version)
import time

#plotting tools
from matplotlib import pyplot as plt 
from tqdm.notebook import tqdm as tqdm

#torch libs
import torch
print('torch.__version__', torch.__version__)
# Number of CUDA devices visible to this process (0 on a CPU-only machine).
NUM_GPUS = torch.cuda.device_count()

# Handle to the highest-indexed GPU. It is explicitly None when no GPU is
# present so the print below (and any later use) does not raise NameError.
last_gpu = torch.device(f'cuda:{NUM_GPUS - 1}') if NUM_GPUS > 0 else None

print('NUM_GPUS', NUM_GPUS, 'last_gpu', last_gpu)

# `device` is the default torch device for models/tensors; `pipe_device` is
# the integer form of the same choice (0 when CUDA is available, -1 otherwise)
# and is passed later as the `device` argument of `pipeline(...)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe_device = 0 if torch.cuda.is_available() else -1
print(pipe_device, device)

#huggingface transformers
import transformers
print('transformers.__version__',transformers.__version__)
from transformers import AutoTokenizer, pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2PreTrainedModel

from datasets import load_dataset

#jupyter stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

os.getcwd() /home/carson/rlhf/notebooks
3.8.10 (default, Nov 14 2022, 12:59:47) 
[GCC 9.4.0]
torch.__version__ 1.11.0+cu113
NUM_GPUS 2 last_gpu cuda:1
0 cuda
transformers.__version__ 4.25.1


In [2]:
imdb_dataset = load_dataset("imdb")

Found cached dataset imdb (/home/carson/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
print(imdb_dataset)
print('')
print(imdb_dataset['train'].features)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [4]:
imdb_dataset['train']['text'][0][:100]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it w'

In [5]:
imdb_dataset['train']['label'][::1000]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Load a pretrained DistilBERT

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint identifier shared by the model and the tokenizer below.
model_name = "distilbert-base-uncased"

# Load the checkpoint into a sequence-classification model. The load log
# printed under this cell reports the classifier weights as newly
# initialized, i.e. they are not taken from the checkpoint.
# NOTE(review): no label names are passed here, so the model config
# presumably keeps the library's default label names rather than 'neg'/'pos'
# -- confirm if downstream code reads the label strings.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Preprocess Data

In [8]:
def tokenize(examples):
    """Run the module-level ``tokenizer`` over ``examples['text']``.

    Called through ``Dataset.map(..., batched=True)``, so ``examples['text']``
    is handed to the tokenizer as-is, with ``truncation=True``. The
    tokenizer's output is returned unchanged.
    """
    return tokenizer(examples['text'], truncation=True)

tokenized_dataset = imdb_dataset.map(tokenize, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [9]:
print(tokenized_dataset)
print("")
print(tokenized_dataset['train']['text'][0][:100])
print("")
print(tokenized_dataset['train']['input_ids'][0][-10:])

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it w

[2987, 1005, 1056, 2031, 2172, 1997, 1037, 5436, 1012, 102]


Prepare Trainer and Evaluation Metric

In [10]:
from datasets import load_metric
import numpy as np
from sklearn.metrics import f1_score

def compute_accuracy_metrics(eval_preds):
    """Return the classification accuracy for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class for each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of predicted
        classes that equal the corresponding label, as a Python float.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy instead of calling load_metric("accuracy")
    # on every evaluation: that re-loaded the metric script each time and
    # relies on an API that `datasets` has deprecated.
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: A ``(scores, labels)`` pair. The predicted class for each example
            is the argmax of ``scores`` over axis 1.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``, each holding the value returned by the scikit-learn
        function of the same name.
    """
    # Imported here because only f1_score is imported at cell level; the
    # other three names were undefined and raised NameError on the first
    # evaluation.
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
    )

    scores, labels = p
    pred = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        "precision": precision_score(y_true=labels, y_pred=pred),
        "recall": recall_score(y_true=labels, y_pred=pred),
        "f1": f1_score(y_true=labels, y_pred=pred),
    }

In [12]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import EarlyStoppingCallback, IntervalStrategy

# Environment flag read by the Hugging Face tokenizers library; set before
# the Trainer starts so the choice is explicit.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Per-device batch size, used for both training and evaluation below.
batch_size = 16

training_args = TrainingArguments(
    output_dir="large_data/distilbert-imdb",
    evaluation_strategy=IntervalStrategy.STEPS,  # alternatives: "steps", "epoch"
    # Evaluate every 50 steps. This does not control checkpointing: save_steps
    # is not set here, so checkpoints follow the library default interval.
    eval_steps=50,
    save_total_limit=1,  # limit on the number of checkpoints kept on disk
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
    # Metric used to pick the best checkpoint; "f1" is one of the keys
    # returned by compute_metrics.
    metric_for_best_model='f1',
    load_best_model_at_end=True,
)

# Batch collator built from the tokenizer. tokenize() truncates but does not
# pad, so padding is left to this collator when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer)


# Assemble the Trainer from the model, tokenizer, collator, training
# arguments, datasets, metric function and an early-stopping callback.
# NOTE(review): eval_dataset is the "test" split and early stopping is driven
# by evaluation results, so the test split influences when training stops and
# which checkpoint is kept. Consider carving a validation split out of
# "train" and keeping "test" for the final report.
# NOTE(review): early_stopping_patience=2 presumably means stopping after two
# evaluations in a row without improvement -- confirm against the callback
# documentation.
trainer = Trainer(
    model=model, 
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"], 
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
/home/carson/rlhf/notebooks/large_data/distilbert-imdb is already a clone of https://huggingface.co/clam004/distilbert-imdb. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3910
  Number of trainable parameters = 66955010
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


In [None]:
trainer.push_to_hub()

expected output

```
To https://huggingface.co/clam004/distilbert-imdb

'https://huggingface.co/clam004/distilbert-imdb/commit/29940673ff3a7a22ab3607152dbaa821f1d10a4e'
```

In [None]:
# load the model you just pushed 
sentiment_pipe = pipeline("sentiment-analysis","clam004/distilbert-imdb", device=pipe_device)