Check the GPU and Memory

In [8]:
# Print the GPU model, driver/CUDA versions, and current GPU memory usage.
# NOTE(review): the absolute /opt/bin path looks environment-specific — confirm
# it exists before running this notebook on a different machine.
!/opt/bin/nvidia-smi

Fri Jul 10 09:58:22 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   65C    P0    40W / 250W |  15071MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

Install the packages we need

In [10]:
# run this cell, then restart the runtime before continuing
!pip install git+https://github.com/joeddav/transformers.git@data-collator-type-fix
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/nlp.git
!pip install transformers
!pip install nlp

Collecting git+https://github.com/joeddav/transformers.git@data-collator-type-fix
  Cloning https://github.com/joeddav/transformers.git (to revision data-collator-type-fix) to /tmp/pip-req-build-dbc5s8ai
  Running command git clone -q https://github.com/joeddav/transformers.git /tmp/pip-req-build-dbc5s8ai
  Running command git checkout -b data-collator-type-fix --track origin/data-collator-type-fix
  Switched to a new branch 'data-collator-type-fix'
  Branch 'data-collator-type-fix' set up to track remote branch 'data-collator-type-fix' from 'origin'.
Collecting tokenizers==0.8.0-rc3
[?25l  Downloading https://files.pythonhosted.org/packages/c3/24/fc3b869878ad596d9b3acfea7f1f163958893579f7b0c278cc45eb885dfd/tokenizers-0.8.0rc3-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 4.5MB/s 
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... [?25l[?25hdone
  Created wheel for transformers: filename=t

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-4zsctro5
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-4zsctro5
Collecting tokenizers==0.8.1.rc1
  Using cached https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-3.0.2-cp36-none-any.whl size=780998 sha256=641df285ee0591946343265dc4d56faf5b4c9a578e80064ed10c2897164ab3fd
  Stored in directory: /tmp/pip-ephem-wheel-cache-tvgjhova/wheels/33/eb/3b/4bf5dd835e865e472d4fc0754f35ac0edb08fe852e8f21655f
Successfully built transformers
Installing collected packages: tokenizers, transformers
  Found existing installation: token

Collecting git+https://github.com/huggingface/nlp.git
  Cloning https://github.com/huggingface/nlp.git to /tmp/pip-req-build-huv435fq
  Running command git clone -q https://github.com/huggingface/nlp.git /tmp/pip-req-build-huv435fq
Building wheels for collected packages: nlp
  Building wheel for nlp (setup.py) ... [?25l[?25hdone
  Created wheel for nlp: filename=nlp-0.3.0-cp36-none-any.whl size=111969 sha256=41657b0ff3efcdbd5044fc97805f4d9fccf6513839ce832d1096bf0452f7a03d
  Stored in directory: /tmp/pip-ephem-wheel-cache-4kfgwj3b/wheels/a2/98/bd/15f115cb85f1a049a96b915aa14466d3b51a023cd597e08e0d
Successfully built nlp


Import the packages we are going to use

In [1]:
# Training
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from nlp import load_dataset
import random
import numpy as np

# Evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

Define the two helper functions we are going to use

In [3]:
def compute_metrics(pred):
    """Score predictions with accuracy, F1, precision, and recall.

    Args:
        pred: Prediction object exposing ``label_ids`` (the reference labels)
            and ``predictions`` (scores whose argmax over the last axis is
            taken as the predicted class).

    Returns:
        A dict with the keys "accuracy", "f1", "precision", and "recall".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The helper returns (precision, recall, f1, support); support is unused.
    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }

def tokenize(batch):
    """Tokenize a batch of examples into fixed-length model inputs.

    Every sequence is truncated and padded to the tokenizer's maximum length.
    Padding only to the longest sequence of the current batch
    (``padding=True``) is not safe here: when this function is applied chunk
    by chunk via ``map(..., batched=True)``, different chunks can end up with
    different sequence lengths, and examples of different lengths cannot be
    stacked into a single training batch.

    Args:
        batch: Mapping with a "text" entry holding the raw strings to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(batch["text"], padding="max_length", truncation=True)

Load model and tokenizer

In [4]:
# Load the model and its tokenizer from the SAME checkpoint name, so the
# vocabulary and special tokens are guaranteed to match the weights that are
# being fine-tuned (and so switching checkpoints is a one-line change).
MODEL_NAME = "distilroberta-base"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

Load dataset and preprocess data

In [5]:
# Load train and test datasets
yelp_train = load_dataset("yelp_polarity", split="train")
yelp_test = load_dataset("yelp_polarity", split="test")

# How many leading training examples to use for the train/validation split.
# Set this to len(yelp_train) to use the full training set.
N_TRAIN_SAMPLES = 50000
SPLIT_SEED = 42
MODEL_COLUMNS = ["input_ids", "attention_mask", "label"]

# Randomly choose train and validation indices (80/20).
# random_state must receive the integer seed itself: random.seed(42) returns
# None, and random_state=None leaves the split unseeded and non-reproducible.
train_indices, val_indices = train_test_split(
    range(min(N_TRAIN_SAMPLES, len(yelp_train))),
    test_size=0.2,
    train_size=0.8,
    random_state=SPLIT_SEED,
)

# Split train and validation data
train_dataset = yelp_train.select(indices=train_indices)
val_dataset = yelp_train.select(indices=val_indices)
test_dataset = yelp_test

# Preprocess: tokenize each split in batches
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Expose only the listed columns, formatted as torch tensors
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format("torch", columns=MODEL_COLUMNS)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5787.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3419.0, style=ProgressStyle(description…


Downloading and preparing dataset yelp_polarity/plain_text (download: 158.67 MiB, generated: 421.28 MiB, total: 579.95 MiB) to /root/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=166373201.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset yelp_polarity downloaded and prepared to /root/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0. Subsequent calls will reuse this data.


100%|██████████| 40000/40000 [00:00<00:00, 66209.81it/s]
100%|██████████| 10000/10000 [00:00<00:00, 65801.67it/s]
100%|██████████| 40/40 [00:06<00:00,  5.72it/s]
100%|██████████| 10/10 [00:01<00:00,  6.09it/s]
100%|██████████| 38/38 [00:06<00:00,  5.63it/s]


Define training arguments and trainer

In [6]:
# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Epochs and batch sizes
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    # Optimizer settings
    learning_rate=5e-05,
    adam_epsilon=1e-08,
    weight_decay=0.0,
    warmup_steps=0,
    max_grad_norm=1.0,
    # Evaluation / checkpoint cadence
    # NOTE(review): the recorded run finished at global_step=1250, below both
    # step intervals here — confirm these values are intended.
    evaluate_during_training=True,
    eval_steps=10000,
    save_steps=10000,
    # Reproducibility
    seed=42,
)

# Wire the model, datasets, and metric function into a trainer
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

Fine-tune the DistilRoBERTa model

In [7]:
# Fine-tune the model
# Runs training with the trainer configured earlier in the notebook; the
# returned training summary is displayed as this cell's result.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1250.0, style=ProgressStyle(description_w…





TrainOutput(global_step=1250, training_loss=0.1530088638946414)

In [9]:
# Evaluate the model on training set
# The returned metrics are kept in train_score so a later cell can report
# its "eval_f1" and "eval_accuracy" entries.
train_score = trainer.evaluate(eval_dataset=train_dataset)

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=1250.0, style=ProgressStyle(description_…




In [10]:
# Evaluate the model on validation set
# The returned metrics are kept in val_score so a later cell can report
# its "eval_f1" and "eval_accuracy" entries.
val_score = trainer.evaluate(eval_dataset=val_dataset)

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=313.0, style=ProgressStyle(description_w…




In [11]:
# Report F1 and accuracy for the train and validation splits, rounded to
# four decimal places. Each row is (printed label, metrics dict, metric key).
for label, scores, key in (
    ("train_f1:", train_score, "eval_f1"),
    ("train_acc:", train_score, "eval_accuracy"),
    ("val_f1:", val_score, "eval_f1"),
    ("val_acc:", val_score, "eval_accuracy"),
):
    print(label, round(scores[key], 4))

train_f1: 0.977
train_acc: 0.9787
val_f1: 0.9654
val_acc: 0.9673


In [None]:
# Score the test split, then report F1 and accuracy rounded to four decimals
test_score = trainer.evaluate(eval_dataset=test_dataset)
for label, key in (("test_f1:", "eval_f1"), ("test_acc:", "eval_accuracy")):
    print(label, round(test_score[key], 4))

Save the model

In [None]:
# Save the fine-tuned model and the tokenizer into the same directory
save_dir = "/content/drive/My Drive/models/distilroberta"
trainer.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Set up the pipeline

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
)

# Reload the saved model and tokenizer from disk, then wrap them in a
# sentiment-analysis pipeline.
checkpoint_dir = "/content/drive/My Drive/models/distilroberta"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, use_fast=True)
ppl = pipeline("sentiment-analysis", tokenizer=tokenizer, model=model)