In [1]:
# run this cell, then restart the runtime before continuing
# !pip install git+https://github.com/joeddav/transformers.git@data-collator-type-fix
# !pip install git+https://github.com/huggingface/transformers.git
# !pip install git+https://github.com/huggingface/nlp.git
!pip install transformers
!pip install nlp



In [2]:
!rm -rf logs
!rm -rf results

In [3]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from nlp import load_dataset
import torch
import random
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [4]:
def compute_metrics(pred):
    """Score model predictions against the reference labels.

    The predicted class of each example is the argmax over the last axis of
    ``pred.predictions``; precision, recall and F1 use binary averaging.

    Arg:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores).

    Returns:
        A dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element of the result (support) is not reported.
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    scores["f1"] = f_score
    scores["precision"] = prec
    scores["recall"] = rec
    return scores

def tokenize(batch):
    """Tokenize a batch of data, padded and truncated to a fixed length.

    Uses the module-level ``tokenizer``. Every sequence is padded to the
    tokenizer's maximum length instead of to the longest text in ``batch``.
    ``Dataset.map(..., batched=True)`` calls this function once per chunk of
    rows, so padding to the chunk-local longest text would make the tensor
    width depend on which texts share a chunk; rows from different chunks
    could then not be stacked into one training batch.

    Arg:
        batch: A batch of training data; its "text" entry holds the raw texts.

    Returns:
        Whatever the tokenizer returns for the batch of texts.
    """
    return tokenizer(batch["text"], padding="max_length", truncation=True)

In [5]:
# Short name -> checkpoint identifier used to load the classification model
models = dict(
    distilbert="distilbert-base-uncased",
    distilroberta="distilroberta-base",
)

# Short name -> checkpoint identifier used to load the tokenizer
tokenizers = dict(
    distilbert="bert-base-uncased",
    distilroberta="roberta-base",
)

In [6]:
# Resolve the checkpoints registered under the "distilroberta" key, then load
# the sequence-classification model and the fast tokenizer from them
model_checkpoint = models["distilroberta"]
tokenizer_checkpoint = tokenizers["distilroberta"]

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, use_fast=True)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

In [7]:
# # Load train and test datasets
# imdb_train = load_dataset("imdb", split="train")
# imdb_test = load_dataset("imdb", split="test")

# # Randomly choose train and validation indices
# train_indices, val_indices = train_test_split(
#     range(len(imdb_train)), test_size=0.2, train_size=0.8, random_state=42
# )

# # Split train and validation data
# train_dataset = imdb_train.select(indices=train_indices)
# val_dataset = imdb_train.select(indices=val_indices)
# test_dataset = imdb_test

# # Preprocess
# train_dataset = train_dataset.map(tokenize, batched=True)
# val_dataset = val_dataset.map(tokenize, batched=True)
# test_dataset = test_dataset.map(tokenize, batched=True)
# train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [8]:
# Load train and test datasets
yelp_train = load_dataset("yelp_polarity", split="train")
yelp_test = load_dataset("yelp_polarity", split="test")

# random.seed() returns None, so passing its result as random_state asks
# scikit-learn for an unseeded shuffle. Seed Python's RNG on its own line and
# give train_test_split an explicit integer so the split is reproducible.
random.seed(42)

# Randomly choose train and validation indices (80% / 20% of the training split)
train_indices, val_indices = train_test_split(
    range(len(yelp_train)), test_size=0.2, train_size=0.8, random_state=42
)

# Split train and validation data
train_dataset = yelp_train.select(indices=train_indices)
val_dataset = yelp_train.select(indices=val_indices)
test_dataset = yelp_test

# Preprocess: tokenize each split in batches, then expose only the model
# inputs and the label column as torch tensors
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

100%|██████████| 448000/448000 [00:08<00:00, 51945.52it/s]
100%|██████████| 112000/112000 [00:02<00:00, 52592.32it/s]
100%|██████████| 448/448 [02:11<00:00,  3.40it/s]
100%|██████████| 112/112 [00:32<00:00,  3.40it/s]


In [9]:
# Hyperparameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Amount of training and batch sizes
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    # Optimizer settings
    learning_rate=5e-05,
    weight_decay=0.0,
    adam_epsilon=1e-08,
    max_grad_norm=1.0,
    warmup_steps=0,
    # Evaluate and checkpoint every 10000 steps while training
    evaluate_during_training=True,
    eval_steps=10000,
    save_steps=10000,
    # Reproducibility
    seed=42,
)

# Bundle the model, data splits and metric function into a trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [10]:
# Display every training argument, including the ones left at their defaults
vars(training_args)

{'__cached__setup_devices': (device(type='cuda', index=0), 1),
 'adam_epsilon': 1e-08,
 'dataloader_drop_last': False,
 'debug': False,
 'do_eval': False,
 'do_predict': False,
 'do_train': False,
 'eval_steps': 10000,
 'evaluate_during_training': True,
 'fp16': False,
 'fp16_opt_level': 'O1',
 'gradient_accumulation_steps': 1,
 'learning_rate': 5e-05,
 'local_rank': -1,
 'logging_dir': './logs',
 'logging_first_step': False,
 'logging_steps': 500,
 'max_grad_norm': 1.0,
 'max_steps': -1,
 'no_cuda': False,
 'num_train_epochs': 1,
 'output_dir': './results',
 'overwrite_output_dir': False,
 'past_index': -1,
 'per_device_eval_batch_size': 32,
 'per_device_train_batch_size': 32,
 'per_gpu_eval_batch_size': None,
 'per_gpu_train_batch_size': None,
 'save_steps': 10000,
 'save_total_limit': None,
 'seed': 42,
 'tpu_metrics_debug': False,
 'tpu_num_cores': None,
 'warmup_steps': 0,
 'weight_decay': 0.0}

In [None]:
# Fine-tune the model: run the training loop configured on `trainer`.
# The call is the last expression of the cell, so its return value is displayed.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=14000.0, style=ProgressStyle(description_…

In [None]:
# Evaluate the model on the training set.
# The result is kept in `train_score`; its "eval_f1" and "eval_accuracy"
# entries are printed in the summary cell further down.
train_score = trainer.evaluate(eval_dataset=train_dataset)

In [None]:
# Evaluate the model on the validation set.
# The result is kept in `val_score`; its "eval_f1" and "eval_accuracy"
# entries are printed in the summary cell further down.
val_score = trainer.evaluate(eval_dataset=val_dataset)

In [None]:
# Evaluate the model on the held-out test set.
# The result is kept in `test_score`; its "eval_f1" and "eval_accuracy"
# entries are printed in the summary cell further down.
test_score = trainer.evaluate(eval_dataset=test_dataset)

In [None]:
# Report F1 and accuracy (rounded to 4 decimals) for each split, in the order
# train, validation, test
for split_name, split_score in (
    ("train", train_score),
    ("val", val_score),
    ("test", test_score),
):
    print(f"{split_name}_f1:", round(split_score["eval_f1"], 4))
    print(f"{split_name}_acc:", round(split_score["eval_accuracy"], 4))

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir logs

In [None]:
# trainer.model.save_pretrained("/content/drive/My Drive/models/distilroberta")
# tokenizer.save_pretrained("/content/drive/My Drive/models/distilroberta")

In [None]:
# from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# # Sentiment analysis pipeline
# model = AutoModelForSequenceClassification.from_pretrained("/content/drive/My Drive/models/distilroberta")
# tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/models/distilroberta", use_fast=True)
# ppl = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)