In [1]:
import gzip
import shutil
import time


import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification


Specify general settings


In [2]:
# Reproducibility: fix the PyTorch seed and set cuDNN's deterministic flag.
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Number of passes over the training set in the manual training loop.
NUM_EPOCHS = 3

In [3]:
# Location of the gzip-compressed movie review CSV.
url = ("https://github.com/rasbt/"
       "machine-learning-book/raw/"
       "main/ch08/movie_data.csv.gz")
datadir = "DataFiles"  # NOTE(review): not used by the download below - confirm intent

# Local file names are derived from the URL: "<name>.csv.gz" and "<name>.csv".
filename = url.split("/")[-1]
csv_filename = filename[:-len(".gz")]

# Fetch before opening the output file and fail loudly on an HTTP error, so a
# failed request never leaves an empty file or an error page on disk for gzip.
r = requests.get(url, timeout=60)
r.raise_for_status()

with open(filename, "wb") as f:
    f.write(r.content)

# Decompress the archive into the working directory.
with gzip.open(filename, 'rb') as f_in:
    with open(csv_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
        

In [3]:
# Load the decompressed reviews into a DataFrame and preview the first rows.
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [5]:
# Display the frame; the recorded output is a truncated view with an elided middle.
df

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0
...,...,...
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0


In [4]:
# Positional split of the rows: the first 35,000 for training, the next 5,000
# for validation, and everything from row 40,000 onward for testing.
train_df = df.iloc[:35000]
valid_df = df.iloc[35000:40000]
test_df = df.iloc[40000:]

# Each split is reduced to two arrays: the review texts and their labels.
train_texts, train_labels = train_df['review'].values, train_df['sentiment'].values
valid_texts, valid_labels = valid_df['review'].values, valid_df['sentiment'].values
test_texts, test_labels = test_df['review'].values, test_df['sentiment'].values

In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the three splits in order with identical settings
# (truncation and padding both enabled).
train_encodings, valid_encodings, test_encodings = (
    tokenizer(list(texts), truncation=True, padding=True)
    for texts in (train_texts, valid_texts, test_texts)
)
                           

In [6]:
class IMDbDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable sequence holding
            one entry per example.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

my_batch_size = 8

# Wrap each split's encodings and labels in a Dataset.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = IMDbDataset(encodings=valid_encodings, labels=valid_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=my_batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=my_batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=my_batch_size, shuffle=False)


In [9]:
# Load the pretrained DistilBERT checkpoint into a sequence-classification
# model, move it to DEVICE and put it in training mode.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model = model.to(DEVICE).train()

# Adam over all model parameters, used by the manual training loop.
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [13]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of `model` on `data_loader`, in percent.

    Gradient tracking is disabled while the batches are processed. The
    function does not change the model's train/eval mode, so callers should
    call `model.eval()` beforehand.

    Args:
        model: Callable accepting `input_ids` and an `attention_mask` keyword
            and returning a mapping with a 'logits' entry.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor with the percentage of correctly predicted labels.

    Raises:
        ValueError: If `data_loader` yields no examples.
    """
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch in data_loader:
            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            # The predicted class is the index of the largest logit per example.
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

    # Without this guard an empty loader fails on `int.float()` below.
    if num_examples == 0:
        raise ValueError('data_loader yielded no examples; accuracy is undefined')
    return correct_pred.float() / num_examples * 100
            

In [14]:
# Manual fine-tuning loop: NUM_EPOCHS passes over train_loader, with training
# and validation accuracy reported after every epoch and test accuracy at the end.
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    # Back to training mode; the end of the previous epoch switched to eval mode.
    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward pass
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss, logits = outputs['loss'], outputs['logits']

        ### Backward pass
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging (every 250th batch)
        if not batch_idx % 250:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d}'
                  f' | Batch '
                  f'{batch_idx:04d}/'
                  f'{len(train_loader):04d} | '
                  f'Loss: {loss:.4f}')

    model.eval()

    with torch.set_grad_enabled(False):
        print(f'Training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nValid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')

# Set eval mode explicitly so the test evaluation does not depend on the loop
# above having run at least once (e.g. NUM_EPOCHS == 0).
model.eval()
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')



Epoch: 0001/0003 | Batch0000/4375 | Loss: 0.0367
Epoch: 0001/0003 | Batch0250/4375 | Loss: 0.0067
Epoch: 0001/0003 | Batch0500/4375 | Loss: 0.0678
Epoch: 0001/0003 | Batch0750/4375 | Loss: 0.0595
Epoch: 0001/0003 | Batch1000/4375 | Loss: 0.4246
Epoch: 0001/0003 | Batch1250/4375 | Loss: 0.2028
Epoch: 0001/0003 | Batch1500/4375 | Loss: 0.0198
Epoch: 0001/0003 | Batch1750/4375 | Loss: 0.0220
Epoch: 0001/0003 | Batch2000/4375 | Loss: 0.1122
Epoch: 0001/0003 | Batch2250/4375 | Loss: 0.1124
Epoch: 0001/0003 | Batch2500/4375 | Loss: 0.2008
Epoch: 0001/0003 | Batch2750/4375 | Loss: 0.0463
Epoch: 0001/0003 | Batch3000/4375 | Loss: 0.0373
Epoch: 0001/0003 | Batch3250/4375 | Loss: 0.0057
Epoch: 0001/0003 | Batch3500/4375 | Loss: 0.0906
Epoch: 0001/0003 | Batch3750/4375 | Loss: 0.1155
Epoch: 0001/0003 | Batch4000/4375 | Loss: 0.0285
Epoch: 0001/0003 | Batch4250/4375 | Loss: 0.0076
Training accuracy: 98.44%
Valid accuracy: 92.22%
Time elapsed: 62.44 min
Epoch: 0002/0003 | Batch0000/4375 | Loss: 0.0

#### Results
On my home PC (*RTX 1060 6GB*, batch_size **8**):

Training accuracy: 99.47%
Valid accuracy: 91.36%
Time elapsed: 186.10 min
Total Training Time: 186.10 min
Test accuracy" 91.35%

## Use the Trainer API


In [7]:
# Re-load the pretrained checkpoint and build a new optimizer for the
# Trainer-based run below.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model = model.to(DEVICE).train()

optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

In [8]:
from transformers import Trainer, TrainingArguments

# Reuse the notebook-level settings (NUM_EPOCHS, my_batch_size) so the Trainer
# run stays comparable with the manual training loop when either is changed.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=my_batch_size,
    per_device_eval_batch_size=my_batch_size,
    logging_dir='./logs',
    logging_steps=10,
)

In [9]:
# `datasets.load_metric` is deprecated; the `evaluate` package is used instead.
import numpy as np
import evaluate

# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")



In [15]:
def compute_metrics(eval_pred):
    """Compute the accuracy metric for a `(logits, labels)` evaluation pair.

    The logits are reduced to class predictions with an argmax over the last
    axis, then scored against `labels` with the module-level `metric`.
    """
    scores, references = eval_pred
    # note: the scores arrive as a numpy array, not a pytorch tensor
    predicted_classes = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted_classes,
                            references=references)
    return result

# The optimizers tuple is (optimizer, learning-rate scheduler); only the
# optimizer is supplied here.
trainer = Trainer(
    model=model,
    args=training_args,
    optimizers=(optim, None),
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [16]:
# Start the wall-clock timer that a later cell reads, then run the Trainer.
# train() is the cell's last expression, so its return value is displayed.
start_time = time.time()
trainer.train()

Step,Training Loss
10,0.3383
20,0.3028
30,0.1993
40,0.0097
50,0.087
60,0.5093
70,0.2519
80,0.2235
90,0.0902
100,0.2154


TrainOutput(global_step=13125, training_loss=0.17670628249521056, metrics={'train_runtime': 7979.7797, 'train_samples_per_second': 13.158, 'train_steps_per_second': 1.645, 'total_flos': 1.390907685888e+16, 'train_loss': 0.17670628249521056, 'epoch': 3.0})

In [17]:
# Minutes elapsed since start_time was set in the training cell.
elapsed_min = (time.time() - start_time) / 60
print(f'Total Training Time: {elapsed_min:.2f} min')

# Evaluate on the Trainer's eval_dataset and print the resulting metrics dict.
eval_results = trainer.evaluate()
print(eval_results)


Total Training Time: 133.00 min


{'eval_loss': 0.3392660617828369, 'eval_accuracy': 0.9359, 'eval_runtime': 292.6901, 'eval_samples_per_second': 34.166, 'eval_steps_per_second': 4.271, 'epoch': 3.0}
