In [None]:
!pip install transformers datasets

In [None]:
import numpy as np
import torch
from datasets import load_dataset

In [None]:
# Load the "sst2" configuration of the GLUE benchmark
raw_datasets = load_dataset("glue", "sst2")

In [None]:
raw_datasets
# container holding the dataset objects for train, validation and test

In [None]:
raw_datasets['train']
# select one of the datasets (here: the train split) by name

In [None]:
dir(raw_datasets['train'])
# attributes and methods of the dataset object

In [None]:
# concrete class of a single split
type(raw_datasets['train'])

In [None]:
raw_datasets['train'].data
# shows the type of each column together with example values

In [None]:
raw_datasets['train'][0]
# indexing with a single integer returns one example (sentence, label)

In [None]:
raw_datasets['train'][50000:50003]
# indexing with a range of indices returns a dictionary of lists

In [None]:
raw_datasets['train'].features
# column schema, including the names of the labels

In [None]:
from transformers import AutoTokenizer

In [None]:
# Name of the pretrained checkpoint to fine-tune;
# "bert-base-uncased" is a possible alternative.
checkpoint = "distilbert-base-uncased"
# Load the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from pprint import pprint

# Tokenize the first three training sentences and pretty-print the result
# to inspect what the tokenizer returns (token ids and attention mask).
tokenized_sentences = tokenizer(raw_datasets['train'][:3]['sentence'])
pprint(tokenized_sentences)

In [None]:
# Tokenization function to be mapped over the full dataset (with truncation)
def tokenize_fn(batch):
    """Tokenize the 'sentence' entry of a batch with truncation enabled.

    Args:
        batch: mapping with a 'sentence' entry holding the text(s) to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those sentences.
    """
    sentences = batch['sentence']
    return tokenizer(sentences, truncation=True)

In [None]:
# Apply tokenize_fn to every split; batched=True passes batches of examples
# to the function instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

In [None]:
from transformers import TrainingArguments

# A single epoch is used because the model begins to overfit after just a
# few epochs.
# Checkpointing is switched from the default step-based schedule (which saves
# more often than needed here) to once per epoch, and evaluation, which is
# off by default, is run at the end of each epoch.
# NOTE(review): newer transformers releases name the evaluation argument
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='my_trainer',
    num_train_epochs=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the pretrained checkpoint as a sequence-classification model with
# two output labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# concrete class of the loaded model
type(model) 

In [None]:
model 
# displays a summary of the model: its layers with their input and output dimensions

In [None]:
!pip install torchinfo

In [None]:
from torchinfo import summary 
# An input shape has to be specified for torchinfo to show output shapes, e.g.:
# summary(model, input_size=(16, 512), dtypes=['torch.IntTensor'], device = 'cpu')
summary(model)
# prints the parameters of each layer

In [None]:
# Snapshot every parameter before training so that, after training, we can
# verify that all weights were updated by comparing against these old values.
#
# `.numpy()` on a tensor that already lives on the CPU returns an array that
# shares memory with the live parameter (and `.cpu()` does not copy in that
# case). Without an explicit `.copy()` the snapshot would be modified in place
# by the optimizer when training runs on CPU, and the later comparison would
# wrongly report that nothing changed.
params_before = [
    p.detach().cpu().numpy().copy() for _, p in model.named_parameters()
]

In [None]:
from transformers import Trainer
from datasets import load_metric

In [None]:
# Load the metric for GLUE / SST-2.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
metric = load_metric('glue', 'sst2')
# Sanity check on a toy example: two of the three predictions match their reference
metric.compute(predictions = [1, 0, 1], references = [1, 0, 0])

In [None]:
def compute_metrics(logits_and_labels):
    """Score model outputs against the reference labels.

    Args:
        logits_and_labels: two-item iterable that unpacks into (logits, labels).

    Returns:
        The result of the module-level `metric.compute` for the predicted
        classes against the labels.
    """
    scores, references = logits_and_labels
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Wire the model, the training configuration, the tokenized train/validation
# splits, the tokenizer and the metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration given to the Trainer
trainer.train()

In [None]:
# Save the fine-tuned model to the 'my_saved_model' directory
trainer.save_model('my_saved_model')
# 15 mins  (NOTE(review): presumably the duration of the training run — confirm)

In [None]:
!ls

In [None]:
!ls my_saved_model

In [None]:
from transformers import pipeline 

In [None]:
# Build a text-classification pipeline from the saved model directory.
# Use the first GPU when CUDA is available and fall back to the CPU (-1)
# otherwise, so this cell also runs on machines without a GPU instead of
# failing on the hard-coded device index.
newmodel = pipeline(
    'text-classification',
    model="my_saved_model",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Try the pipeline on a clearly positive sentence
newmodel("This movie is great!")

In [None]:
# Try the pipeline on a clearly negative sentence
newmodel('This movie sucks!')

In [None]:
!cat my_saved_model/config.json
# the saved config does not contain information about the label names

In [None]:
import json

In [None]:
# Add human-readable label names to the saved model's config file so that a
# pipeline loaded from this directory can report 'negative' / 'positive'.
config_path = 'my_saved_model/config.json'
with open(config_path) as f:
    # json.load parses from an open file object; json.loads only accepts
    # str/bytes and raises TypeError when handed the file handle.
    j = json.load(f)

j['id2label'] = {0: 'negative', 1: 'positive'}

# Write the updated configuration back in place.
with open(config_path, 'w') as f:
    json.dump(j, f, indent=2)

!cat my_saved_model/config.json

In [None]:
# Reload the model so the pipeline picks up the edited config.
# Use the first GPU when CUDA is available and fall back to the CPU (-1)
# otherwise, so this cell also runs on machines without a GPU instead of
# failing on the hard-coded device index.
newmodel = pipeline(
    'text-classification',
    model='my_saved_model',
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Run the reloaded pipeline on a sample sentence
newmodel("This movie is great")

In [None]:
# Collect the current value of every model parameter as a NumPy array.
params_after = [
    weights.detach().cpu().numpy() for _, weights in model.named_parameters()
]

In [None]:
# Validate that the parameters changed: for each parameter tensor, print the
# summed absolute difference between the two snapshots (non-zero = changed).
for before, after in zip(params_before, params_after):
    print(np.abs(before - after).sum())