In [None]:
! pip install -U accelerate
! pip install -U transformers

In [None]:
!pip install datasets

In [4]:
import numpy as np
import torch
from datasets import load_dataset

In [None]:
# Load the 'sst2' configuration of the 'glue' dataset.
raw_datasets = load_dataset('glue', 'sst2')

In [None]:
# Display the loaded object.
raw_datasets

In [None]:
# Look at the training split on its own.
raw_datasets['train']

In [None]:
# List the attributes and methods available on the training split.
dir(raw_datasets['train'])

In [None]:
# Check the concrete class of the training split.
type(raw_datasets['train'])

In [None]:
# Inspect the `data` attribute of the training split.
raw_datasets['train'].data

In [None]:
# Integer indexing: fetch the first training example.
raw_datasets['train'][0]

In [None]:
# Slice indexing: fetch the three examples at positions 50,000-50,002.
raw_datasets['train'][50_000:50_003]


In [None]:
# Show the `features` attribute of the training split
# (presumably its column names and types — confirm from the output).
raw_datasets['train'].features


In [14]:
from transformers import AutoTokenizer

In [None]:
# Checkpoint name; it is reused below for both the tokenizer and the model,
# so the two always come from the same pretrained checkpoint.
# checkpoint = "bert-base-uncased"
checkpoint = "distilbert-base-uncased"
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from pprint import pprint

# Tokenize the first three training sentences and pretty-print the result.
tokenized_sentences = tokenizer(raw_datasets['train'][:3]['sentence'])
pprint(tokenized_sentences)

In [17]:
def tokenize_fn(batch):
    """Tokenize the ``'sentence'`` entry of ``batch`` with truncation enabled.

    Args:
        batch: Mapping that is indexed with the key ``'sentence'``; the value
            is handed to the module-level ``tokenizer`` as-is.

    Returns:
        Whatever ``tokenizer`` returns for those sentences.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [None]:
# Apply tokenize_fn across the dataset; batched=True so it is called on
# batches of examples rather than one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

In [19]:
from transformers import TrainingArguments

In [20]:
# Training configuration: output goes under 'my_trainer', evaluation and
# checkpoint saving both use the 'epoch' strategy, and training runs for
# a single epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='my_trainer',
    num_train_epochs=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

In [21]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the pretrained checkpoint as a sequence-classification model with two labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

In [None]:
# Check which concrete model class was returned.
type(model)


In [None]:
# Display the model's repr.
model


In [None]:
!pip install torchinfo

In [None]:
from torchinfo import summary
# Alternative call that also specifies an input size, dtypes and device:
# summary(model, input_size=(16,512), dtypes=['torch.IntTensor'], device='cpu')
# Summarize the model without supplying an input size.
summary(model)

In [27]:

# Snapshot every parameter *by value* before training.
# `.cpu()` is a no-op for a tensor that already lives on the CPU, and
# `.numpy()` returns an array that shares memory with that tensor. Without
# the explicit `.copy()`, a CPU training run would update these arrays in
# place and the before/after comparison at the end would report zero change.
params_before = [
    p.detach().cpu().numpy().copy()
    for _, p in model.named_parameters()
]

In [None]:
# Display the list of parameter arrays captured before training.
params_before

In [29]:
from transformers import Trainer
from datasets import load_metric


In [None]:
# Load the metric for the 'sst2' configuration of 'glue'.
metric = load_metric("glue", "sst2")

# can also load specific metrics
# metric = load_metric("f1")
# metric = load_metric("bleu")

In [None]:
# Sanity-check the metric on a tiny hand-made example in which two of the
# three predictions match their references.
metric.compute(predictions=[1, 0, 1], references=[1, 0, 0])


In [32]:
def compute_metrics(logits_and_labels):
    """Score model outputs with the GLUE SST-2 metric.

    Args:
        logits_and_labels: An iterable that unpacks into exactly two values,
            ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions
        against ``labels``.
    """
    # The metric is loaded on every call; it could also be loaded once outside.
    sst2_metric = load_metric("glue", "sst2")
    scores, references = logits_and_labels
    # The predicted class is the index of the largest score along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return sst2_metric.compute(
        predictions=predicted_classes,
        references=references,
    )

In [33]:
# Bundle the model, training configuration, tokenized train/validation
# splits, tokenizer and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the Trainer built in the previous cell.
trainer.train()


In [None]:
# Save the trained model under the 'my_saved_model' directory.
trainer.save_model('my_saved_model')


In [None]:
# List the working directory.
!ls

In [None]:
# List the contents of the my_saved_model directory.
!ls my_saved_model

In [None]:
from transformers import pipeline

In [None]:
# Build a text-classification pipeline from the saved model directory.
# Use the first CUDA GPU when one is available; otherwise fall back to the
# CPU (device=-1) instead of failing on machines without a GPU.
new_model = pipeline(
    'text-classification',
    model='my_saved_model',
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Try the saved model on a positive-sounding sentence.
new_model("This movie is great!")

In [None]:
# ...and on a negative-sounding one.
new_model("This movie is sucks")

In [None]:
# Print the saved model's configuration file.
!cat my_saved_model/config.json

In [None]:
import json

In [None]:

# Rewrite the label names stored in the saved model's config file
# (presumably so that a pipeline reloaded from it reports readable labels).
config_path = "my_saved_model/config.json"
label_names = {0: 'negative', 1: 'positive'}

with open(config_path, encoding="utf-8") as f:
    j = json.load(f)

# json.dump writes the integer keys as strings ("0", "1").
j['id2label'] = label_names
# Keep the reverse mapping in sync so the config does not retain stale
# label names that disagree with id2label.
j['label2id'] = {label: idx for idx, label in label_names.items()}

with open(config_path, 'w', encoding="utf-8") as f:
    json.dump(j, f, indent=2)


In [None]:
# Print the config file again to check the rewritten label mapping.
!cat my_saved_model/config.json

In [None]:
# Rebuild the pipeline so it reads the config file from disk again.
# Use the first CUDA GPU when one is available; otherwise fall back to the
# CPU (device=-1) instead of failing on machines without a GPU.
new_model = pipeline(
    'text-classification',
    model='my_saved_model',
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Re-run the positive-sounding example with the rebuilt pipeline.
new_model("This movie is great!")


In [None]:
# Re-run the negative-sounding example with the rebuilt pipeline.
new_model("This movie is sucks")

In [None]:
# Collect every model parameter as a NumPy array (detached, moved to the CPU).
params_after = [
    weight.detach().cpu().numpy()
    for _, weight in model.named_parameters()
]

In [None]:
# For each pair of snapshots, print the summed absolute difference between
# the before and after arrays.
for before, after in zip(params_before, params_after):
    abs_change = np.abs(before - after)
    print(np.sum(abs_change))