<a href="https://colab.research.google.com/github/choprahetarth/Hands-on-NLP/blob/main/Pytorch_HF_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import pipeline
import torch
import torch.nn.functional as F

## pipeline method

In [None]:
classifier = pipeline("sentiment-analysis") # great and easy way to generate the model inference

In [None]:
results = classifier(["We are very happy to show you the transformers library",
                  "We hope you don't hate it"])

In [None]:
for result in results:
  print(result)

## use a concrete model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification 

In [None]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

results = classifier(["We are very happy to show you the transformers library",
                  "We hope you don't hate it"])

In [None]:
for result in results:
  print(result)

## use model and tokenizer differently

In [None]:
tokens = tokenizer.tokenize("We are very happy to show you the transformers library")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = tokenizer("We are very happy to show you the transformers library")

print(tokens)
print(token_ids)
print(input_ids) # we can see that it has an additional 101 and 102 tokens which are the Starting and Ending tokens. 

In [None]:
X_train = ["We are very happy to show you the transformers library","We hope you don't hate it"]
batch = tokenizer(X_train, padding = True, truncation = True, max_length = 512, return_tensors="pt")

In [None]:
batch

## Pytorch Classification

In [None]:
with torch.no_grad():
  outputs = model(**batch, labels=torch.tensor([1,0])) ## ** is for unpacking
  print(outputs) ## model here returns sequence classifier output object, due to the nature of the model provided by us to huggingface (AutoModelForSequenceClassification)
  predictions = F.softmax(outputs.logits, dim=1) ## apply softmax on logits
  print(predictions)
  labels = torch.argmax(predictions,dim=1) # get the argmax 
  print(labels)
  labels = [model.config.id2label[label_id] for label_id in  labels.tolist()] # convert the argmax's to label names
  print(labels)

## save model

In [None]:
save_directory = "saved"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

## use different german model

In [None]:
 model_name = "oliverguhr/german-sentiment-bert"

 german_sentences = ["Nett, Sie kennen zu lernen", 
                     "Gibt es ein Restaurant in der Nähe?",
                     "Haben Sie noch Zimmer frei?"]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

batch = tokenizer(german_sentences,
                  max_length=125,
                  truncation=True,
                  padding=True,
                  return_tensors="pt")

In [None]:
with torch.no_grad():
  outputs=model(**batch)

In [None]:
applied_softmax = F.softmax(outputs.logits, dim=1)

In [None]:
label_ids = torch.argmax(applied_softmax,dim=1)

In [None]:
labels = [model.config.id2label[label_id] for label_id in  label_ids.tolist()]

In [None]:
labels

## fine tuning existing model

In [3]:
from pathlib import Path
import os
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

In [4]:
!wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
!tar -xf aclImdb_v1.tar.gz

--2022-12-01 07:52:51--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2022-12-01 07:52:54 (23.5 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]



In [5]:
def read_imdb_split(split_dir):
  split_dir = Path(split_dir)
  texts = []
  labels = []
  for label_dir in ['pos','neg']:
    for text_file in (split_dir/label_dir).iterdir():
      texts.append(text_file.read_text())
      labels.append(0 if label_dir is "neg" else 1)
    
  return texts, labels


# list of samples, list of labels
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2)

In [7]:
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [8]:
# for the fine tuning, the model takes in encodings and not straight text
train_encodings = tokenizer(train_texts, padding=True, max_length=512, truncation=True, return_tensors ='pt')
test_encodings = tokenizer(test_texts, padding=True, max_length=512, truncation=True, return_tensors = 'pt')
val_encodings = tokenizer(val_texts, padding=True, max_length=512, truncation=True, return_tensors = 'pt')

In [9]:
for key, val in test_encodings.items():
  print(key)
  print(val[2]) # basically this tokenizer provides 2 items
  # one is the key, input_id and attention_mask
  # and the corresponding val points to the values of tokenized id and attention mask
  # val[1], val[2], val[3]... correpsond to different samples of the dataset

input_ids
tensor([  101,  2034,  1010,  1996,  3893,  2015,  1024,  2019,  6581,  3105,
         2012, 10775,  3923, 12793,  2000,  4848,  1996,  6888,  1997,  1996,
         2143,  1012,  2070,  1997,  1996,  7171,  2071,  2022,  5265,  2011,
         2139,  9610,  7277,  2080,  1012,  8234,  7871, 10207,  1010,  3376,
         1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
         4997,  2015,  1024,  1996,  3441,  2024,  2524,  2000,  2903,  1012,
         4895, 22852,  1010,  4895,  2072,  1011,  8789,  3494,  3653,  2368,
         1998, 16819,  2531,  1003,  1997,  1996,  2051,  1010,  2004,  2065,
         2027,  2020,  1999,  2070,  2785,  1997,  4937, 17122,  1012,  2023,
         2003,  4445,  1996,  4980,  3490,  1997,  2010,  3041,  1010,  2172,
         2488,  5691,  4496,  1996, 19181, 13375,  2057,  1005,  2310,  2035,
         2272,  2000,  2113,  1998,  9120,  1012, 15451, 28195,  2003,  9987,
        20220,  1999,  2023,  3185,  1012,   102,     

In [10]:
# the trainer module takes in a torch dataset

class IMDbDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings  = encodings
    self.labels = labels
  
  def __getitem__(self,idx):
    item = {key: val[idx] for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

In [11]:
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir = './results', # output directory
    num_train_epochs = 3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained(model_name)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Step,Training Loss
10,0.6982
20,0.6989
30,0.695
40,0.6923
50,0.6938
60,0.6845
70,0.6737
80,0.6557
90,0.609
100,0.535


KeyboardInterrupt: ignored