<a href="https://colab.research.google.com/github/choprahetarth/Hands-on-NLP/blob/main/Pytorch_HF_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from transformers import pipeline
import torch
import torch.nn.functional as F

## pipeline method

In [4]:
# Quickest way to get predictions: name only the task. No checkpoint is given,
# so the library falls back to its default model for that task (see the notice
# printed below).
classifier = pipeline(task="sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [5]:
# Score two example sentences in a single call.
sample_sentences = [
    "We are very happy to show you the transformers library",
    "We hope you don't hate it",
]
results = classifier(sample_sentences)

In [6]:
# Each result is a dict holding the predicted label and its score.
for result in results:
    print(result)

{'label': 'POSITIVE', 'score': 0.9998044371604919}
{'label': 'POSITIVE', 'score': 0.831962525844574}


## use a concrete model

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification 

In [8]:
# Checkpoint identifier; this is the same model the default
# sentiment-analysis pipeline reported falling back to earlier.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [9]:
# Load the classification model and the tokenizer from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the pipeline from the explicitly loaded objects instead of relying on
# the task default.
classifier = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

sample_sentences = [
    "We are very happy to show you the transformers library",
    "We hope you don't hate it",
]
results = classifier(sample_sentences)

In [10]:
# Print one prediction dict (label and score) per input sentence.
for result in results:
    print(result)

{'label': 'POSITIVE', 'score': 0.9998044371604919}
{'label': 'POSITIVE', 'score': 0.831962525844574}


## Use the model and tokenizer separately

In [11]:
demo_sentence = "We are very happy to show you the transformers library"

# Step by step: split the text into tokens, then map the tokens to vocabulary ids.
tokens = tokenizer.tokenize(demo_sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Calling the tokenizer directly does both steps at once. Despite its name,
# `input_ids` holds the whole encoding (input ids plus attention mask).
input_ids = tokenizer(demo_sentence)

print(tokens)
print(token_ids)
# Compared with token_ids, the encoding has an extra 101 at the start and an
# extra 102 at the end: the tokenizer's start and end markers.
print(input_ids)

['we', 'are', 'very', 'happy', 'to', 'show', 'you', 'the', 'transformers', 'library']
[2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 19081, 3075]
{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 19081, 3075, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
# Tokenize both sentences as one batch: pad so the rows have equal length,
# truncate at max_length tokens, and return PyTorch tensors.
X_train = [
    "We are very happy to show you the transformers library",
    "We hope you don't hate it",
]
batch = tokenizer(
    X_train,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding=True,
)

In [14]:
# Show the batch: the shorter sentence is padded with 0s, and the attention
# mask is 0 at those padded positions.
batch

{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
          3075,   102],
        [  101,  2057,  3246,  2017,  2123,  1005,  1056,  5223,  2009,   102,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}

## PyTorch classification

In [15]:
# Reference labels for the two sentences; passing them makes the model return a
# loss in addition to the logits.
gold_labels = torch.tensor([1, 0])

# Forward pass without gradient tracking. `**batch` unpacks the tokenizer output
# into keyword arguments.
with torch.no_grad():
    outputs = model(**batch, labels=gold_labels)

# The model returns a SequenceClassifierOutput because it was loaded through
# AutoModelForSequenceClassification.
print(outputs)

# Softmax over the class dimension turns the logits into probabilities.
predictions = F.softmax(outputs.logits, dim=1)
print(predictions)

# Index of the most probable class for each sentence.
label_ids = torch.argmax(predictions, dim=1)
print(label_ids)

# Translate the class indices into the label names stored in the model config.
labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
print(labels)

SequenceClassifierOutput(loss=tensor(0.8919), logits=tensor([[-4.1465,  4.3928],
        [-0.8004,  0.7992]]), hidden_states=None, attentions=None)
tensor([[1.9560e-04, 9.9980e-01],
        [1.6804e-01, 8.3196e-01]])
tensor([1, 1])
['POSITIVE', 'POSITIVE']


## save model

In [16]:
# Folder (relative to the working directory) that receives the saved files.
save_directory = "saved"
# Write the tokenizer and the model into the same folder so both can be
# reloaded from one path.
tokenizer.save_pretrained(save_directory)
# Keep this call last: a notebook cell displays the value of its final
# expression, so the order of these two calls affects what the cell shows.
model.save_pretrained(save_directory)

In [17]:
# Restore both objects from the local folder instead of a checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

## Use a different (German) model

In [18]:
 # Checkpoint id of the German sentiment model used in this section.
 model_name = "oliverguhr/german-sentiment-bert"

 # Three German example sentences to classify.
 german_sentences = [
     "Nett, Sie kennen zu lernen",
     "Gibt es ein Restaurant in der Nähe?",
     "Haben Sie noch Zimmer frei?",
 ]

In [19]:
# Load the tokenizer and the classification model from the German checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Encode all sentences as one padded batch of PyTorch tensors, truncating at
# 125 tokens.
batch = tokenizer(
    german_sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=125,
)

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [20]:
# Inference only, so skip gradient tracking during the forward pass.
with torch.no_grad():
    outputs = model(**batch)

In [25]:
# Convert the logits to per-class probabilities (softmax across dim 1).
applied_softmax = outputs.logits.softmax(dim=1)

In [29]:
# Position of the highest probability in each row.
label_ids = applied_softmax.argmax(dim=1)

In [31]:
# Look up the label name for every predicted class index.
id2label = model.config.id2label
labels = [id2label[class_idx] for class_idx in label_ids.tolist()]

In [32]:
# Display the predicted label names, one per German sentence.
labels

['neutral', 'neutral', 'neutral']

## Fine-tuning an existing model