# Introduction To HuggingFace Transformers 

In [1]:
# Built-in library
import re
import json
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import pipeline


classifier = pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

2023-09-04 04:06:08.549971: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9598051905632019},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

### Preprocessing with a tokenizer

```text
- Transformer models can’t process raw text directly, so the first step of our pipeline is to convert the text inputs into numbers that the model can make sense of. 
- To do this we use a tokenizer, which will be responsible for:
  - Splitting the input into words, subwords, or symbols (like punctuation) that are called tokens
  - Mapping each token to an integer
  - Adding additional inputs that may be useful to the model

- All this preprocessing needs to be done in exactly the same way as when the model was pretrained.
- The AutoTokenizer class and its from_pretrained() method are used to download and cache the data associated with the model's tokenizer. 
- This is done automatically using the checkpoint name of the model. The data is only downloaded the first time the code is run.
```

In [3]:
from transformers import AutoTokenizer


# The default checkpoint of the sentiment-analysis pipeline is:
# distilbert-base-uncased-finetuned-sst-2-english
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
# Transformer models only accept tensors as input.
# To specify the type of tensors we want to get back (PyTorch, TensorFlow, or plain NumPy), use the return_tensors argument:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
pprint(inputs)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]])}


### Going Through The Model

```text
- Download the pretrained model the same way just like the tokenizer. 
- 🤗 Transformers provides an AutoModel class which also has a from_pretrained() method:
```

In [5]:
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

### High-dimensional Vector

```text
- The vector output by the Transformer module is usually large. It generally has three dimensions:
  - Batch size: The number of sequences processed at a time (2 in our example).
  - Sequence length: The length of the numerical representation of the sequence (16 in our example).
  - Hidden size: The vector dimension of each model input.
  
- It's said to be “high dimensional” because of the last value. The hidden size can be very large (768 is common for smaller models, and in larger models this can reach 3072 or more).

```

In [6]:
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


### [Model Heads](https://huggingface.co/learn/nlp-course/chapter2/2?fw=pt)

```text

- The model heads take the high-dimensional vector of hidden states as input and project them onto a different dimension. 
- They are usually composed of one or a few linear layers.
- The output of the Transformer model is sent directly to the model head to be processed.
```

In [7]:
from transformers import AutoModelForSequenceClassification


# In this example, we'll need a model with a sequence classification head (to be able to classify the sentences as positive or negative).
# So, we won’t actually use the AutoModel class, but AutoModelForSequenceClassification:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)

In [8]:
# Since we have just two sentences and two labels, the result we get from our model is of shape 2 x 2.
print(outputs.logits.shape)

torch.Size([2, 2])


### Postprocessing the output

```text

- 
```

In [9]:
# The values we get as output from our model don’t necessarily make sense by themselves. Let’s take a look:
print(outputs.logits)

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [10]:
import torch


# The model predicted [-1.5607, 1.6123] for the first sentence and [4.1692, -3.3464] for the second one.
# These are logits, not probabilities. To convert them to probabilities, they need to go through a SoftMax layer.
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[4.0195e-02, 9.5981e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [11]:
# The model predicted [0.0402, 0.9598] for the first sentence and [0.9995, 0.0005] for the second one. These are probability scores.
# The labels corresponding to each position can be found by inspecting the id2label attribute of the model config.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

### Ex 1:

```text
✏️ Choose two (or more) texts of your own and run them through the sentiment-analysis pipeline. 
- Replicate the steps you saw here yourself and check that you obtain the same results!
```

In [13]:
# Method 1: Using the pipeline
raw_inputs = [
    "Yesterday's football match was not the greatest",
    "I'm looking forward to starting my consultancy firm",
]
task = "sentiment-analysis"
clf = pipeline(task=task)
clf(raw_inputs)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'NEGATIVE', 'score': 0.9977226853370667},
 {'label': 'POSITIVE', 'score': 0.9979140162467957}]

In [17]:
tokenizer??

[0;31mSignature:[0m     
[0mtokenizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtext[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_pair[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_target[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m


In [25]:
from rich import print


# Method 2: Manual Approach
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
input = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(input)

In [26]:
clf_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
output = clf_model(**input)
print(output)

In [30]:
import torch.nn.functional as F

logits_ = output.logits
prob = F.softmax(logits_, dim=1)
print(prob)

In [31]:
clf_model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}