In [1]:
! pip install datasets evaluate transformers diffusers accelerate ftfy pyarrow --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h

## Tokenizer

In [3]:
from transformers import AutoTokenizer # factory class: picks the right tokenizer for a checkpoint

# Tokenizers are specific to the model they were trained with, so the tokenizer
# is loaded by the same checkpoint name that is later used to load the model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Namespaced form of the same id: distilbert/distilbert-base-uncased-finetuned-sst-2-english
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [4]:
# Two example sentences of different lengths. Raw text cannot be ingested by the
# model directly; it has to be converted to token ids first.
raw_inputs = [
    "I am learning Operating Systems and it's so fun.",  # sentence 1
    "I write terrible C++ code!",  # sentence 2
]

# Tokenize both sentences as a single batch:
#   padding=True         -> pad the shorter sentence so both rows have the same length
#   truncation=True      -> cut any sentence that exceeds max_length
#   max_length=15        -> upper bound on tokens per sentence (special tokens included)
#   return_tensors="pt"  -> return PyTorch tensors instead of Python lists
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    max_length=15,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[ 101, 1045, 2572, 4083, 4082, 3001, 1998, 2009, 1005, 1055, 2061, 4569,
         1012,  102],
        [ 101, 1045, 4339, 6659, 1039, 1009, 1009, 3642,  999,  102,    0,    0,
            0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}


In [None]:
# Why padding is needed (illustration only, nothing here is executed):
#
#   [4, 5, 6]             # row 1 -> input 1
#   [1, 2, <pad_token>]   # row 2 -> input 2
#
# Without the pad token the two rows have different lengths, so they cannot be
# stacked into a rectangular matrix -> matmul is not possible -> the model cannot
# process the batch.

In [5]:
# A closer look at the tokenizer, one step at a time: first split the text into
# tokens, then map every token to its vocabulary id. Unlike calling `tokenizer(...)`
# directly, this path does not add the special [CLS]/[SEP] tokens.
sequence = "I am learning Operating Systems and it's so fun."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

print("--------TOKENIZED SENTENCE--------")
print(tokens)

print("\n\n--------TOKENS MAPPED TO THEIR IDS--------")
print(ids)

--------TOKENIZED SENTENCE--------
['i', 'am', 'learning', 'operating', 'systems', 'and', 'it', "'", 's', 'so', 'fun', '.']


--------TOKENS MAPPED TO THEIR IDS--------
[1045, 2572, 4083, 4082, 3001, 1998, 2009, 1005, 1055, 2061, 4569, 1012]


**TAKE A PAUSE!**

**SPECIAL TOKENS!**

In [6]:
# Ids obtained from the manual tokenize() -> convert_tokens_to_ids() path above,
# i.e. from already tokenized input: they contain no special tokens.
token_ids = [1045, 2572, 4083, 4082, 3001, 1998, 2009, 1005, 1055, 2061, 4569, 1012]

# Map the token ids back to text.
decoded_string = tokenizer.decode(token_ids)
print(decoded_string)

i am learning operating systems and it ' s so fun.


In [7]:
# Ids obtained by calling the tokenizer on the raw text (first row of `inputs`):
# the extra leading 101 and trailing 102 decode to the special [CLS] and [SEP] tokens.
token_ids_with_special = [
    101, 1045, 2572, 4083, 4082, 3001, 1998, 2009, 1005, 1055, 2061, 4569, 1012, 102,
]

decoded_string = tokenizer.decode(token_ids_with_special)
print(decoded_string)

[CLS] i am learning operating systems and it ' s so fun. [SEP]


**Padding - ways to do it**

In [10]:
# Three padding strategies, applied to the same single `sequence`.
# Each entry is (label used in the printed headers, keyword arguments for the tokenizer).
padding_demos = [
    # "longest": pad every sequence in the batch up to the longest sequence in that
    # batch. With a single sequence there is nothing longer to pad to, so the ids
    # come back unchanged.
    (
        "after pad the sequences up to the maximum sequence length",
        {"padding": "longest"},
    ),
    # "max_length" without an explicit max_length: pad up to the model max length
    # (512 for BERT or DistilBERT), i.e. the longest input the model can ingest.
    (
        "after Will pad the sequences up to the model max length",
        {"padding": "max_length"},
    ),
    # "max_length" with an explicit max_length: pad up to the specified length.
    # NOTE: `sequence` already encodes to more than 8 ids and truncation is not
    # enabled, so nothing is padded (or cut) in this case; use a larger max_length
    # or a shorter sentence to actually see pad tokens.
    (
        "after padding to max_length=8",
        {"padding": "max_length", "max_length": 8},
    ),
]

for description, tokenizer_kwargs in padding_demos:
    # `model_inputs` / `decoded_padded_string` keep the values of the last strategy
    # after the loop, exactly as a straight-line version of this cell would.
    model_inputs = tokenizer(sequence, **tokenizer_kwargs)

    print(f"--------TOKEN IDS ({description})--------")
    print(model_inputs["input_ids"])

    print(f"\n--------DECODED STRING ({description})--------")
    decoded_padded_string = tokenizer.decode(model_inputs["input_ids"])
    print(decoded_padded_string)

--------TOKEN IDS (after pad the sequences up to the maximum sequence length)--------
[101, 1045, 2572, 4083, 4082, 3001, 1998, 2009, 1005, 1055, 2061, 4569, 1012, 102]

--------DECODED STRING (after pad the sequences up to the maximum sequence length)--------
[CLS] i am learning operating systems and it ' s so fun. [SEP]
--------TOKEN IDS (after Will pad the sequences up to the model max length)--------
[101, 1045, 2572, 4083, 4082, 3001, 1998, 2009, 1005, 1055, 2061, 4569, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
# Re-display the most recent `model_inputs` (the padding="max_length", max_length=8
# call from the padding cell). The sentence encodes to more than 8 ids and truncation
# is off, so the ids are shown as-is: nothing was padded.
padded_ids = model_inputs["input_ids"]
decoded_padded_string = tokenizer.decode(padded_ids)

print("--------TOKEN IDS (after padding to max_length=8)--------")
print(padded_ids)

print("\n--------DECODED STRING (after padding to max_length=8)--------")
print(decoded_padded_string)

--------TOKEN IDS (after padding to max_length=8)--------
[101, 1045, 2572, 4083, 4082, 3001, 1998, 2009, 1005, 1055, 2061, 4569, 1012, 102]

--------DECODED STRING (after padding to max_length=8)--------
[CLS] i am learning operating systems and it ' s so fun. [SEP]


## Models

In [11]:
from transformers import AutoModel

# AutoModel loads only the base transformer for this checkpoint. The checkpoint also
# stores classification-head weights (pre_classifier.*, classifier.*); they are
# reported as UNEXPECTED in the load report and are simply not used by this class.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertModel LOAD REPORT from: distilbert-base-uncased-finetuned-sst-2-english
Key                   | Status     |  | 
----------------------+------------+--+-
pre_classifier.bias   | UNEXPECTED |  | 
pre_classifier.weight | UNEXPECTED |  | 
classifier.weight     | UNEXPECTED |  | 
classifier.bias       | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [12]:
# Forward pass of the base model on the tokenized batch built earlier (`inputs`);
# `**inputs` unpacks input_ids / attention_mask etc. as keyword arguments.
outputs = model(**inputs)
# One hidden vector per token of every sentence.
print(outputs.last_hidden_state.shape) # (batch size, sequence length, hidden dim)

torch.Size([2, 14, 768])


    "I am learning Operating Systems and it's so fun.", # one num
    "I write terrible C++ code!", # sec num

In [13]:
from transformers import AutoModelForSequenceClassification
import torch

# Same checkpoint as before, but loaded through the sequence-classification class,
# whose output exposes per-class logits instead of hidden states.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
logits = outputs.logits

# There are 2 classes in total, so expect one row per input sentence and one
# column per class.
print("--------SHAPE OF LOGIT TENSOR--------")
print(logits.shape)

print("\n\n--------LOGIT TENSOR--------")
print(logits)

# softmax (a math function) converts each row of logits into probabilities;
# argmax then picks the index of the most probable class for every sentence.
probabilities = torch.nn.functional.softmax(logits, dim=-1)
predictions = torch.argmax(probabilities, dim=-1)
print("\n\n--------PREDICTIONS--------")
print(predictions)  # [OS sentence, C++ sentence]

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

--------SHAPE OF LOGIT TENSOR--------
torch.Size([2, 2])


--------LOGIT TENSOR--------
tensor([[-4.3357,  4.6875],
        [ 4.6717, -3.7884]], grad_fn=<AddmmBackward0>)


--------PREDICTIONS--------
tensor([1, 0])


--------SHAPE OF LOGIT TENSOR--------

**torch.Size([2, 2])**

The **first 2** represents the **batch size**, which is the number of **input sequences (sentences)** you provided

(in this case, "I am learning Operating Systems..." and "I write terrible C++ code!").

The **second 2** represents the number of classes for the **classification task**.

Since this model is fine-tuned for sentiment analysis on SST-2 **(negative/positive)**, it has two possible output classes. The exact index-to-label mapping can be read from `model.config.id2label`.



--------LOGIT TENSOR--------

**tensor([[-4.3357,  4.6875],[ 4.6717, -3.7884]], grad_fn=<AddmmBackward0>)**


These are the **logits**. Logits are the raw, unnormalized scores generated by the **last layer of the model** for each class. They are not probabilities yet.

**Each row corresponds to an input sequence,** and
**each column corresponds to the score for a specific class**:

For the first sentence ("I am learning Operating Systems and it's so fun."), the scores are [-4.3357, 4.6875]. The second score is significantly higher.

For the second sentence ("I write terrible C++ code!"), the scores are [4.6717, -3.7884]. The first score is significantly higher.



--------PREDICTIONS--------

**tensor([1, 0])**

These are the final predicted class labels, obtained by applying a softmax function to the logits (to convert them into probabilities) and then argmax (to select the class with the highest probability). The values are the indices of the predicted classes:

- **For the first sentence, the prediction is 1** (because 4.6875 at index 1 was the highest logit).
- **For the second sentence, the prediction is 0** (because 4.6717 at index 0 was the highest logit).
