In [83]:
from transformers import pipeline

In [84]:
# Build a sentiment-analysis pipeline without naming a model, so the
# library's default checkpoint for that task is used.
classifier = pipeline('sentiment-analysis')

# Two short example sentences to classify.
raw_inputs = [
    'Cinderella is a sweet fairy tale story',
    "The experience was painful and torturing",
]

# Run the whole list through the pipeline in a single call; the result
# holds one {'label', 'score'} dict per sentence.
output = classifier(raw_inputs)

# Bare last expression so the notebook displays the result.
output

[{'label': 'POSITIVE', 'score': 0.999880850315094},
 {'label': 'NEGATIVE', 'score': 0.9789105653762817}]

# Inspect the pipeline's underlying model object to find clues about which
# checkpoint (and therefore which tokenizer) it was built with.
help(classifier.model)

In [85]:
# Backend identifier reported by the pipeline; it is passed later as the
# `return_tensors` argument of the tokenizer call so tensors match the model.
framework = classifier.framework

print(f"Underlying framework: {framework}")

Underlying framework: pt


### Tokenization


- Use the **AutoTokenizer** class and its **from_pretrained** method to download the tokenizer by referring to the checkpoint name of our model.

- The default checkpoint of the **sentiment-analysis** pipeline is:
    - [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [86]:
from transformers import AutoTokenizer

In [87]:
# Checkpoint name shared by the tokenizer below and by every model loaded
# from `checkpoint` in the following cells.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [88]:
# Tokenize the whole list of sentences in one call.
#   padding=True        -> pad shorter sentences to the longest one in the batch
#   truncation=True     -> cut sentences that exceed the model's maximum length
#   return_tensors=...  -> return tensors for the given framework; without it
#                          the values come back as plain Python lists
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors=framework,
)

# `inputs` is a dictionary-like object with two keys:
#   - input_ids
#   - attention_mask
print(inputs)

# List-returning variant (no return_tensors), kept for reference:
# list_inputs = tokenizer(raw_inputs, padding=True, truncation=True)
# print(list_inputs)

{'input_ids': tensor([[  101, 21686,  2003,  1037,  4086,  8867,  6925,  2466,   102],
        [  101,  1996,  3325,  2001,  9145,  1998, 17153, 16037,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}


### Import the pretrained model

In [89]:
from transformers import AutoModel

In [90]:
# AutoModel loads only the base transformer, without the classification head;
# that is why the notice below lists the classifier.* / pre_classifier.* weights
# of the checkpoint as unused.
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [91]:
# Forward pass: the tokenizer output (input_ids, attention_mask) is unpacked
# into keyword arguments of the model.
outputs = model(**inputs)

# last_hidden_state has shape (batch size, sequence length, hidden size):
# 2   = batch size (two input sentences)
# 9   = sequence length (number of token ids per sentence in `inputs`)
# 768 = hidden size, i.e. the dimension of the vector produced for each token

# The output object can be read in three equivalent ways:
# by attribute, by key, or by position.
print(outputs.last_hidden_state.shape)
print(outputs['last_hidden_state'].shape)
print(outputs[0].shape)

torch.Size([2, 9, 768])
torch.Size([2, 9, 768])
torch.Size([2, 9, 768])


In [92]:
from transformers import AutoModelForSequenceClassification

# NOTE: this rebinds `model` (and `outputs` below) — from here on they refer to
# the sequence-classification model and its results, not the base model above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

outputs = model(**inputs)

# One row per sentence, one column per label: two sentences x two labels -> (2, 2).
# These are raw logits, not probabilities; softmax is applied in the next cell.
print(outputs.logits.shape)
print(outputs.logits)


torch.Size([2, 2])
tensor([[-4.3428,  4.6923],
        [ 2.0215, -1.8161]], grad_fn=<AddmmBackward>)


In [93]:
import torch

# Normalise each row of logits (dim=1 runs across the labels) into probabilities.
predictions = torch.softmax(outputs.logits, dim=1)
print(predictions)

tensor([[1.1913e-04, 9.9988e-01],
        [9.7891e-01, 2.1089e-02]], grad_fn=<SoftmaxBackward>)


In [97]:
# Mapping from class index to label name, taken from the model's config.
labelmap = model.config.id2label

print('labels from index to class')
print(labelmap)

# torch.max along dim=1 yields, per sentence, the highest probability (`values`)
# together with its class index (`indices`); torch.argmax would give only the indices.
final_predictions = torch.max(predictions, dim=1)
print(final_predictions)

# Translate each predicted class index into its label name.
print([labelmap.get(class_id) for class_id in final_predictions.indices.tolist()])

torch.return_types.max(
values=tensor([0.9999, 0.9789], grad_fn=<MaxBackward0>),
indices=tensor([1, 0]))
