Source: HuggingFace Crash Course - https://www.youtube.com/watch?v=GSt00_-0ncQ

In [None]:
!pip install transformers

In [2]:
from transformers import pipeline

# Build a sentiment-analysis pipeline without naming a model: the library then
# selects its default checkpoint for the task and logs a warning saying so.
classifier = pipeline(task="sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [3]:
# Classify one sentence. The pipeline answers with a list that holds a single
# dict carrying the predicted 'label' and its 'score'.
res = classifier(
    "I am very happy to show you the transformers library."
)
print(res)

[{'label': 'POSITIVE', 'score': 0.9997844099998474}]


In [4]:
# Several sentences can be passed at once as a list; one result dict comes
# back per sentence, in the same order.
results = classifier(
    [
        "I am very happy to show you the transformers library.",
        "I hope you don't hate it",
    ]
)

# One result per line.
print(*results, sep="\n")

{'label': 'POSITIVE', 'score': 0.9997844099998474}
{'label': 'POSITIVE', 'score': 0.5313729047775269}


In [5]:
# Name the checkpoint explicitly instead of relying on the default.
# Note: this particular checkpoint is also what "sentiment-analysis" defaults to,
# so the scores match the earlier cells.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

classifier = pipeline(task="sentiment-analysis", model=model_name)

results = classifier(
    [
        "I am very happy to show you the transformers library.",
        "I hope you don't hate it",
    ]
)

# One result per line.
print(*results, sep="\n")

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

{'label': 'POSITIVE', 'score': 0.9997844099998474}
{'label': 'POSITIVE', 'score': 0.5313729047775269}


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint to load; it can be swapped for another model name.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the model and its tokenizer as separate objects ...
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ... and hand both to the pipeline instead of a model name.
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

results = classifier(
    [
        "I am very happy to show you the transformers library.",
        "I hope you don't hate it",
    ]
)

# One result per line.
print(*results, sep="\n")

{'label': 'POSITIVE', 'score': 0.9997844099998474}
{'label': 'POSITIVE', 'score': 0.5313729047775269}


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint to load; it can be swapped for another model name.
# The same checkpoint is loaded again here, so this cell also runs on its own.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

sentence = "I am very happy to show you the transformers library."

# Step 1: split the text into tokens.
tokens = tokenizer.tokenize(sentence)
# Step 2: map each token to its vocabulary id.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Calling the tokenizer directly does both steps and returns a dict-like object
# with 'input_ids' and 'attention_mask'. Its 'input_ids' carry one extra id at
# each end compared to `token_ids` (presumably the tokenizer's special tokens).
input_ids = tokenizer(sentence)

print(f"   Tokens: {tokens}")
print(f"Token IDs: {token_ids}")
print(f"Input IDs: {input_ids}")

   Tokens: ['i', 'am', 'very', 'happy', 'to', 'show', 'you', 'the', 'transformers', 'library', '.']
Token IDs: [1045, 2572, 2200, 3407, 2000, 2265, 2017, 1996, 19081, 3075, 1012]
Input IDs: {'input_ids': [101, 1045, 2572, 2200, 3407, 2000, 2265, 2017, 1996, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
X_train = [
    "I am very happy to show you the transformers library.",
    "I hope you don't hate it",
]

# Tokenize both sentences in one call:
#   padding=True         -> shorter sequences are filled up with 0s (and get 0s in the attention mask)
#   truncation/max_length -> sequences are cut to at most 512 tokens
#   return_tensors="pt"  -> the result holds PyTorch tensors instead of Python lists
batch = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
print(batch)

{'input_ids': tensor([[  101,  1045,  2572,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
          3075,  1012,   102],
        [  101,  1045,  3246,  2017,  2123,  1005,  1056,  5223,  2009,   102,
             0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}


In [9]:
import torch
import torch.nn.functional as F

# Run the model by hand on the tokenized batch. Gradients are not needed for
# inference, so everything happens inside torch.no_grad().
with torch.no_grad():
    outputs = model(**batch)  # try to add kwarg: labels=torch.tensor([1,0])
    print(outputs)

    # Turn the raw logits into per-class probabilities (each row sums to 1).
    predictions = F.softmax(outputs.logits, dim=1)
    print(predictions)

    # Index of the most probable class for every sentence.
    label_ids = predictions.argmax(dim=1)
    print(label_ids)

    # Translate the class indices into the label names stored in the model config.
    labels = [model.config.id2label[idx] for idx in label_ids.tolist()]
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-4.0925,  4.3496],
        [-0.0388,  0.0869]]), hidden_states=None, attentions=None)
tensor([[2.1554e-04, 9.9978e-01],
        [4.6863e-01, 5.3137e-01]])
tensor([1, 1])
['POSITIVE', 'POSITIVE']


Suppose that we have fine-tuned a model. This is how to **save the model and load it again later**:

In [12]:
save_directory = "saved"

# Write the tokenizer first, then the model, into the same directory.
for component in (tokenizer, model):
    component.save_pretrained(save_directory)

# from_pretrained accepts either a model name or a local directory, so the
# saved files can be loaded back by pointing at that directory.
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [13]:
# Same workflow with a different checkpoint: a German sentiment model.
model_name = "oliverguhr/german-sentiment-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

X_train_german = [
    "Mit keinem guten Ergebnis",
    "Das war unfair",
    "Das ist gar nicht mal so gut",
    "nicht so schlecht wie erwartet",
    "Das war gut!",
    "Sie fährt ein grünes Auto",
]

# Pad/truncate the sentences to a common length and return PyTorch tensors.
batch = tokenizer(
    X_train_german,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
print(batch)

# Inference only, so no gradients are tracked.
with torch.no_grad():
    outputs = model(**batch)

    # Highest-scoring class index per sentence (argmax directly on the logits).
    label_ids = outputs.logits.argmax(dim=1)
    print(label_ids)

    # Map the class indices to the label names from the model config.
    labels = [model.config.id2label[idx] for idx in label_ids.tolist()]
    print(labels)

tokenizer_config.json:   0%|          | 0.00/161 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

{'input_ids': tensor([[    3,   304,  8524,  5569,  2011,     4,     0,     0,     0],
        [    3,   295,   185,   174,  8716,   124,     4,     0,     0],
        [    3,   295,   127,  2523,   149,  2723,   181,  1522,     4],
        [    3,   149,   181,  6975,   246,  6303,     4,     0,     0],
        [    3,   295,   185,  1522, 26982,     4,     0,     0,     0],
        [    3,   371,  9755,    39, 19044, 26902,  3512,     4,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0]])}
tensor([1, 1, 1, 0, 0, 2])
['negative', 'negative', 'negative'