In [1]:
from transformers import pipeline

## Models

In [2]:
# Checkpoint identifiers for the models explored in this notebook.
bert_large_case = "bert-large-cased"
bert_large_uncased = "bert-large-uncased"
# Multilingual, uncased BERT checkpoint fine-tuned for sentiment (published under nlptown).
bert_base_multi_uncased = "nlptown/bert-base-multilingual-uncased-sentiment"

# Classifying

In [3]:
# Build a sentiment-analysis pipeline on the multilingual checkpoint and score
# a single sentence; the result is displayed as the cell's output.
classifier = pipeline('sentiment-analysis', model=bert_base_multi_uncased)
classifier('Elon musk is worst enemy on earth.')

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


[{'label': '1 star', 'score': 0.884320080280304}]

In [4]:
# Same task as the previous cell, but with a RoBERTa checkpoint whose name
# indicates Twitter training data. Model and tokenizer are both loaded from
# the same checkpoint id.
twitter_trained = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
sentiment_task = pipeline(
    task="sentiment-analysis",
    model=twitter_trained,
    tokenizer=twitter_trained,
)
sentiment_task("Elon must is the best evil out there")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'label': 'negative', 'score': 0.7869551181793213}]

# Unmasking

In [5]:
# Fill-mask demo: ask the cased BERT-large checkpoint to propose tokens for the
# [MASK] position. The checkpoint id comes from the constant declared in the
# Models cell rather than a repeated string literal, so the model name lives
# in one place.
unmasker = pipeline('fill-mask', model=bert_large_case)
unmasker("I want to eat [MASK] after dinner.")

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.3337928354740143,
  'token': 1380,
  'token_str': 'something',
  'sequence': 'I want to eat something after dinner.'},
 {'score': 0.0629892349243164,
  'token': 5953,
  'token_str': 'lunch',
  'sequence': 'I want to eat lunch after dinner.'},
 {'score': 0.0581633634865284,
  'token': 1122,
  'token_str': 'it',
  'sequence': 'I want to eat it after dinner.'},
 {'score': 0.055391523987054825,
  'token': 4014,
  'token_str': 'dinner',
  'sequence': 'I want to eat dinner after dinner.'},
 {'score': 0.05277910456061363,
  'token': 6462,
  'token_str': 'breakfast',
  'sequence': 'I want to eat breakfast after dinner.'}]

# Classification pipeline
### Preprocess text (username and link placeholders)

In [6]:
from scipy.special import softmax
import numpy as np
from transformers import AutoTokenizer, AutoConfig
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification

In [7]:
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces and each piece is inspected:

    * a piece that starts with ``@`` and is longer than one character
      becomes ``@user``;
    * a piece that starts with ``http`` becomes ``http``;
    * anything else is kept unchanged.

    The pieces are re-joined with single spaces, so the original spacing
    (including runs of consecutive spaces) is preserved.

    Args:
        text: The raw input string.

    Returns:
        The string with mentions and links replaced by placeholders.
    """
    def _placeholder(token):
        # A lone "@" is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))

In [8]:
# Score one sentence by calling the tokenizer and model directly (rather than
# through `pipeline`), then print every label from most to least probable.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)  # provides the id2label mapping used below

# PyTorch model. Optionally persist a local copy with model.save_pretrained(MODEL).
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

text = "I do not know how hugging face works and I am just trying random things out"
text = preprocess(text)  # replace @mentions and links with placeholders
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# First output element, first row of the batch (a single sentence was passed);
# detach from the autograd graph before converting to a NumPy array.
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# TensorFlow alternative: load the model with TFAutoModelForSequenceClassification,
# tokenize with return_tensors='tf', call model(encoded_input), and read the
# scores with output[0][0].numpy() before applying softmax.

# Print labels and scores, highest score first.
ranking = np.argsort(scores)[::-1]
for rank, label_id in enumerate(ranking, start=1):
    label = config.id2label[label_id]
    score = scores[label_id]
    print(f"{rank}) {label} {np.round(float(score), 4)}")


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) negative 0.4884
2) neutral 0.4788
3) positive 0.0328
