In [15]:
import json

from transformers import (
    pipeline,
    TFAutoModelForSequenceClassification,
    AutoTokenizer
)

In [19]:
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"

# getting pre-trained model
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [20]:
tokenizer([
    "I've been waiting this for my whole life",
    "I hate this"
])

{'input_ids': [[0, 100, 348, 57, 2445, 42, 13, 127, 1086, 301, 2], [0, 100, 4157, 42, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [21]:
# predefined labels
LABELS = {
    "LABEL_0": "NEGATIVE ",
    "LABEL_1": "NEUTRAL",
    "LABEL_2": "POSITIVE"
}

In [22]:
# setting up the sentiment analyzer pipline
analyzer = pipeline(task="sentiment-analysis",
                    model=model,
                    tokenizer=tokenizer)

In [23]:
analyzer([
    "I've been waiting this for my whole life",
    "I hate this shit"
])

[{'label': 'LABEL_1', 'score': 0.5709302425384521},
 {'label': 'LABEL_0', 'score': 0.9736179113388062}]

In [24]:
# preprocess the tweets
def clean(text: str) -> str:
    cleaned = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = '' if t.startswith('http') else t
        cleaned.append(t)
    return " ".join(cleaned)

# load, and parse tweets
def load(path: str):
    with open(path, mode="r") as file:
        for line in file.readlines():
            try:
                tweet = json.loads(line)
                text = tweet['text']
                yield clean(text)
            except Exception:
                    continue

In [28]:
tweets_path = "../data/20221101000000.json"
loader = load(tweets_path)
# inference top 10
for i, t in enumerate(loader):
    if i >= 10:
        break
    analysis = analyzer(t)[0]
    print(f"TEXT: {t}")
    print(f"LABEL: {LABELS[analysis['label']]}")
    print(f"LABEL: {analysis['score']}")
    print("-" * 10)

TEXT: akun masih flop ternyata, agaknya shaddy memang ditakdirkan untuk jbjb aja..
LABEL: NEUTRAL
LABEL: 0.6582995653152466
----------
TEXT: RT @user Psra cerrar este día de #Culos🍑
Les dejamos un breve videito rico🔥😈😋 cogiendo delicioso🔥
Esperando sea de su agrado, quién…
LABEL: NEUTRAL
LABEL: 0.5533027052879333
----------
TEXT: RT @user ‼️‼️💚SALE ENDING SOON💚‼️‼️
Watch our latest hottest collab ⬇️⬇️
https://t.co/kGpSdd3HUR 
(New videos, different angles,…
LABEL: NEUTRAL
LABEL: 0.5087047219276428
----------
TEXT: 念のため今日休みにしておいてよかった…
LABEL: NEUTRAL
LABEL: 0.7614464163780212
----------
TEXT: @user けいちゃーん♡ありがとーー😭💕💕
ほんとかっこよくてとろけたよ🫠🫠💕💕
実物の破壊力にやられたよー😇
ほんとイケメンだったよー❣️
けいちゃんがハッピーになってくれての🥺💕
うれしいよ❣️
LABEL: NEUTRAL
LABEL: 0.7424249053001404
----------
TEXT: RT @user 
LABEL: NEUTRAL
LABEL: 0.692699670791626
----------
TEXT: @user NUH até desceu a bosta presa
LABEL: NEUTRAL
LABEL: 0.8003782629966736
----------
TEXT: RT @user Essa culpa eu não carrego, amados.
LABEL: NEUTRAL
LABEL: 0.807551145553588

In [29]:
# saving model's config, and weights to the disk
model_path = f"../models/{checkpoint}"
analyzer.save_pretrained(model_path)