In [61]:
import tweeteval
import pandas as pd

from tweeteval import Task, SCORERS

# Easy access to well-defined tasks, scorers, etc.

In [62]:
# Collect the name of every member of the Task enum.
# The loop variable is called `member` so it cannot be confused with the
# notebook-level `task` variable assigned in later cells.
tasks = [member.name for member in Task]
tasks

['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance']

In [63]:
# Look up a single task directly on the Task enum class.
# Going through the class (rather than through a previously bound `task`
# member) keeps this cell runnable on a fresh kernel: the comprehension
# variable in the cell above does not leak into the notebook namespace.
Task.hate

<Task.hate: 'hate'>

In [65]:
# SCORERS is indexed by Task member; fetch the scoring callable for the emoji task.
scorer = SCORERS[Task.emoji]
# Call it on two small toy label lists to show the call shape.
# NOTE(review): which argument is gold and which is predicted is not visible here — confirm.
scorer(["a", "b", "c"], ["a", "a", "a"])

0.16666666666666666

# Getting data

In [68]:
task = Task.emotion

# Fetch the training split for the selected task as two parallel sequences.
# NOTE(review): humanize=False presumably keeps the raw label ids instead of
# readable label names — confirm against tweeteval.task_data.
texts, labels = tweeteval.task_data(task, split="train", humanize=False)

# Put texts and labels side by side in a two-column frame for inspection.
df = pd.DataFrame(dict(text=texts, labels=labels))
df

Unnamed: 0,text,labels
0,“Worry is a down payment on a problem you may ...,2
1,My roommate: it's okay that we can't spell bec...,0
2,No but that's so cute. Atsu was probably shy a...,1
3,Rooneys fucking untouchable isn't he? Been fuc...,0
4,it's pretty depressing when u hit pan on ur fa...,3
...,...,...
3252,I get discouraged because I try for 5 fucking ...,3
3253,The @user are in contention and hosting @user ...,3
3254,@user @user @user @user @user as a fellow UP g...,0
3255,You have a #problem? Yes! Can you do #somethin...,0


In [69]:
# Preview the preprocessing applied to every tweet; the result is only
# displayed and is not written back to `df`.
df["text"].apply(tweeteval.preprocess)

0       “Worry is a down payment on a problem you may ...
1       My roommate: it's okay that we can't spell bec...
2       No but that's so cute. Atsu was probably shy a...
3       Rooneys fucking untouchable isn't he? Been fuc...
4       it's pretty depressing when u hit pan on ur fa...
                              ...                        
3252    I get discouraged because I try for 5 fucking ...
3253    The @user are in contention and hosting @user ...
3254    @user @user @user @user @user as a fellow UP g...
3255    You have a #problem? Yes! Can you do #somethin...
3256    @user @user i will fight this guy! Don't insul...
Name: text, Length: 3257, dtype: object

# Scoring

In [70]:
# Fetch the test labels for the hate task.
# Note: this re-binds `labels`, which previously held the labels loaded in the
# "Getting data" cell (the `df` built there is unaffected).
labels = tweeteval.test_labels(Task.hate)
# Score those labels for the same task.
# NOTE(review): presumably this compares them against the reference test
# labels, which would explain the recorded score of 1.0 — confirm.
tweeteval.score(labels, Task.hate)

1.0

# Embed

In [71]:
from tweeteval import embed

# Build an embedder backed by the named Hugging Face checkpoint.
embedder = embed.TransformersEmbedder(model="cardiffnlp/twitter-roberta-base")

# Embed only the first 100 tweets to keep the demo quick; the recorded output
# is a float32 NumPy array.
embedder.transform(df["text"].head(100))

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/100 [00:00<?, ?it/s]

array([[ 0.13634089,  0.09893978,  0.08310488, ..., -0.04570993,
         0.18209243,  0.1740966 ],
       [ 0.12134975,  0.15258133,  0.08446873, ...,  0.09217748,
         0.07154384,  0.00937113],
       [ 0.15525904,  0.03006641,  0.06937746, ..., -0.20738386,
         0.0229534 ,  0.04626111],
       ...,
       [ 0.00549381,  0.04475764,  0.00641718, ...,  0.0542536 ,
         0.1654289 ,  0.0561096 ],
       [ 0.0761512 , -0.07705796, -0.04995267, ..., -0.1911625 ,
        -0.04284684,  0.01591019],
       [ 0.0822336 , -0.06972731, -0.00229299, ...,  0.05023243,
         0.14528626,  0.02427786]], dtype=float32)

# Classify

In [72]:
task = Task.emoji

# Load the test split for the emoji task and run every tweet through the
# package's preprocessing function.
texts, labels = tweeteval.task_data(task, split="test")
texts = list(map(tweeteval.preprocess, texts))

# Build the classifier for this task and predict on the first 100 tweets only.
clf = tweeteval.PretrainedCardiffClassifier(task=task)
pred = clf.predict(texts[:100])

# Map the raw predictions to the task's labels (emoji characters in the
# recorded output).
tweeteval.map_labels(pred, task)

Classifying:   0%|          | 0/100 [00:00<?, ?it/s]

['😍',
 '📷',
 '😎',
 '😍',
 '😍',
 '🎄',
 '😂',
 '📷',
 '☀',
 '📷',
 '❤',
 '😍',
 '😎',
 '😂',
 '🔥',
 '😎',
 '❤',
 '✨',
 '❤',
 '😂',
 '🔥',
 '❤',
 '❤',
 '❤',
 '😂',
 '😊',
 '🔥',
 '🎄',
 '❤',
 '❤',
 '😂',
 '📷',
 '😊',
 '✨',
 '🎄',
 '❤',
 '📷',
 '😉',
 '❤',
 '✨',
 '❤',
 '❤',
 '❤',
 '✨',
 '😎',
 '❤',
 '😍',
 '💯',
 '😍',
 '✨',
 '😊',
 '😎',
 '🔥',
 '❤',
 '😍',
 '😎',
 '😎',
 '🇺🇸',
 '😂',
 '📷',
 '😂',
 '🇺🇸',
 '🔥',
 '😊',
 '😍',
 '🔥',
 '😎',
 '😍',
 '✨',
 '❤',
 '📷',
 '😍',
 '😍',
 '❤',
 '☀',
 '🇺🇸',
 '😍',
 '🇺🇸',
 '😎',
 '✨',
 '😂',
 '❤',
 '❤',
 '📷',
 '😉',
 '🎄',
 '❤',
 '❤',
 '😊',
 '❤',
 '❤',
 '❤',
 '📷',
 '😘',
 '❤',
 '❤',
 '❤',
 '❤',
 '✨',
 '💕']

# Evaluate tasks

In [73]:
from sklearn.linear_model import SGDClassifier

task = Task.stance

# Feature extractor handed to the evaluation helper.
embedder = embed.TfIdfEmbedder()
# Alternative: swap in the transformer-based embedder instead.
# embedder = embed.TransformersEmbedder(model="cardiffnlp/twitter-roberta-base")

# Linear model trained with SGD. Per the scikit-learn docs, tol=None disables
# the early-stopping criterion, so training runs for max_iter epochs;
# random_state is fixed for reproducibility.
clf = SGDClassifier(
    loss="hinge",
    penalty="l2",
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)

# eval_classifier returns a score together with the predictions.
score, pred = tweeteval.eval_classifier(
    clf,
    task=task,
    preproc=True,
    embedder=embedder,
)
score

0.4621226874391431

In [41]:
# Map the predictions to the task's label names.
# Relies on `pred` and `task` as bound by the eval_classifier cell directly
# above, so run that cell first.
tweeteval.map_labels(pred, task=task)

['against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'favor',
 'against',
 'against',
 'against',
 'against',
 'none',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'none',
 'against',
 'against',
 'against',
 'against',
 'favor',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'none',
 'against',
 'against',
 'none',
 'favor',
 'none',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'none',
 'against',
 'against',
 'against',
 'against',
 'against',
 'against',
 'none',
 'again

In [76]:
from tweeteval import classify

# TfidfLogreg() builds a scikit-learn Pipeline (see the repr in the recorded
# output).
model = classify.TfidfLogreg()

# Fit on the frame built in the "Getting data" cell: tweet text as input,
# the `labels` column as target.
model.fit(df["text"], df["labels"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function TfidfLogreg.<locals>.<lambda> at 0x7fb44b1ba040>)),
                ('tfidfvectorizer', TfidfVectorizer()),
                ('truncatedsvd', TruncatedSVD(n_components=300)),
                ('logisticregressioncv', LogisticRegressionCV(max_iter=500))])

In [81]:
# Predict on the same texts the model was fitted on, so this is a sanity
# check rather than a held-out evaluation.
model.predict(df["text"])

array(['2', '0', '3', ..., '0', '0', '0'], dtype=object)