In [223]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, Trainer
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import classification_report
import re

In [224]:
# Configuration: location of the test CSV and the checkpoint name to evaluate.
DATA = '../Data/Twitter/test.csv'
model_name = "siebert/sentiment-roberta-large-english"

# The classifier and its tokenizer are both loaded from the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Trainer is built with the model only (no explicit arguments are passed);
# in this notebook it is used solely to call `.predict(...)`.
trainer = Trainer(model=model)

In [225]:
# Display the model's class-id -> label-name mapping so the dataset labels can be aligned with it.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [226]:
# Do not truncate column contents when rendering DataFrames, so whole tweets are visible.
pd.options.display.max_colwidth = None

# Read the test set from the path configured in DATA, decoding it with the 'latin' codec.
df = pd.read_csv(DATA, encoding='latin')
df

Unnamed: 0,target,text
0,-1,@ilydi hayuuk but cannot.. exammers on monday. thinking of after siiih
1,1,ben ?u an Ã¯Â¿Â½ekiyorum. RELOADED rulz re: http://ff.im/2ZzUt
2,1,have nice weekend !!guys !!! !!!
3,1,"@smont I got home some time after 2, not sure exactly when. Maybe half 2. Nathan was still there when I left :O Good on him"
4,1,@alyssaaction beach ready in street clothes hahaha! Sexy
...,...,...
19995,-1,"Youth was awesome! Church later! Left my charger there laptop died, rip buddy"
19996,-1,Good morning everyone! Had an awesome dream about Danny.... What a bummer it was just a dream x x x
19997,1,@vahnee btw: very very few of my &quot;followers&quot; are playing along.. :-P if that makes u feel any better..
19998,1,@wikiworf LOL i never followed him anyway... just wondering that's all!


In [227]:
# Rename first, then remap on the renamed column. Both steps are no-ops when the
# cell is executed a second time, so re-running it no longer raises KeyError on 'target'.
df = df.rename(columns={'target': 'label'})

# Map the dataset's -1 class onto 0. The result is assigned back to the column:
# `df[col].replace(..., inplace=True)` mutates a temporary Series and is not
# guaranteed to update `df` (it is a no-op under pandas Copy-on-Write).
df['label'] = df['label'].replace({-1: 0})

# Remove urls
url_pattern = re.compile(r'https?://\S+|www\.\S+|\b\w+\.com\b|\b\w+\.org\b')
#removing usernames
username_pattern = re.compile(r'@\w+')


def strip_urls_and_usernames(text):
    """Return `text` with URL matches removed first, then @username matches removed."""
    return username_pattern.sub('', url_pattern.sub('', text))


df['text'] = df['text'].apply(strip_urls_and_usernames)
df

Unnamed: 0,label,text
0,0,hayuuk but cannot.. exammers on monday. thinking of after siiih
1,1,ben ?u an Ã¯Â¿Â½ekiyorum. RELOADED rulz re:
2,1,have nice weekend !!guys !!! !!!
3,1,"I got home some time after 2, not sure exactly when. Maybe half 2. Nathan was still there when I left :O Good on him"
4,1,beach ready in street clothes hahaha! Sexy
...,...,...
19995,0,"Youth was awesome! Church later! Left my charger there laptop died, rip buddy"
19996,0,Good morning everyone! Had an awesome dream about Danny.... What a bummer it was just a dream x x x
19997,1,btw: very very few of my &quot;followers&quot; are playing along.. :-P if that makes u feel any better..
19998,1,LOL i never followed him anyway... just wondering that's all!


In [228]:
# Count how many rows fall into each value of the 'label' column (class balance check).
df['label'].value_counts()

1    10039
0     9961
Name: label, dtype: int64

In [229]:
def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level `tokenizer`.

    The tokenizer is called with padding="max_length" and truncation=True, and
    its return value is passed back unchanged.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
    )
    return encoded

# Convert the cleaned DataFrame into a `datasets.Dataset` and show its first record as a sanity check.
dataset = Dataset.from_pandas(df)
dataset[0]

{'label': 0,
 'text': ' hayuuk but cannot.. exammers on monday.  thinking of after siiih'}

In [230]:
# Apply the tokenizer over the dataset in batches; the mapped dataset is what gets passed to prediction.
test_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
)
test_dataset

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 20000
})

In [231]:
# Run the model over the tokenized test set; the returned object's `.predictions` is consumed in the next cell.
predictions = trainer.predict(test_dataset)

  0%|          | 0/2500 [00:00<?, ?it/s]

In [232]:
# Raw model outputs; taken once so that class ids and scores are derived from the same array.
# assumes a 2-D array of shape (num_examples, num_classes) — the axis=1 reduction below relies on it
logits = predictions.predictions

# Predicted class id per example, and its human-readable name from the model config.
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)

# Softmax over the class axis. The row-wise maximum is subtracted before exponentiating:
# this leaves the result mathematically unchanged but keeps np.exp from overflowing
# to inf (and the ratio from becoming NaN) when logits are large.
shifted = logits - logits.max(axis=-1, keepdims=True)
probs = np.exp(shifted)
probs = probs / probs.sum(axis=-1, keepdims=True)

# Confidence score = probability of the predicted (most likely) class.
scores = probs.max(axis=1)

In [233]:
# Assemble one row per example: predicted class id, predicted label name, confidence score.
# NOTE: this rebinds `df`, so the tweet DataFrame is no longer reachable under that name
# after this cell, and the 'label' column here holds predicted label names (from `labels`).
result_columns = ['pred', 'label', 'score']
result_rows = list(zip(preds, labels, scores))
df = pd.DataFrame(result_rows, columns=result_columns)
df

Unnamed: 0,pred,label,score
0,0,NEGATIVE,0.994775
1,1,POSITIVE,0.969346
2,1,POSITIVE,0.998875
3,1,POSITIVE,0.998744
4,1,POSITIVE,0.998540
...,...,...,...
19995,0,NEGATIVE,0.972612
19996,0,NEGATIVE,0.997234
19997,0,NEGATIVE,0.998691
19998,0,NEGATIVE,0.998333


In [234]:
# Reference labels are read from the tokenized dataset's 'label' column;
# predictions are the class ids computed from the model outputs.
y_true = test_dataset['label']
y_pred = preds

In [235]:
# Per-class precision / recall / F1; display names are given in class-id order (0, then 1).
report = classification_report(
    y_true,
    y_pred,
    target_names=['negative', 'positive'],
)
print(report)

              precision    recall  f1-score   support

    negative       0.78      0.71      0.74      9961
    positive       0.73      0.80      0.77     10039

    accuracy                           0.75     20000
   macro avg       0.76      0.75      0.75     20000
weighted avg       0.76      0.75      0.75     20000

