In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

In [2]:
# Create class for data preparation
class SimpleDataset:
    """Index-addressable view over a batch of tokenizer output.

    `tokenized_texts` is a mapping that has an "input_ids" entry and whose
    values are all indexable per example. Item `idx` is a dict with the same
    keys, each holding that example's slice of the corresponding value.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The number of examples is taken from the "input_ids" entry.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

In [3]:
# load tokenizer and model, create trainer
# The same checkpoint name is passed to both from_pretrained calls, so the
# tokenizer and the classification model come from one Hugging Face repository.
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: `model` is rebound to a tweetnlp model in cell In [14]; cell In [7]
# reads model.config.id2label and therefore has to run before that cell.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# The Trainer is created with the model only (no training arguments or
# datasets); the visible cells use it solely for trainer.predict(...).
trainer = Trainer(model=model)

Downloading tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/313M [00:00<?, ?B/s]

In [4]:
# NOTE(review): absolute, user-specific Windows path - the notebook cannot be
# re-run on another machine without editing this line.
path = r'C:\Users\fashaikh\Desktop\Thesis main\partitioned_data\LDSEnclaves.parquet'
# NOTE: `df` is overwritten with the prediction table in cell In [10], so the
# raw frame loaded here is no longer reachable under this name afterwards.
df = pd.read_parquet(path)

# Materialise the 'cleanedContent' column as a plain list; this is the input
# handed to the tokenizer in the next cell.
text = list(df['cleanedContent'])

In [5]:
# Tokenize texts and create prediction data set
# The whole list is tokenized in a single call with truncation and padding
# enabled; the result is wrapped so it can be indexed one example at a time.
tokenized_texts = tokenizer(
    text,
    padding=True,
    truncation=True,
)
pred_dataset = SimpleDataset(tokenized_texts)

In [6]:
# Run predictions
# The returned object is read by the following cells both as
# `predictions.predictions` and as `predictions[0]`; they treat that array as
# per-class logits (argmax for the label, softmax for the scores).
predictions = trainer.predict(pred_dataset)

***** Running Prediction *****
  Num examples = 6831
  Batch size = 8


  0%|          | 0/854 [00:00<?, ?it/s]

In [7]:
# Transform predictions to labels
# `logits` is the raw model output; the original cell reached the same array
# both as predictions.predictions and as predictions[0].
logits = predictions.predictions
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Softmax over the class axis. The row maximum is subtracted before
# exponentiating so np.exp cannot overflow to inf (which would turn the
# normalised scores into NaN); the result is mathematically unchanged.
# The exponentials are also computed once instead of twice.
exp_shifted = np.exp(logits - logits.max(-1, keepdims=True))
probs = exp_shifted / exp_shifted.sum(-1, keepdims=True)
# Confidence of the predicted class = largest probability in each row.
scores = probs.max(1)

In [8]:
# scores raw
# Full per-class probability matrix (one row per text, one column per class).
# Computed as a max-shifted softmax: subtracting each row's maximum before
# np.exp avoids overflow on large logits without changing the result, and the
# exponentials are evaluated once rather than twice.
raw_logits = predictions[0]
shifted_exp = np.exp(raw_logits - raw_logits.max(-1, keepdims=True))
temp = shifted_exp / shifted_exp.sum(-1, keepdims=True)

In [9]:
# work in progress
# Split the probability matrix `temp` into one list per emotion, with one
# entry for every element of `text`.
# The column positions 0..6 are hard-coded here; they are assumed to follow
# the model's id2label order - TODO confirm against model.config.id2label.
row_count = len(text)

anger = [temp[row][0] for row in range(row_count)]
disgust = [temp[row][1] for row in range(row_count)]
fear = [temp[row][2] for row in range(row_count)]
joy = [temp[row][3] for row in range(row_count)]
neutral = [temp[row][4] for row in range(row_count)]
sadness = [temp[row][5] for row in range(row_count)]
surprise = [temp[row][6] for row in range(row_count)]

In [10]:
# Create DataFrame with texts, predictions, labels, and scores
# Column names, in the same order as the sequences zipped together below.
result_columns = [
    'text', 'pred', 'label', 'score',
    'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise',
]
# One tuple per text: the text itself, its predicted class id and label, the
# top score, then the seven per-emotion probabilities.
result_rows = list(zip(
    text, preds, labels, scores,
    anger, disgust, fear, joy, neutral, sadness, surprise,
))
df = pd.DataFrame(result_rows, columns=result_columns)
df.head()

Unnamed: 0,text,pred,label,score,anger,disgust,fear,joy,neutral,sadness,surprise
0,@user WY isn't WY without wild horses. Stop BL...,2,fear,0.77689,0.169959,0.000374,0.77689,0.007091,0.004405,0.03518,0.006101
1,@user BLM must abandon plan to eradicate 40% o...,0,anger,0.428445,0.428445,0.001105,0.420934,0.029228,0.010283,0.10204,0.007966
2,@user @user @user @user @user - as your consti...,0,anger,0.835694,0.835694,0.000482,0.118623,0.011356,0.003269,0.027564,0.003012
3,@user BLM must abandon plan to eradicate 40% o...,0,anger,0.428445,0.428445,0.001105,0.420934,0.029228,0.010283,0.10204,0.007966
4,@user BLM must abandon plan to eradicate 40% o...,0,anger,0.428445,0.428445,0.001105,0.420934,0.029228,0.010283,0.10204,0.007966


In [11]:
# Number of rows per predicted label; 'label' is the argmax class id mapped
# through model.config.id2label in cell In [7].
df['label'].value_counts()

label
anger       2819
fear        1736
neutral     1046
surprise     481
sadness      365
joy          320
disgust       64
Name: count, dtype: int64

In [13]:
import tweetnlp
import tqdm

In [14]:
# MULTI-LABEL MODEL
# Per the log output below, 'emotion' resolves to the checkpoint
# cardiffnlp/twitter-roberta-base-emotion-multilabel-latest.
# NOTE(review): this rebinds `model`, which until here was the Hugging Face
# classifier from cell In [3]. Re-running cell In [7] (which reads
# model.config.id2label) after this cell will use the wrong object.
model = tweetnlp.load_model('emotion')

https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/config.json not found in cache or force_download set to True, downloading to C:\Users\fashaikh\.cache\huggingface\transformers\tmp4yv07br6


Downloading config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

storing https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/config.json in cache at C:\Users\fashaikh/.cache\huggingface\transformers\a093d176a154ca924192427bcbca1a8c313d9519aa9d4bb0d346fc6563649fbc.e4359fd65495b9b21f6e032c53184cb71b81a0ba2ef982df8959daa4fa0e293a
creating metadata file for C:\Users\fashaikh/.cache\huggingface\transformers\a093d176a154ca924192427bcbca1a8c313d9519aa9d4bb0d346fc6563649fbc.e4359fd65495b9b21f6e032c53184cb71b81a0ba2ef982df8959daa4fa0e293a
loading configuration file https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/config.json from cache at C:\Users\fashaikh/.cache\huggingface\transformers\a093d176a154ca924192427bcbca1a8c313d9519aa9d4bb0d346fc6563649fbc.e4359fd65495b9b21f6e032c53184cb71b81a0ba2ef982df8959daa4fa0e293a
Model config RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest",
  "architectures": [
    "RobertaForSequenceClassif

Downloading tokenizer_config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

storing https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/tokenizer_config.json in cache at C:\Users\fashaikh/.cache\huggingface\transformers\e85e9ebc19ce8eef43c1d35ac19e42260dd63737a9e1d3b6d399b301729d4fee.e5993163038bca1cc1c23c854d00149bd80b4fde64fb7f906313dad0886c5783
creating metadata file for C:\Users\fashaikh/.cache\huggingface\transformers\e85e9ebc19ce8eef43c1d35ac19e42260dd63737a9e1d3b6d399b301729d4fee.e5993163038bca1cc1c23c854d00149bd80b4fde64fb7f906313dad0886c5783
https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/vocab.json not found in cache or force_download set to True, downloading to C:\Users\fashaikh\.cache\huggingface\transformers\tmpcpdmxn9g


Downloading vocab.json:   0%|          | 0.00/780k [00:00<?, ?B/s]

storing https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/vocab.json in cache at C:\Users\fashaikh/.cache\huggingface\transformers\5711ed52026b6c6a3990b6b9b61ad36d14d88b69262bbc191a6d13938c592223.bfdcc444ff249bca1a95ca170ec350b442f81804d7df3a95a2252217574121d7
creating metadata file for C:\Users\fashaikh/.cache\huggingface\transformers\5711ed52026b6c6a3990b6b9b61ad36d14d88b69262bbc191a6d13938c592223.bfdcc444ff249bca1a95ca170ec350b442f81804d7df3a95a2252217574121d7
https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/merges.txt not found in cache or force_download set to True, downloading to C:\Users\fashaikh\.cache\huggingface\transformers\tmppmi57zph


Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

storing https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/merges.txt in cache at C:\Users\fashaikh/.cache\huggingface\transformers\a43a5e7fedcd400e0e49e11de9ea1f02114d536afd61f7834d7ec8df68d91ff1.f5b91da9e34259b8f4d88dbc97c740667a0e8430b96314460cdb04e86d4fc435
creating metadata file for C:\Users\fashaikh/.cache\huggingface\transformers\a43a5e7fedcd400e0e49e11de9ea1f02114d536afd61f7834d7ec8df68d91ff1.f5b91da9e34259b8f4d88dbc97c740667a0e8430b96314460cdb04e86d4fc435
https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to C:\Users\fashaikh\.cache\huggingface\transformers\tmp51bgv8ut


Downloading tokenizer.json:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

storing https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/tokenizer.json in cache at C:\Users\fashaikh/.cache\huggingface\transformers\d5196c83ae47649f98f810f831dc984aafa2d16847f7809f1ef929abd857b13c.1d5d530b5229dbca3dfd2235e27250542ef41720aa101041bc4c7a01ea22b470
creating metadata file for C:\Users\fashaikh/.cache\huggingface\transformers\d5196c83ae47649f98f810f831dc984aafa2d16847f7809f1ef929abd857b13c.1d5d530b5229dbca3dfd2235e27250542ef41720aa101041bc4c7a01ea22b470
https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to C:\Users\fashaikh\.cache\huggingface\transformers\tmpady_2af8


Downloading special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

storing https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/special_tokens_map.json in cache at C:\Users\fashaikh/.cache\huggingface\transformers\653dd6788645906ff4b76f83858a49606b2505243ade87c2aa2aa00a83b5a77a.50c9a6a3342271e7e900bb03520d7f844b78e2b2ef8352a0239b688c7d12bdc6
creating metadata file for C:\Users\fashaikh/.cache\huggingface\transformers\653dd6788645906ff4b76f83858a49606b2505243ade87c2aa2aa00a83b5a77a.50c9a6a3342271e7e900bb03520d7f844b78e2b2ef8352a0239b688c7d12bdc6
loading file https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/vocab.json from cache at C:\Users\fashaikh/.cache\huggingface\transformers\5711ed52026b6c6a3990b6b9b61ad36d14d88b69262bbc191a6d13938c592223.bfdcc444ff249bca1a95ca170ec350b442f81804d7df3a95a2252217574121d7
loading file https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/merges.txt from cache at C:\Users\fashaikh/.cache\huggin

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

storing https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/pytorch_model.bin in cache at C:\Users\fashaikh/.cache\huggingface\transformers\38eb574d544ae6dd9583d92cb7b7fb8404faf4e83794c63937e1ffaf80a73a5b.fa8aafca908cdf08d8a16aab630fb2e521c1bfe7305d198cccf065e0c86011e8
creating metadata file for C:\Users\fashaikh/.cache\huggingface\transformers\38eb574d544ae6dd9583d92cb7b7fb8404faf4e83794c63937e1ffaf80a73a5b.fa8aafca908cdf08d8a16aab630fb2e521c1bfe7305d198cccf065e0c86011e8
loading weights file https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest/resolve/main/pytorch_model.bin from cache at C:\Users\fashaikh/.cache\huggingface\transformers\38eb574d544ae6dd9583d92cb7b7fb8404faf4e83794c63937e1ffaf80a73a5b.fa8aafca908cdf08d8a16aab630fb2e521c1bfe7305d198cccf065e0c86011e8
All model checkpoint weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initi

In [15]:
# Same file as cell In [4], read again into a separate frame (`df2`) because
# `df` was overwritten with the prediction table in cell In [10].
# NOTE(review): absolute, user-specific Windows path - not portable.
path = r'C:\Users\fashaikh\Desktop\Thesis main\partitioned_data\LDSEnclaves.parquet'
df2 = pd.read_parquet(path)

# Rebinds `text` to the 'cleanedContent' column of the freshly loaded frame.
text = list(df2['cleanedContent'])

In [17]:
# Run the tweetnlp emotion model on every text, one call per text, keeping the
# results in input order. Each result is a record that json_normalize can
# flatten into columns (including the 'label' column used in the next cell).
res = [model.emotion(tweet, return_probability=True) for tweet in text]

data = pd.json_normalize(res)

# pd.concat(axis=1) pairs rows by index label, not by position, and
# json_normalize always produces a fresh 0..n-1 index. Give `data` the index of
# `df2` so that row i of the predictions is attached to row i of the source
# frame even when the parquet file carries a non-default index. With a default
# index this is a no-op; a length mismatch raises instead of silently
# producing misaligned / NaN-padded rows.
data.index = df2.index

df3 = pd.concat(objs=[df2, data], axis=1)

In [18]:
# Number of rows per label returned by the tweetnlp emotion model (the 'label'
# column comes from the flattened model.emotion(...) results in cell In [17]).
df3['label'].value_counts()

label
anger           3251
anticipation    1363
joy              667
disgust          520
optimism         512
fear             390
sadness          125
surprise           3
Name: count, dtype: int64