# Master Thesis Script
## LEIA Emotion labelling

Load the LEIA base model from the Hugging Face Hub and classify all original (non-retweet) tweets in the dataset.

In [1]:
import numpy as np 
import pandas as pd
from transformers import pipeline
import torch
import regex as re
import transformers
import json
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax



In [4]:
# Read the merged tweet export into a DataFrame. The context manager closes
# the file handle even if parsing the JSON raises.
with open(r'/kaggle/input/twitter-dataset/Merged_Data_20230501_2.json') as f:
    data = pd.read_json(f)

In [5]:
# Keep only rows whose retweet reference is the literal string "None",
# i.e. tweets that are not retweets of another tweet.
data_OG = data.loc[data['referenced_tweets.retweeted.id'].eq("None")]

In [6]:
# Hub identifier shared by the tokenizer, config and model weights.
MODEL_L = "LEIA/LEIA-base"

tokenizer_L = AutoTokenizer.from_pretrained(MODEL_L)
config_L = AutoConfig.from_pretrained(MODEL_L)

# PyTorch variant of the sequence-classification model.
model_L = AutoModelForSequenceClassification.from_pretrained(MODEL_L)

Downloading (…)okenizer_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

In [7]:
# Pick the compute device: CUDA when available, otherwise CPU.
# (This cell only selects the device; nothing is moved onto it here.)
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 2 GPU(s) available.
Device name: Tesla T4


In [8]:
# Pull the tweet bodies out of the filtered frame as a plain Python list.
tweet_text = data_OG["text"].values.tolist()

In [9]:
# Strip emojis (and every other non-ASCII character).
def deEmojify(inputString):
    """Return ``inputString`` with all non-ASCII characters dropped.

    The text is encoded to ASCII with unencodable characters ignored and then
    decoded back, so emojis are removed along with any other character outside
    the ASCII range (for example accented letters).
    """
    ascii_only = inputString.encode('ascii', errors='ignore')
    return ascii_only.decode('ascii')

In [10]:
# Clean every tweet in one pass:
#   1. turn the HTML entity "&amp;" back into "&",
#   2. drop non-ASCII characters via deEmojify,
#   3. replace the two-character sequence backslash + "n" with a space
#      (this targets the escaped text, not an actual newline character),
#   4. remove any remaining backslashes.
tweet_text_lst_clean = [
    deEmojify(re.sub("&amp;", "&", raw_tweet))
    .replace('\\n', ' ')
    .replace('\\', '')
    for raw_tweet in tweet_text
]

In [11]:
def preprocess(text):
    """Mask user mentions and links in ``text``.

    The text is split on single spaces. Any token that starts with ``@`` and
    is longer than one character becomes ``@user``; any token that starts with
    ``http`` becomes ``http``. All other tokens are kept, and the tokens are
    re-joined with single spaces.
    """
    def _mask(token):
        # A token cannot start with both '@' and 'http', so the order of
        # these two checks does not matter.
        if token.startswith('http'):
            return 'http'
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [12]:
# Move the model onto the selected device. As the last expression of the cell,
# the returned module is echoed as the cell output.
model_L.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [13]:
def apply_classifier(text_lst):
    """Score every text with the LEIA model and return per-label probabilities.

    Each text is passed through ``preprocess``, tokenized with ``tokenizer_L``
    and run through ``model_L`` on ``device``, one text at a time.

    Parameters
    ----------
    text_lst : iterable of str
        Texts to classify.

    Returns
    -------
    list of dict
        One dict per input text, mapping each label name from
        ``config_L.id2label`` to its softmax probability. Keys are inserted
        from the most probable to the least probable label.
    """
    # The loaded model is RoBERTa-based: its position ids are offset by the
    # padding index, so two rows of the position-embedding table cannot be
    # used for tokens. Longer inputs would index past the table and crash the
    # forward pass, so they are truncated instead.
    max_tokens = config_L.max_position_embeddings - 2

    emotion_results = []
    # Inference only: disable autograd so no graph is recorded per text.
    with torch.no_grad():
        for text in text_lst:
            text = preprocess(text)
            encoded_input = tokenizer_L(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=max_tokens,
            ).to(device)
            logits = model_L(**encoded_input).logits
            # Single-text batch: take row 0 and normalise it on the CPU.
            scores = softmax(logits[0].cpu().numpy())
            # Label indices sorted from highest to lowest probability.
            ranking = np.argsort(scores)[::-1]
            emotion_results.append(
                {config_L.id2label[int(idx)]: scores[idx] for idx in ranking}
            )
    return emotion_results

In [14]:
# Run the classifier over every cleaned tweet. Despite the "sentiment" in the
# variable name, the result holds one label-to-probability dict per tweet.
tweet_sentiment_results = apply_classifier(tweet_text_lst_clean)

In [15]:
# Build a table with one row per tweet and one column per label key.
df_emotions = pd.DataFrame(tweet_sentiment_results)

In [16]:
# Write the scores to the working directory; with the default arguments the
# row index is written as the first column.
df_emotions.to_csv("df_emotions.csv")