<a href="https://colab.research.google.com/github/delphi12/NLP_Project/blob/main/Bert_GoEmotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import nltk

# Bag of words and Tokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF
from sklearn.preprocessing import label_binarize

# Classification methods
from sklearn.naive_bayes import MultinomialNB   # Naive Bayes
from sklearn.tree import DecisionTreeClassifier  # Decision Tree
from sklearn.neural_network import MLPClassifier # Multi-Layer Perceptron
from sklearn.neighbors import KNeighborsClassifier # KNN


# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import classification_report

Loading the data (train, dev and test splits)

In [3]:
# Training split: tab-separated, no header row, columns are text / label / id.
data_path = 'Data/'
rdfTrain = pd.read_csv(
    data_path + 'train.tsv',
    sep='\t',
    header=None,
    names=['text', 'label', 'id'],
)
rdfTrain.head()

Unnamed: 0,text,label,id
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


In [4]:
# Validation (dev) split, read with the same layout as the training file.
data_path = 'Data/'
rdfDev = pd.read_csv(
    data_path + 'dev.tsv',
    sep='\t',
    header=None,
    names=['text', 'label', 'id'],
)
rdfDev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5426 non-null   object
 1   label   5426 non-null   object
 2   id      5426 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


In [5]:
# Held-out test split, read with the same layout as the training file.
data_path = 'Data/'
rdfTest = pd.read_csv(
    data_path + 'test.tsv',
    sep='\t',
    header=None,
    names=['text', 'label', 'id'],
)
rdfTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5427 entries, 0 to 5426
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5427 non-null   object
 1   label   5427 non-null   object
 2   id      5427 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


In [6]:
def label_neutral(row):
    '''
    Binary label for a row: 1 when its 'label' field is exactly '27', else 0.

    The comparison is against the whole field, so a comma-separated
    multi-label value that merely contains 27 yields 0.
    '''
    return int(row['label'] == '27')

In [7]:
# Emotions counted as positive sentiment.
pos_labels = [
    'admiration', 'approval', 'amusement', 'caring',
    'desire', 'excitement', 'gratitude', 'joy',
    'love', 'optimism', 'pride', 'relief',
]

# Emotions counted as negative sentiment.
neg_labels = [
    'anger', 'annoyance', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'fear', 'grief',
    'nervousness', 'remorse', 'sadness',
]

# Emotions that are neither clearly positive nor clearly negative.
ambi_labels = [
    'confusion', 'curiosity', 'realization', 'surprise',
]

In [8]:
# Emotion name -> label id (as a string). The id is simply the position of the
# emotion in the list below, so the order of this list must not change.
emotion_to_idx = {
    emotion: str(position)
    for position, emotion in enumerate([
        'admiration', 'amusement', 'anger', 'annoyance',
        'approval', 'caring', 'confusion', 'curiosity',
        'desire', 'disappointment', 'disapproval', 'disgust',
        'embarrassment', 'excitement', 'fear', 'gratitude',
        'grief', 'joy', 'love', 'nervousness',
        'optimism', 'pride', 'realization', 'relief',
        'remorse', 'sadness', 'surprise', 'neutral',
    ])
}

In [9]:
# Inverse lookup of emotion_to_idx: label id (string) -> emotion name.
label_to_emotion = dict(zip(emotion_to_idx.values(), emotion_to_idx.keys()))
label_to_emotion

{'0': 'admiration',
 '1': 'amusement',
 '2': 'anger',
 '3': 'annoyance',
 '4': 'approval',
 '5': 'caring',
 '6': 'confusion',
 '7': 'curiosity',
 '8': 'desire',
 '9': 'disappointment',
 '10': 'disapproval',
 '11': 'disgust',
 '12': 'embarrassment',
 '13': 'excitement',
 '14': 'fear',
 '15': 'gratitude',
 '16': 'grief',
 '17': 'joy',
 '18': 'love',
 '19': 'nervousness',
 '20': 'optimism',
 '21': 'pride',
 '22': 'realization',
 '23': 'relief',
 '24': 'remorse',
 '25': 'sadness',
 '26': 'surprise',
 '27': 'neutral'}

In [10]:
def label_pos_neg_neutral(row):
    '''
    Collapse a row's comma-separated emotion ids into one sentiment class.

    Returned class:
    0 - Negative
    1 - Positive
    2 - Neutral/Ambiguous (anything in neither pos_labels nor neg_labels)

    Every id in row['label'] casts one vote; the class with the most votes
    is returned, and on a tie the lowest class index wins (np.argmax).
    '''
    negative = positive = other = 0
    for code in row['label'].split(","):
        emotion = label_to_emotion[code]
        if emotion in neg_labels:
            negative += 1
        elif emotion in pos_labels:
            positive += 1
        else:
            other += 1
    return np.argmax(np.array([negative, positive, other]))

In [11]:
# Coarse emotion groups used by label_emotion_group.
anger_list = ["anger", "annoyance", "disapproval", "disgust"]
fear_list = ["fear", "nervousness"]
joy_list = [
    "joy", "amusement", "approval", "excitement",
    "gratitude", "love", "optimism", "relief",
    "pride", "admiration", "desire", "caring",
]
sadness_list = [
    "sadness", "disappointment", "embarrassment", "grief", "remorse",
]
surprise_list = ["surprise", "realization", "confusion", "curiosity"]

In [12]:
def label_emotion_group(row):
    '''
    Map a row's comma-separated emotion ids onto one coarse emotion group.

    Returned group:
    0 - Anger, 1 - Fear, 2 - Joy,
    3 - Sadness, 4 - Surprise, 5 - Neutral/Ambiguous

    As soon as the id '27' is encountered the row is reported as group 5.
    Otherwise each id votes for its group and the group with the most votes
    is returned; on a tie the lowest group index wins (np.argmax).
    '''
    groups = (anger_list, fear_list, joy_list, sadness_list, surprise_list)
    votes = np.zeros(6, dtype=int)
    for code in row['label'].split(","):
        if code == '27':
            return 5 # Neutral

        emotion = label_to_emotion[code]
        # Index of the first group containing the emotion; 5 if none does.
        bucket = next(
            (position for position, members in enumerate(groups) if emotion in members),
            5,
        )
        votes[bucket] += 1
    return np.argmax(votes)

In [13]:
# Original label id -> merged class id (both strings). The merged class id is
# the position of the original id in the list below; the comment on each entry
# names the emotions folded into that class.
label_idx = {
    original_id: str(merged_id)
    for merged_id, original_id in enumerate([
        '0',   # admiration, desire
        '10',  # disapproval, disgust, disappointment, embarrassment
        '2',   # anger, annoyance
        '13',  # excitement, amusement
        '18',  # love, caring
        '4',   # approval
        '15',  # gratitude
        '7',   # curiosity
        '25',  # sadness, grief, remorse
        '17',  # joy, pride, relief
        '20',  # optimism
        '6',   # confusion
        '22',  # realization
        '26',  # surprise
        '14',  # fear, nervousness
    ])
}

In [14]:
# Ignoring neutral and merging emotions
def multi_class(df):
    '''
    Expand a multi-label frame into one (text, merged class) row per label.

    Each row's 'label' field is split on commas. Neutral labels are dropped,
    closely related emotions are folded into a representative emotion, and
    the representative's original id is translated to a merged class id
    through label_idx.

    Parameters
    ----------
    df : DataFrame with a 'text' column and a comma-separated 'label' column.

    Returns
    -------
    DataFrame with columns ['text', 'labels'], where 'labels' holds the merged
    class id as a string. Empty when no non-neutral label is present.
    '''
    # Emotion -> representative emotion it is merged into. Emotions not listed
    # here keep their own class.
    merged_into = {
        'desire': 'admiration',
        'amusement': 'excitement',
        'pride': 'joy',
        'relief': 'joy',
        'caring': 'love',
        'embarrassment': 'disapproval',
        'disgust': 'disapproval',
        'disappointment': 'disapproval',
        'nervousness': 'fear',
        'remorse': 'sadness',
        'grief': 'sadness',
        'annoyance': 'anger',
    }

    records = []
    for text, raw_labels in zip(df['text'], df['label']):
        for raw_label in raw_labels.split(","):
            emotion = label_to_emotion[raw_label]
            if emotion == 'neutral':
                continue
            representative = merged_into.get(emotion, emotion)
            records.append((text, label_idx[emotion_to_idx[representative]]))

    # Building the frame straight from the records keeps the declared columns
    # even when there are no rows, and avoids an intermediate fixed-width
    # string array sized by the longest text.
    return pd.DataFrame(records, columns=['text', 'labels'])

In [15]:
def emotion_label(df):
    '''
    Expand a multi-label frame into one (text, label) row per label.

    Parameters
    ----------
    df : DataFrame with a 'text' column and a comma-separated 'label' column.

    Returns
    -------
    DataFrame with columns ['text', 'labels']; a text carrying k labels
    appears k times, once per label (labels stay strings). An empty input
    gives an empty frame with the same two columns.
    '''
    records = [
        (text, label)
        for text, raw_labels in zip(df['text'], df['label'])
        for label in raw_labels.split(",")
    ]
    # Building the frame straight from the records keeps the declared columns
    # even when there are no rows, and avoids an intermediate fixed-width
    # string array sized by the longest text.
    return pd.DataFrame(records, columns=['text', 'labels'])

In [16]:
def transformData(rdfTrain, rdfDev, rdfTest, n_categories = 2):
    '''
    Derive a 'labels' column for the train/dev/test frames.

    Parameters
    ----------
    rdfTrain, rdfDev, rdfTest : raw frames with 'text' and 'label' columns.
        They are not modified; labelled copies are returned instead.
    n_categories : labelling scheme to apply
        2  - label_neutral (neutral vs. rest)
        3  - label_pos_neg_neutral (negative / positive / neutral-ambiguous)
        6  - label_emotion_group (coarse emotion groups)
        28 - multi_class (one row per non-neutral label, with related
             emotions merged into the classes defined by label_idx)

    Returns
    -------
    (dfTrain, dfDev, dfTest) each carrying a 'labels' column.

    Raises
    ------
    ValueError if n_categories is not one of the supported schemes.
    '''
    if n_categories == 2:
        row_labeler = label_neutral
    elif n_categories == 3:
        row_labeler = label_pos_neg_neutral
    elif n_categories == 6:
        row_labeler = label_emotion_group
    elif n_categories == 28:
        row_labeler = None  # handled frame-wise by multi_class below
    else:
        raise ValueError(
            f"Unsupported n_categories={n_categories!r}; expected one of 2, 3, 6, 28"
        )

    raw_frames = (rdfTrain, rdfDev, rdfTest)
    if row_labeler is None:
        dfTrain = multi_class(rdfTrain)
        print(dfTrain.head())
        dfDev = multi_class(rdfDev)
        dfTest = multi_class(rdfTest)
    else:
        # assign() returns a new frame, so the caller's raw frames stay untouched.
        dfTrain, dfDev, dfTest = (
            frame.assign(labels=frame.apply(row_labeler, axis=1))
            for frame in raw_frames
        )

    print("Training distribution: ", dfTrain.labels.value_counts())
    print("Dev data distribution: ", dfDev.labels.value_counts())
    print("Test data distribution: ", dfTest.labels.value_counts())

    return dfTrain, dfDev, dfTest

In [17]:
# n_categories = 28 selects the multi_class() branch of transformData, which
# drops neutral labels and emits one row per remaining (merged) label.
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 28)

                                                text labels
0                     WHY THE FUCK IS BAYLESS ISOING      2
1                        To make her feel threatened     14
2                             Dirty Southern Wankers      2
3  OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...     13
4  Yes I heard abt the f bombs! That has to be wh...      6
Training distribution:  labels
0     4771
1     4387
2     4037
3     3181
4     3173
5     2939
6     2662
7     2191
8     1948
9     1716
10    1581
11    1368
12    1110
13    1060
14     760
Name: count, dtype: int64
Dev data distribution:  labels
1     587
0     565
2     498
4     405
3     399
5     397
6     358
7     248
8     224
10    209
9     205
11    152
13    129
12    127
14    111
Name: count, dtype: int64
Test data distribution:  labels
0     587
1     578
2     518
4     373
3     367
6     352
5     351
7     284
8     218
9     188
10    186
11    153
12    145
13    141
14    101
Name: count, dtype: int64


In [18]:
# Keep only the two columns the model needs, for all three splits.
dfTrain, dfDev, dfTest = (
    split[['text', 'labels']] for split in (dfTrain, dfDev, dfTest)
)

In [42]:
import os

# Persist the relabelled splits as header-less, tab-separated files.
data_path = 'Data/multi_class_15/'
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs(data_path, exist_ok=True)
dfTrain.to_csv(data_path + 'train.csv', sep='\t', header=False, index=False)
dfDev.to_csv(data_path + 'dev.csv', sep='\t', header=False, index=False)
dfTest.to_csv(data_path + 'test.csv', sep='\t', header=False, index=False)

In [19]:
# Hold out 20% of the training rows as a validation set (fixed seed for repeatability).
# NOTE(review): dfDev is loaded above but not used for validation here — confirm this is intended.
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    dfTrain['text'],
    dfTrain['labels'],
    test_size=0.2,
    random_state=1011,
)


In [20]:
# Identifier of the pretrained checkpoint; passed to from_pretrained for both
# the model and the tokenizer.
llm_model = "google-bert/bert-base-cased"

In [21]:
# Pretrained BERT encoder with a fresh classification head, plus its tokenizer.
from transformers import BertForSequenceClassification, BertTokenizer

# NOTE(review): the head is sized for 28 labels, while label_idx (used by
# multi_class) defines 15 merged classes — confirm the intended head size.
# The saved model is later reloaded with num_labels=28, so both places must agree.
model = BertForSequenceClassification.from_pretrained(
    llm_model,
    num_labels=28,
)
tokenizer = BertTokenizer.from_pretrained(llm_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [22]:
# Tokenize each split; truncate over-long texts and pad each split to its longest sequence.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(dfTest['text'].tolist(), truncation=True, padding=True)


In [23]:
import torch
class CustomDataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings and, optionally, their labels.

    Parameters
    ----------
    encodings : mapping from field name (must include "input_ids") to a
        per-example sequence. Anything offering ``items()`` and key lookup
        works, including a plain dict.
    labels : optional pandas Series of labels convertible with ``int()``.
        When omitted, items carry no 'labels' entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the idx-th example as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            # .iloc is positional, so a Series whose index was shuffled by a
            # train/validation split still lines up with the encodings.
            label = int(self.labels.iloc[idx])  # Convert label to integer
            item['labels'] = torch.tensor(label)
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        # Key lookup instead of attribute access, so plain dicts are accepted too.
        return len(self.encodings["input_ids"])

In [24]:
# Labelled datasets for training/validation; the test dataset carries no labels.
train_dataset = CustomDataset(train_encodings, labels=train_labels)
val_dataset = CustomDataset(val_encodings, labels=val_labels)
test_dataset = CustomDataset(test_encodings, labels=None)

In [25]:
# Training configuration for the Trainer.
# NOTE(review): no evaluation or checkpoint-saving options are set here, so
# those follow the library defaults — confirm that is intended.
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Length of the run and learning-rate warmup.
    num_train_epochs=3,
    warmup_steps=500,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Strength of weight decay.
    weight_decay=0.01,
    # Log every 10 steps.
    logging_steps=10,
)

In [26]:
# Trainer wiring the model, its arguments and the train/validation datasets together.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [27]:
# Run fine-tuning with the trainer built in the previous cell.
trainer.train()

Step,Training Loss
10,3.5087
20,3.4876
30,3.4924
40,3.446
50,3.3677
60,3.3385
70,3.2952
80,3.2265
90,3.1226
100,3.0939


TrainOutput(global_step=11067, training_loss=1.2101687730096529, metrics={'train_runtime': 1759.4047, 'train_samples_per_second': 50.313, 'train_steps_per_second': 6.29, 'total_flos': 3458043170099424.0, 'train_loss': 1.2101687730096529, 'epoch': 3.0})

In [28]:
# Directory the fine-tuned model is saved to (and later reloaded from).
# NOTE(review): no checkpoint selection is visible in this notebook, so this
# appears to save the state the trainer holds after train() — confirm.
best_model_path = 'Model'
trainer.save_model(best_model_path)

In [30]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the fine-tuned model from disk. The label count is taken from the
# configuration stored with the saved model rather than repeated here, so the
# classification head always matches the saved weights.
best_model = AutoModelForSequenceClassification.from_pretrained(best_model_path)

In [31]:
def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score for a batch of predictions.

    Parameters
    ----------
    eval_pred : pair ``(predictions, labels)`` where ``predictions`` is a 2-D
        array of per-class scores (one row per example) and ``labels`` holds
        the true class id of each example.

    Returns
    -------
    dict with a single key ``"f1"`` mapping to the score as a float.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    # With exactly one true and one predicted class per example, every wrong
    # prediction is one false positive and one false negative, so micro-averaged
    # precision, recall and F1 all equal the fraction of correct predictions.
    # Computing it directly needs no extra package and no metric download.
    micro_f1 = float(np.mean(predicted == np.asarray(labels)))

    return {"f1": micro_f1}

In [32]:
# Trainer around the reloaded model, used for prediction; rebinds `trainer`.
trainer = Trainer(
    model=best_model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [33]:
# Run the model over the test dataset.
predictions = trainer.predict(test_dataset=test_dataset)

# Highest-scoring class per example.
predicted_labels = np.argmax(predictions.predictions, axis=1)

In [35]:
# Test texts with their true label and the model's predicted label, side by side.
results_df = (
    dfTest[['text', 'labels']]
    .rename(columns={'labels': 'label'})
    .assign(predicted_label=predicted_labels)
)

# Persist the comparison table.
results_df.to_csv('predictions.csv', index=False)

In [43]:
import pandas as pd

# Load the predictions CSV file
# NOTE(review): the following cell performs this same load before computing
# accuracy, so this cell looks redundant.
predictions_df = pd.read_csv('predictions.csv')

In [48]:
import pandas as pd

# Read the saved predictions back from disk.
predictions_df = pd.read_csv('predictions.csv')

# Accuracy = matching rows / all rows.
total_predictions = len(predictions_df)
correct_predictions = predictions_df['predicted_label'].eq(predictions_df['label']).sum()
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.60
