## <font color=DarkTurquoise>Import packages</font>

In [24]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import nlp
import tensorflow as tf
import random
import os
%matplotlib inline
import re
from transformers import BertTokenizer
import tensorflow as tf
from transformers import TFBertForSequenceClassification
from sklearn.model_selection import train_test_split

## <font color=DarkTurquoise>Dataset</font>

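We obtain the dataset from [this Kaggle link](https://www.kaggle.com/datasets/parulpandey/emotion-dataset/data), a collection of English Twitter messages categorized into five basic emotions: sadness (0), joy (1), love (2), anger (3), and fear (4). We use labelled tweets to fine-tune a pre-trained BERT model so that it can perform emotion recognition on our movie plot summaries.

**Note:** the cell that loads `data/data_for_training/` is currently disabled. The cells below read `data/tweet_emotions/tweet_emotions.csv` instead, which yields 12 sentiment classes once `empty` is merged into `neutral`; the printed label mapping lists their integer ids.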


In [25]:
# NOTE: this loader is intentionally disabled. The statements below are wrapped
# in a string literal, so running the cell reads no files and defines no
# variables; it only echoes the literal back as the cell output.
'''# Load the training/validation/testing dataset from the specified file path
train = pd.read_csv('data/data_for_training/training.csv')
val = pd.read_csv('data/data_for_training/validation.csv')
test = pd.read_csv('data/data_for_training/test.csv')
train'''

"# Load the training/validation/testing dataset from the specified file path\ntrain = pd.read_csv('data/data_for_training/training.csv')\nval = pd.read_csv('data/data_for_training/validation.csv')\ntest = pd.read_csv('data/data_for_training/test.csv')\ntrain"

In [26]:
# Read the tweet-emotion CSV and fold the 'empty' class into 'neutral'
data = pd.read_csv('data/tweet_emotions/tweet_emotions.csv')
data = data.assign(sentiment=data['sentiment'].replace('empty', 'neutral'))

# Give every distinct sentiment an integer id, numbered in order of first appearance
unique_sentiments = data['sentiment'].unique()
label_mapping = dict(zip(unique_sentiments, range(len(unique_sentiments))))
data['sentiment_label'] = data['sentiment'].map(label_mapping)

print("Label Mapping:", label_mapping)
data

Label Mapping: {'neutral': 0, 'sadness': 1, 'enthusiasm': 2, 'worry': 3, 'surprise': 4, 'love': 5, 'fun': 6, 'hate': 7, 'happiness': 8, 'boredom': 9, 'relief': 10, 'anger': 11}


Unnamed: 0,sentiment,content,sentiment_label
0,neutral,@tiffanylue i know i was listenin to bad habi...,0
1,sadness,Layin n bed with a headache ughhhh...waitin o...,1
2,sadness,Funeral ceremony...gloomy friday...,1
3,enthusiasm,wants to hang out with friends SOON!,2
4,neutral,@dannycastillo We want to trade with someone w...,0
...,...,...,...
39995,neutral,@JohnLloydTaylor,0
39996,love,Happy Mothers Day All my love,5
39997,love,Happy Mother's Day to all the mommies out ther...,5
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,8


In [27]:
# Keep only the model inputs, exposed under the generic text/label column names
df = pd.DataFrame({'text': data['content'], 'label': data['sentiment_label']})

# 80% train; the remaining 20% is halved into validation and test (10% each)
train, temp = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
val, test = train_test_split(
    temp,
    test_size=0.5,
    random_state=42,
)
test

Unnamed: 0,text,label
19316,So glad the days almost over... Another nite o...,3
29330,as landice said; &quot;uhmazing.&quot; you are...,5
8755,@nick_carter It says the video is private,0
2360,@madeofhoney1 im sorry. i dont wanna cuz of ho...,1
14077,"@mercadoasaria I don't know you, but you made ...",0
...,...,...
8904,Flash lost my frisby on a roof. Sad days LOL,1
3615,Aww.. I lost 3 followers. FOLLOW ME !,7
5140,@getape I had bad net issues on Weds so couldn...,1
27383,"Also, I designed the banner for http://mudroom...",5


In [28]:
# Fetch the pre-trained uncased BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Disabled variant, kept as an inert string literal: truncate/pad with max_length=128
'''train_encodings = tokenizer(list(train['text']), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val['text']), truncation=True, padding=True, max_length=128)'''

# Active variant: padding enabled, truncation switched off
train_texts = train['text'].tolist()
val_texts = val['text'].tolist()
train_encodings = tokenizer(train_texts, padding=True, truncation=False)
val_encodings = tokenizer(val_texts, padding=True, truncation=False)


In [29]:
def _as_tf_dataset(encodings, labels):
    """Pair tokenizer output with its labels as a sliced tf.data.Dataset.

    Each element is a ``(features, label)`` tuple, where ``features`` is the
    dict view of ``encodings``.
    """
    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels))


# One dataset per split, built from the encodings and that split's label column
train_dataset = _as_tf_dataset(train_encodings, train['label'])
val_dataset = _as_tf_dataset(val_encodings, val['label'])



In [30]:
# Load a pre-trained BERT model for sequence classification.
# The size of the classification head is taken from label_mapping instead of a
# hard-coded 12, so it always matches the label ids produced when the dataset
# was loaded (e.g. if the CSV or the class merging changes).
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_mapping),
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


After three epochs, the model achieved a loss of 0.1232, accuracy of 94.27%, validation loss of 0.1704, and validation accuracy of 93.35%. We saved the model parameters and tested its performance.

In [31]:
# Configure optimisation: Adam at 5e-5, sparse cross-entropy computed on raw logits
adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
xent_from_logits = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=xent_from_logits, metrics=["accuracy"])

# Batch both splits (16 examples per step); only the training stream is shuffled
train_batches = train_dataset.shuffle(1000).batch(16)
val_batches = val_dataset.batch(16)

# Fine-tune for three epochs; the returned History object is the cell's output
model.fit(train_batches, validation_data=val_batches, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2378facdb50>

We feed a few hard-coded example sentences to the fine-tuned model as a quick sanity check. The predicted labels printed below are the integer ids from the label mapping above.

In [32]:
# Hard-coded sample sentences with no ground-truth labels
unlabeled_texts = [
    "I feel like everything is falling apart.",
    "I can't stop smiling; everything feels perfect.",
    "You mean the world to me, and I cherish every moment with you.",
    "This is completely unacceptable; I am furious.",
    "The thought of losing everything keeps me awake at night.",
]

# Tokenize into TensorFlow tensors, padded together and truncated at max_length=128
unlabeled_encodings = tokenizer(
    unlabeled_texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="tf",
)

# Run the model and pick the highest-scoring class index for each sentence
predictions = model.predict(unlabeled_encodings.data)
class_logits = predictions.logits
predicted_labels = tf.argmax(class_logits, axis=1).numpy()

# Report every sentence next to its predicted class id
for position, text in enumerate(unlabeled_texts):
    label = predicted_labels[position]
    print(f"Text: {text}, Predicted Label: {label}")



Text: I feel like everything is falling apart., Predicted Label: 1
Text: I can't stop smiling; everything feels perfect., Predicted Label: 8
Text: You mean the world to me, and I cherish every moment with you., Predicted Label: 5
Text: This is completely unacceptable; I am furious., Predicted Label: 7
Text: The thought of losing everything keeps me awake at night., Predicted Label: 3


In [33]:
# Persist the fine-tuned weights and the tokenizer files into the same directory
save_dir = "./bert_finetuned_model_1"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./bert_finetuned_model_1\\tokenizer_config.json',
 './bert_finetuned_model_1\\special_tokens_map.json',
 './bert_finetuned_model_1\\vocab.txt',
 './bert_finetuned_model_1\\added_tokens.json')