<a href="https://colab.research.google.com/github/chinmayeeadiga/emotion-detection/blob/main/Project_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**EMOTION DETECTION FROM TEXT**

Number of emotion classes : 6

Types of emotion classes :
*   Sadness
*   Joy
*   Love
*   Anger
*   Fear
*   Surprise

Dataset Used : Emotion Dataset by dair-ai

Model Used : Convolutional Neural Network (CNN) with pre-trained GloVe (glove.6B, 100-dimensional) word embeddings

In [None]:
#importing dependencies
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
# Download the NLTK resources this notebook relies on later:
#   - 'punkt' / 'punkt_tab': tokenizer data, presumably what nltk.word_tokenize needs
#     (newer NLTK releases appear to look for 'punkt_tab' -- confirm for your version)
#   - 'stopwords': the English stop-word list read via stopwords.words('english')
#   - 'wordnet': the corpus backing WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

In [None]:
# Load the train / validation / test splits of the dair-ai/emotion dataset
# straight from the Hugging Face hub (parquet files).
DATASET_ROOT = "hf://datasets/dair-ai/emotion/"
splits = {
    'train': 'split/train-00000-of-00001.parquet',
    'validation': 'split/validation-00000-of-00001.parquet',
    'test': 'split/test-00000-of-00001.parquet',
}

X_train = pd.read_parquet(DATASET_ROOT + splits["train"])
X_val = pd.read_parquet(DATASET_ROOT + splits["validation"])
X_test = pd.read_parquet(DATASET_ROOT + splits["test"])

In [None]:
# cleaning the data
!pip install wordcloud emoji
import re
import emoji
import string
# Shared resources for text cleaning: a WordNet lemmatizer and the English
# stop-word list held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize one raw text string for the classifier.

    Steps, in order: lowercase, strip URLs and @mentions, turn emojis into
    their textual names, remove punctuation, tokenize, drop stop words and
    non-alphabetic tokens, lemmatize what remains.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    without_links = re.sub(r"http\S+|www\S+|@\w+", '', lowered)
    demojized = emoji.demojize(without_links, delimiters=(" ", " "))
    punctuation_free = demojized.translate(str.maketrans('', '', string.punctuation))

    kept = []
    for word in nltk.word_tokenize(punctuation_free):
        # Skip stop words and anything that is not purely alphabetic.
        if word in stop_words or not word.isalpha():
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)

# Add a cleaned-text column to every split (same order as before: train, test, val).
for frame in (X_train, X_test, X_val):
    frame['clean_text'] = frame['text'].apply(preprocess)

In [None]:
# Separate targets from features.
# Targets: the integer emotion label of each split.
y_train, y_test, y_val = X_train['label'], X_test['label'], X_val['label']

# Features: keep only the cleaned text, as single-column DataFrames.
X_train = X_train.loc[:, ['clean_text']]
X_test = X_test.loc[:, ['clean_text']]
X_val = X_val.loc[:, ['clean_text']]

**VISUALIZATION**

In [None]:
#bar plots
import matplotlib.pyplot as plt
import seaborn as sns

# Class balance of the training labels, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=y_train, palette='Set2', ax=ax)
ax.set_title("Label Distribution in y_train")
ax.set_xlabel("Emotion Label")
ax.set_ylabel("Number of Samples")
plt.show()

In [None]:
#word clouds
from wordcloud import WordCloud

# One big string containing every cleaned training sentence.
text = " ".join(X_train['clean_text'])

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Render the cloud without axes so only the words are visible.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
ax.set_title("Word Cloud of X_train")
plt.show()

**WORD EMBEDDINGS**

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
# Parse the 100-dimensional GloVe file into {word: float32 vector}.
# Each line holds a token followed by its vector components, whitespace-separated.
embeddings_index = {}
with open("glove.6B.100d.txt", encoding="utf8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Tokenization: build the vocabulary from the training text only, then map
# every split to sequences of integer word ids (unknown words -> '<OOV>').
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train['clean_text'])

X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(frame['clean_text'])
    for frame in (X_train, X_val, X_test)
)

In [None]:
# Padding: bring every sequence to the length of the longest training
# sequence, appending zeros at the end ('post').
max_len = max(map(len, X_train_seq))
pad_options = dict(maxlen=max_len, padding='post')

X_train = pad_sequences(X_train_seq, **pad_options)
X_val = pad_sequences(X_val_seq, **pad_options)
X_test = pad_sequences(X_test_seq, **pad_options)

In [None]:
# GloVe embeddings
# The vectors were already parsed into `embeddings_index` in an earlier cell.
# Reuse that dictionary instead of reading the same file a second time: this
# avoids holding two copies of the vectors in memory and removes the dependency
# on the Colab-specific absolute path '/content/glove.6B.100d.txt'.
embedding_index = embeddings_index

In [None]:
# Embedding matrix: row i holds the GloVe vector of the word with tokenizer
# index i. Words without a GloVe vector (and the padding row 0) stay all-zero.
embedding_dim = 100
word_index = tokenizer.word_index
num_words = min(20000, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, embedding_dim))
for token, row in word_index.items():
    # Ignore words beyond the vocabulary cap and words GloVe does not know.
    if row < num_words and token in embedding_index:
        embedding_matrix[row] = embedding_index[token]

**CNN MODEL**

In [None]:
# One-hot encode the integer labels of every split; the number of classes is
# taken from the distinct labels seen in the training set.
num_classes = y_train.nunique()

y_train_new, y_val_new, y_test_new = (
    to_categorical(labels, num_classes=num_classes)
    for labels in (y_train, y_val, y_test)
)

In [None]:
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Concatenate
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

def _conv_branch(embedded, kernel_size, filters=128):
    """Build one convolutional branch on top of the embedded sequence.

    Applies Conv1D (ReLU) -> BatchNormalization -> GlobalMaxPooling1D.

    Args:
        embedded: output tensor of the embedding layer.
        kernel_size: width of the convolution window (in tokens).
        filters: number of convolution filters.

    Returns:
        The pooled tensor of this branch.
    """
    features = Conv1D(filters, kernel_size=kernel_size, activation='relu')(embedded)
    features = BatchNormalization()(features)
    return GlobalMaxPooling1D()(features)


input_layer = Input(shape=(max_len,))

# Embedding initialised from the GloVe matrix and fine-tuned during training.
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input layer, and the argument is deprecated in recent Keras.
embedding_layer = Embedding(input_dim=num_words,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            trainable=True)(input_layer)

# Three parallel branches looking at 3-, 4- and 5-token windows.
pool_3 = _conv_branch(embedding_layer, kernel_size=3)
pool_4 = _conv_branch(embedding_layer, kernel_size=4)
pool_5 = _conv_branch(embedding_layer, kernel_size=5)

merged = Concatenate()([pool_3, pool_4, pool_5])

# Classification head.
dropout = Dropout(0.5)(merged)
dense = Dense(128, activation='relu')(dropout)
output = Dense(num_classes, activation='softmax')(dense)

model = Model(inputs=input_layer, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


In [None]:
# Callbacks, both watching the validation loss:
#   - checkpoint: write the best model seen so far to 'best_cnn_model.h5'
#   - early_stop: stop after 5 epochs without improvement and restore the best weights
checkpoint = ModelCheckpoint('best_cnn_model.h5', monitor='val_loss', save_best_only=True)
early_stop = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)

history = model.fit(
    x=X_train,
    y=y_train_new,
    validation_data=(X_val, y_val_new),
    batch_size=32,
    epochs=20,
    callbacks=[early_stop, checkpoint],
)


In [None]:
# Class probabilities for the test set, reduced to the most likely class id;
# the one-hot test labels are collapsed back to class ids the same way.
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)
y_true = y_test_new.argmax(axis=1)

In [None]:
# Evaluation on the test set: macro-averaged precision / recall / F1,
# overall accuracy, and the per-class report.
precision, recall, f1 = (
    scorer(y_true, y_pred, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)
accuracy = accuracy_score(y_true, y_pred)

summary_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1 Score:  {f1:.4f}",
]
print("\n".join(summary_lines))
print("\nClassification Report:")
print(classification_report(y_true, y_pred))

**USING THE MODEL**

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
# Take the sequence length from the model itself rather than hard-coding it:
# the network was built with Input(shape=(max_len,)), so padding inference
# inputs to any other length would not match what the model expects.
max_len = model.input_shape[1]

def preprocess_text(text):
    """Clean user-supplied text exactly as the training data was cleaned.

    Delegates to `preprocess`, the function used to build the training
    corpus, so that inference inputs go through the same steps (URL and
    @mention removal, emoji conversion, punctuation stripping, stop-word and
    non-alphabetic token filtering, lemmatization). Keeping a single
    implementation prevents the training and inference pipelines from
    drifting apart.

    Args:
        text: raw input string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    return preprocess(text)

import gradio as gr
from sklearn.preprocessing import LabelEncoder

# Encoder fitted on the training labels; its `classes_` attribute is used to
# translate model output positions back to label values.
label_encoder = LabelEncoder().fit(y_train)

# Human-readable names for the integer labels of the dair-ai/emotion dataset
# (same order as the class list at the top of this notebook).
EMOTION_NAMES = {
    0: "Sadness",
    1: "Joy",
    2: "Love",
    3: "Anger",
    4: "Fear",
    5: "Surprise",
}


def predict_emotion(text, threshold=0.3):
    """Predict the emotion(s) expressed in a piece of text.

    The text is cleaned, tokenized and padded like the training data, then
    scored by the CNN. Every class whose probability exceeds `threshold` is
    reported.

    Args:
        text: raw input string.
        threshold: minimum class probability for an emotion to be reported.

    Returns:
        A comma-separated string of emotion names, or "Neutral" when no class
        exceeds the threshold.
    """
    cleaned = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len, padding="post")
    # verbose=0 keeps the Keras progress bar out of the widget output.
    pred = model.predict(padded, verbose=0)[0]

    top_indices = np.where(pred > threshold)[0]
    if len(top_indices) == 0:
        return "Neutral"

    # The encoder was fitted on integer labels, so translate them to names;
    # fall back to the raw label for anything not in the mapping.
    labels = label_encoder.classes_[top_indices]
    emotions = [EMOTION_NAMES.get(label, str(label)) for label in labels]
    return ", ".join(emotions)


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Minimal interactive demo: a text area, a button and an output area.
input_box = widgets.Textarea(
    placeholder='Enter text here',
    layout=widgets.Layout(width='100%', height='100px'),
)
button = widgets.Button(description='Detect Emotion')
output_box = widgets.Output()


def on_click(b):
    """Button handler: replace the output area with the prediction for the current text."""
    output_box.clear_output()
    entered_text = input_box.value
    with output_box:
        print(predict_emotion(entered_text))


button.on_click(on_click)

display(input_box, button, output_box)


In [None]:
import pickle
from google.colab import files

# Export everything needed to serve the model: the best checkpoint first,
# then the fitted tokenizer and label encoder (pickled, then downloaded).
files.download("best_cnn_model.h5")

for artifact, filename in ((tokenizer, 'tokenizer.pkl'),
                           (label_encoder, "label_encoder.pkl")):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)
    files.download(filename)