In [None]:
import nltk

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled comments; the path is relative to the notebook's working directory.
df = pd.read_csv(r"Emotion_classify_Data.csv")

In [None]:
# Preview the first rows to check the column layout (Comment text, Emotion label).
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [None]:
# Number of distinct emotion labels, i.e. how many target classes there are.
df['Emotion'].nunique()

3

In [None]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5937 entries, 0 to 5936
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  5937 non-null   object
 1   Emotion  5937 non-null   object
dtypes: object(2)
memory usage: 92.9+ KB


In [None]:
# Summary of the two object columns: count, number of unique values, most frequent value and its frequency.
df.describe()

Unnamed: 0,Comment,Emotion
count,5937,5937
unique,5934,3
top,i feel pretty tortured because i work a job an...,anger
freq,2,2000


In [None]:
# List the distinct label values themselves.
df['Emotion'].unique()

array(['fear', 'anger', 'joy'], dtype=object)

In [None]:
# Missing values per column.
df.isnull().sum()

Unnamed: 0,0
Comment,0
Emotion,0


In [None]:
# Count exact duplicate rows and, separately, repeated comment texts.
# The full-row check alone can miss the same comment appearing more than once with
# different Emotion labels (describe() reports fewer unique comments than rows),
# and such repeats can end up on both sides of the train/validation/test split.
pd.Series(
    {
        "duplicate_rows": df.duplicated().sum(),
        "duplicate_comments": df.duplicated(subset=["Comment"]).sum(),
    }
)

np.int64(0)

In [None]:
# Class balance of the target column.
df['Emotion'].value_counts()

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
anger,2000
joy,2000
fear,1937


In [None]:
import re
import spacy

# Load spaCy model.
# Preprocessing only reads token.lemma_ and token.is_alpha, so the dependency parser
# and the named-entity recognizer are switched off; they are the components this
# notebook never uses and skipping them makes nlp(text) noticeably cheaper.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


In [None]:
# Drop rows whose comment is missing, then coerce the remaining comments to str
df = (
    df
    .dropna(subset=["Comment"])
    .assign(Comment=lambda frame: frame["Comment"].astype(str))
)

In [None]:
# Normalization: lowercase every comment
df = df.assign(Comment=df["Comment"].str.lower())

In [None]:
def spacy_preprocess(text):
    """Return the space-joined lemmas of the alphabetic tokens in ``text``.

    The text is lowercased before being run through the module-level ``nlp``
    pipeline. Tokens that are not purely alphabetic (punctuation, numbers)
    are dropped. Any non-string input yields an empty string.
    """
    if isinstance(text, str):
        lemmas = [tok.lemma_ for tok in nlp(text.lower()) if tok.is_alpha]
        return " ".join(lemmas)
    return ""

In [None]:
# Run every comment through spacy_preprocess and keep the result in a new column
df["clean_text"] = df["Comment"].map(spacy_preprocess)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the preprocessed text; the target is the emotion label.
X, y = df["clean_text"], df["Emotion"]

# First cut: keep 80% for training, hold out 20%, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=42
)

# Second cut: split the held-out 20% evenly into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.50, random_state=42
)

In [None]:
# Sizes of the train / validation / test splits, in that order
for split in (X_train, X_val, X_test):
    print(split.shape)

(4749,)
(594,)
(594,)


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 50  # reasonable for most sentences

# The vocabulary is fitted on the training split only; words it has not seen
# are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Text -> lists of integer word ids
X_train_seq, X_val_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val)
)

# Bring every sequence to MAX_LEN, padding at the end of short ones
X_train_pad, X_val_pad = (
    pad_sequences(seqs, maxlen=MAX_LEN, padding="post")
    for seqs in (X_train_seq, X_val_seq)
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the label -> integer mapping on the training labels, then reuse that
# same mapping for all three splits.
le = LabelEncoder().fit(y_train)
y_train_enc, y_val_enc, y_test_enc = (
    le.transform(labels) for labels in (y_train, y_val, y_test)
)

In [None]:
# Quick look at the padded id matrix: overall mean and standard deviation,
# then the unique columns of the first five rows.
for stat in (X_train_pad.mean(), X_train_pad.std()):
    print(stat)
first_rows = X_train_pad[:5]
print(np.unique(first_rows, axis=1))

154.23768793430196
639.4956968290884
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    2    2    3    4    8   10   21
    63  113  138 1090 1669]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    4  324 1670    2  733    9    4    5  284  513
    14  246    7 2095    3]
 [   0    2    2    7   12   18   21   30   44   46   77   81  194  223
   334  852 1415 2096    8   72   80    2   16    3   62  254   11 1223
     2  223   24   77    4]
 [   0    6  617    0  616   83    4    0  734    0    0    0 2099    0
     0    0    0   79  616  734   26    2 1091   26    3  325 2953    8
  2954    4 2097 2098   17]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   28   11 2100   33  998  255   11
   186  358    6    0  247]]


In [None]:
# Token count of every (unpadded) training sequence, and the longest one
seq_lengths = list(map(len, X_train_seq))
max_len = max(seq_lengths)
print(max_len)

64


In [None]:
# 95th-percentile training sequence length, kept under its own name on purpose.
# MAX_LEN is the padding length the train/validation matrices were already built
# with; reassigning it here would make any later pad_sequences(..., maxlen=MAX_LEN)
# call (e.g. for the test split) produce a different width than the training data.
MAX_LEN_P95 = int(np.percentile(seq_lengths, 95))
MAX_LEN_P95

42

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input,Dropout,BatchNormalization
from tensorflow.keras.utils import set_random_seed
import numpy as np

# Seed Python, NumPy and the backend so weight initialisation and batch
# shuffling are repeatable across Restart & Run All (same seed as the data split).
set_random_seed(42)

num_classes = len(np.unique(y_train))
# Derive the model dimensions from the objects that produced the inputs rather
# than repeating literals that can drift out of sync:
#  - the tokenizer only emits word ids below num_words, so that is the
#    embedding table size that is actually needed;
#  - the input width must equal the width of the padded training matrix.
vocab_size = tokenizer.num_words
max_len = X_train_pad.shape[1]


# Embedding -> single LSTM -> batch norm -> two dense layers -> softmax over classes
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(128),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Sparse loss because the targets are integer class ids (LabelEncoder output),
# not one-hot vectors.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Halve the learning rate after 3 epochs without val_loss improvement, down to 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-5,
)

history = model.fit(
    x=X_train_pad,
    y=y_train_enc,
    batch_size=32,
    epochs=60,
    validation_data=(X_val_pad, y_val_enc),
    callbacks=[early_stop, reduce_lr],
)


Epoch 1/60
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 110ms/step - accuracy: 0.3244 - loss: 1.1032 - val_accuracy: 0.3434 - val_loss: 1.0987 - learning_rate: 0.0010
Epoch 2/60
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 116ms/step - accuracy: 0.3523 - loss: 1.0920 - val_accuracy: 0.4764 - val_loss: 0.9937 - learning_rate: 0.0010
Epoch 3/60
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 107ms/step - accuracy: 0.6073 - loss: 0.7656 - val_accuracy: 0.5471 - val_loss: 0.8794 - learning_rate: 0.0010
Epoch 4/60
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 115ms/step - accuracy: 0.8147 - loss: 0.4401 - val_accuracy: 0.3367 - val_loss: 1.7935 - learning_rate: 0.0010
Epoch 5/60
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 113ms/step - accuracy: 0.8756 - loss: 0.3447 - val_accuracy: 0.7054 - val_loss: 0.6818 - learning_rate: 0.0010
Epoch 6/60
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [None]:
# Tokenise the test split with the tokenizer fitted on the training data and pad
# it to exactly the width of the training matrix. The width is read from
# X_train_pad instead of the MAX_LEN global because MAX_LEN is reassigned in a
# later cell, which would otherwise give the test set a different width than
# the one the model was built and trained with.
X_test_seq  = tokenizer.texts_to_sequences(X_test)
X_test_pad  = pad_sequences(X_test_seq, maxlen=X_train_pad.shape[1], padding="post")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np


# Held-out test evaluation.
y_pred_probs = model.predict(X_test_pad)  # shape: (samples, num_classes)
y_pred = np.argmax(y_pred_probs, axis=1)  # convert probs to integer labels

# Pin the label order to the encoder's classes so the matrix rows/columns and the
# report rows always line up with le.classes_, even if a class is never predicted.
class_ids = np.arange(len(le.classes_))

conf_mat  = confusion_matrix(y_test_enc, y_pred, labels=class_ids)

print("Class order:", list(le.classes_))
print("Confusion Matrix:\n", conf_mat)

# target_names makes the report show the emotion names instead of the encoded ids.
print(
    "\nClassification Report:\n",
    classification_report(y_test_enc, y_pred, labels=class_ids, target_names=le.classes_),
)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step
Confusion Matrix:
 [[185   8   7]
 [  9 178   7]
 [  6   4 190]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93       200
           1       0.94      0.92      0.93       194
           2       0.93      0.95      0.94       200

    accuracy                           0.93       594
   macro avg       0.93      0.93      0.93       594
weighted avg       0.93      0.93      0.93       594



In [None]:
# Generalization check: score the training split so its metrics can be compared
# with the test metrics (a large gap indicates overfitting).
# The results get their own names so the test-set predictions held in
# y_pred / y_pred_probs are not overwritten.
y_train_pred_probs = model.predict(X_train_pad)  # shape: (samples, num_classes)
y_train_pred = np.argmax(y_train_pred_probs, axis=1)
print(
    "\nClassification Report:\n",
    classification_report(
        y_train_enc,
        y_train_pred,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
    ),
)

[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      1600
           1       1.00      0.99      1.00      1549
           2       1.00      1.00      1.00      1600

    accuracy                           1.00      4749
   macro avg       1.00      1.00      1.00      4749
weighted avg       1.00      1.00      1.00      4749

