In [1]:
# 1. Import Libraries
import pandas as pd
import re
import nltk
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
# 2. NLTK Setup
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# 3. Load and Preprocess Data
df = pd.read_csv("emotions.csv")  
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(clean_text)

In [4]:
# 4. Label Encoding
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

In [5]:
# 5. Tokenization and Padding
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
X = pad_sequences(sequences, maxlen=max_len)

y = to_categorical(df['label_encoded'])


In [6]:
# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# 7. Build LSTM Model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y.shape[1], activation='softmax'))  # Multiclass

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [8]:
# 8. Train Model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)


Epoch 1/5
[1m4682/4682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m687s[0m 146ms/step - accuracy: 0.8136 - loss: 0.4612 - val_accuracy: 0.9395 - val_loss: 0.0921
Epoch 2/5
[1m4682/4682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m600s[0m 128ms/step - accuracy: 0.9401 - loss: 0.0961 - val_accuracy: 0.9424 - val_loss: 0.0883
Epoch 3/5
[1m4682/4682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m616s[0m 132ms/step - accuracy: 0.9414 - loss: 0.0903 - val_accuracy: 0.9421 - val_loss: 0.0896
Epoch 4/5
[1m4682/4682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m954s[0m 204ms/step - accuracy: 0.9426 - loss: 0.0867 - val_accuracy: 0.9417 - val_loss: 0.0887
Epoch 5/5
[1m4682/4682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m595s[0m 127ms/step - accuracy: 0.9433 - loss: 0.0848 - val_accuracy: 0.9420 - val_loss: 0.0908


<keras.src.callbacks.history.History at 0x23a72da1050>

In [9]:
# 9. Evaluate Model
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

acc = accuracy_score(y_true, y_pred)
print("Accuracy:", acc)
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=le.classes_.astype(str)))


[1m2601/2601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 30ms/step
Accuracy: 0.9413277260438571
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     24121
           1       1.00      0.92      0.95     28220
           2       0.77      1.00      0.87      6824
           3       0.95      0.94      0.95     11448
           4       0.91      0.90      0.91      9574
           5       0.89      0.74      0.81      3038

    accuracy                           0.94     83225
   macro avg       0.91      0.92      0.91     83225
weighted avg       0.95      0.94      0.94     83225



In [None]:
model.save('emotion_model.h5')
from keras.models import load_model
model = load_model('emotion_model.h5')





In [14]:
print(list(le.classes_))


[0, 1, 2, 3, 4, 5]


In [13]:
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [14]:
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)


In [1]:
import tensorflow as tf
print(tf.__version__)

2.17.0
