In [2]:
# Importing the required libraries
import keras
import numpy as np
from keras.models import Sequential,Model
from keras.layers import Dense,Bidirectional
from nltk.tokenize import word_tokenize,sent_tokenize
from keras.layers import *
from sklearn.model_selection import cross_val_score 
import nltk
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder

In [35]:
#step2:load the dataset
df = pd.read_csv(r"D:\jangoProject\nlp\EmotionDetection\isear.csv")
print('Before removing extra rows:', df.shape)

# The isear.csv contains rows with value 'No response' in the second column,
# so we need to remove such rows.

# Normalize the text by stripping whitespace and converting to lowercase
df.drop(df[df.iloc[:,1].astype(str).str.lower().str.contains(r'no\s*response', na=False)
   ].index,
    inplace=True
)
print(df.shape)

Before removing extra rows: (7651, 2)
(7564, 2)


In [125]:
import spacy
import pandas as pd

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # Disable unused pipeline components to speed up

def tokenize_text(text):
    doc = nlp(text)
    tokens = [
        token.text.lower()
        for token in doc
        if not token.is_punct and not token.is_space
        # optionally uncomment to remove stopwords:
        # and not token.is_stop
    ]
    return tokens

# Apply the tokenize_text function to the text column (second column at index 1)
tokenizeText = df.iloc[:, 1].apply(tokenize_text)

# Convert the list of token lists into a pandas Series (optional, but good for alignment with DataFrame)
tokenizeText = pd.Series(tokenizeText)

print(tokenizeText[0])


['every', 'time', 'i', 'imagine', 'that', 'someone', 'i', 'love', 'or', 'i', 'could', 'contact', 'a', 'serious', 'illness', 'even', 'death']


step3:This section converts pre-tokenized sentences back into plain text strings so that Keras's Tokenizer can process them. The tokenizer is then fitted on this text corpus to create a vocabulary mapping words to integer indices. Finally, each text is converted to a sequence of integers representing token IDs, which can be fed into downstream models such as embedding layers or LSTMs. This step bridge tokenization and numerical encoding for model training.

In [126]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Convert token lists back to strings
texts = tokenizeText.apply(lambda tokens: ' '.join(tokens)).tolist()

# Initialize and fit tokenizer on texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)


# Convert texts to integer sequences
sequences = tokenizer.texts_to_sequences(texts)


In [127]:
# Pad the integer sequences to a fixed length (100 tokens) using post-padding.
# This ensures all input sequences have the same length for batch processing.
# The result is a 2D NumPy array suitable as input to embedding layers and LSTM models.
X_train_pad =pad_sequences(sequences,maxlen=100,padding='post')
X_train_pad.shape

(7564, 100)

In [128]:


X = X_train_pad  # padded input sequences (numpy array)

# Extract labels column, convert to numpy array and reshape to 2D for encoder
labels = df.iloc[:, 0].to_numpy().reshape(-1, 1)

# Initialize OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)

# Fit and transform labels
y = encoder.fit_transform(labels)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
print(X_train.shape)  # (num_train_samples, 100)
print(X_test.shape)   # (num_test_samples, 100)
print(y_train.shape)  # (num_train_samples, num_classes)
print(y_test.shape)   # (num_test_samples, num_classes)


(6051, 100)
(1513, 100)
(6051, 7)
(1513, 7)


In [142]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, SpatialDropout1D, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

# Fix random seed for reproducibility (optional)
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50
max_length = 100
num_classes = y.shape[1]

    # Build model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)))
model.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax', kernel_regularizer=regularizers.l2(0.001)))

model.compile(
    loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy']
    )

model.summary()





In [143]:

    # Setup early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Fit model with validation on test data (consider splitting train for validation if desired)
history = model.fit(
        X_train,
        y_train,
        epochs=30,
        batch_size=64,
        validation_data=(X_test, y_test),
        callbacks=[early_stopping],
        verbose=1
    )

Epoch 1/30
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 779ms/step - accuracy: 0.1614 - loss: 2.0422 - val_accuracy: 0.2789 - val_loss: 1.9415
Epoch 2/30
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 754ms/step - accuracy: 0.3610 - loss: 1.6790 - val_accuracy: 0.2472 - val_loss: 1.8696
Epoch 3/30
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 581ms/step - accuracy: 0.5249 - loss: 1.3373 - val_accuracy: 0.3589 - val_loss: 1.7426
Epoch 4/30
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 684ms/step - accuracy: 0.6038 - loss: 1.1130 - val_accuracy: 0.5109 - val_loss: 1.5080
Epoch 5/30
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 613ms/step - accuracy: 0.6886 - loss: 0.9141 - val_accuracy: 0.5169 - val_loss: 1.3662
Epoch 6/30
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 680ms/step - accuracy: 0.7381 - loss: 0.7691 - val_accuracy: 0.5043 - val_loss: 1.4232
Epoch 7/30
[1m95/95[

In [137]:
test_loss, test_accuracy=model.evaluate(X_test,y_test)
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 73ms/step - accuracy: 0.5297 - loss: 1.3270
Test loss: 1.3846439123153687, Test accuracy: 0.5102445483207703


In [138]:
pred=model.predict(y_test)
print(labels)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step
[['fear']
 ['anger']
 ['sadness']
 ...
 ['disgust']
 ['shame']
 ['guilt']]


In [139]:
y=model.predict(X_test)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 70ms/step


In [140]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_emotion(text, model, tokenizer, encoder, max_length=100):
   

    # Convert text to integer sequence and pad
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')

    # Predict class probabilities
    probs = model.predict(padded)

    # Get index of max probability class
    class_idx = np.argmax(probs, axis=1)[0]

    # Decode class index to original label
    label = encoder.categories_[0][class_idx]

    return label

# Example usage:
predicted_label = predict_emotion("When I think about the short time that we live and relate it to the periods of my life when I think that I did not use this  ", model, tokenizer, encoder)
print("Predicted Emotion:", predicted_label)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Predicted Emotion: sadness


In [141]:
# Saving model
model.save('model.h5')

# Saving tokenizer and encoder with pickle
import pickle

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

# Loading for predict.py

from tensorflow.keras.models import load_model
import pickle

model = load_model('model.h5')

with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

with open('encoder.pkl', 'rb') as f:
    encoder = pickle.load(f)


