In [2]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import nltk
import string
import json
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
# Load datasets
def load_data(file_path):
    df = pd.read_csv(file_path, header=None, names=["data"])
    df[["tweet", "label"]] = df["data"].str.split(";", expand=True)
    df.drop(columns=["data"], inplace=True)
    return df

In [6]:
train_df = load_data("/content/drive/MyDrive/train.txt")
val_df = load_data("/content/drive/MyDrive/val.txt")
test_df = load_data("/content/drive/MyDrive/test.txt")

In [7]:
# Handle missing values
train_df.dropna(inplace=True)
val_df.dropna(inplace=True)
test_df.dropna(inplace=True)

In [8]:
# Text preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [9]:
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Remove stopwords & lemmatize
    return " ".join(words)

In [10]:
train_df["tweet"] = train_df["tweet"].apply(preprocess_text)
val_df["tweet"] = val_df["tweet"].apply(preprocess_text)
test_df["tweet"] = test_df["tweet"].apply(preprocess_text)

In [11]:
# Encode labels
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["label"])
val_df["label"] = label_encoder.transform(val_df["label"])
test_df["label"] = label_encoder.transform(test_df["label"])

In [12]:
# Save label encoder classes
with open("label_encoder.json", "w") as f:
    json.dump(label_encoder.classes_.tolist(), f)

In [13]:
# Tokenization
max_words = 10000  # Max vocabulary size
max_len = 128  # Max sequence length
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["tweet"])

In [14]:
# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_df["tweet"])
val_sequences = tokenizer.texts_to_sequences(val_df["tweet"])
test_sequences = tokenizer.texts_to_sequences(test_df["tweet"])

In [15]:
# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_len, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding="post", truncating="post")

In [16]:
# Convert labels to categorical (one-hot encoding)
num_classes = len(label_encoder.classes_)
train_labels = to_categorical(train_df["label"], num_classes=num_classes)
val_labels = to_categorical(val_df["label"], num_classes=num_classes)
test_labels = to_categorical(test_df["label"], num_classes=num_classes)

In [17]:
# Build BiGRU model
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, dropout=0.3, recurrent_dropout=0.3)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.3),  # Prevents overfitting
    tf.keras.layers.Dense(num_classes, activation="softmax")
])



In [18]:
# Compile model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [19]:
# Implement early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

In [20]:
# Train model
epochs = 20
batch_size = 64
history = model.fit(
    train_padded, train_labels,
    validation_data=(val_padded, val_labels),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping]
)

Epoch 1/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 539ms/step - accuracy: 0.3773 - loss: 1.5175 - val_accuracy: 0.8140 - val_loss: 0.5374
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 540ms/step - accuracy: 0.8667 - loss: 0.4142 - val_accuracy: 0.9110 - val_loss: 0.2684
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 531ms/step - accuracy: 0.9309 - loss: 0.2073 - val_accuracy: 0.9155 - val_loss: 0.2340
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 522ms/step - accuracy: 0.9546 - loss: 0.1362 - val_accuracy: 0.9260 - val_loss: 0.2406
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 520ms/step - accuracy: 0.9649 - loss: 0.1008 - val_accuracy: 0.9250 - val_loss: 0.2304
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 523ms/step - accuracy: 0.9728 - loss: 0.0829 - val_accuracy: 0.9260 - val_loss: 0.2502
Epoc

In [21]:
# Evaluate on test set
test_loss, test_acc = model.evaluate(test_padded, test_labels)
print(f"Test Accuracy: {test_acc:.4f}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 84ms/step - accuracy: 0.9269 - loss: 0.2273
Test Accuracy: 0.9275


In [23]:
# Save model and tokenizer
model.save("bigru_emotion_model.keras")  # Recommended Keras format
tokenizer_json = tokenizer.to_json()
with open("tokenizer.json", "w") as f:
    f.write(tokenizer_json)

print("Model and tokenizer saved successfully!")

Model and tokenizer saved successfully!


In [29]:
# Load the trained model and tokenizer for real-time predictions
model = tf.keras.models.load_model("bigru_emotion_model.keras")

In [30]:
# Load tokenizer correctly
from tensorflow.keras.preprocessing.text import tokenizer_from_json

with open("tokenizer.json", "r") as f:
    tokenizer_json = json.load(f)

# Convert dictionary to JSON string before loading
tokenizer = tokenizer_from_json(json.dumps(tokenizer_json))


In [31]:
with open("label_encoder.json", "r") as f:
    label_classes = json.load(f)

label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(label_classes)


In [32]:
# Function to predict emotion from user input
def predict_emotion(text):
    processed_text = preprocess_text(text)  # Preprocess input text
    sequence = tokenizer.texts_to_sequences([processed_text])  # Convert to sequence
    padded_sequence = pad_sequences(sequence, maxlen=128, padding="post", truncating="post")  # Pad sequence

    prediction = model.predict(padded_sequence)  # Get model prediction
    predicted_label = np.argmax(prediction)  # Get label index
    emotion = label_encoder.inverse_transform([predicted_label])[0]  # Convert index to label

    return emotion


In [35]:
# Take user input and predict emotion
while True:
    user_input = input("Enter a sentence (or type 'exit' to quit): ")
    if user_input.lower() == "exit":
        print("Exiting...")
        break
    emotion = predict_emotion(user_input)
    print(f"Predicted Emotion: {emotion}")

Enter a sentence (or type 'exit' to quit): I can't stop crying, everything feels so hopeless.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
Predicted Emotion: sadness
Enter a sentence (or type 'exit' to quit): I am so nervous about my exam results tomorrow.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
Predicted Emotion: fear
Enter a sentence (or type 'exit' to quit): I just received the best news of my life, I am thrilled!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Predicted Emotion: joy
Enter a sentence (or type 'exit' to quit): I feel so loved and appreciated by my friends and family
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Predicted Emotion: love
Enter a sentence (or type 'exit' to quit): How could they betray me like this? I am so mad!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
Predicted Emotion: anger
Enter a sentence (or type 'exit' to quit