In [1]:
pip install pandas numpy scikit-learn tensorflow



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

In [3]:
train_df = pd.read_csv("training.csv")
val_df = pd.read_csv("validation.csv")
test_df = pd.read_csv("test.csv")

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

Train shape: (16000, 2)
Validation shape: (2000, 2)
Test shape: (2000, 2)


In [4]:
# Tokenize and pad sequences
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text'])

# Convert text to sequences
max_len = 100  # adjust as needed
X_train = pad_sequences(tokenizer.texts_to_sequences(train_df['text']), maxlen=max_len)
X_val = pad_sequences(tokenizer.texts_to_sequences(val_df['text']), maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_df['text']), maxlen=max_len)

# Labels
y_train = train_df['label'].values
y_val = val_df['label'].values
y_test = test_df['label'].values

num_classes = len(set(y_train))


In [5]:
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [6]:
history = model.fit(
    X_train, y_train,
    epochs=200,
    batch_size=32,
    validation_data=(X_val, y_val)
)


Epoch 1/100
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 66ms/step - accuracy: 0.3727 - loss: 1.5397 - val_accuracy: 0.6875 - val_loss: 0.8134
Epoch 2/100
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 64ms/step - accuracy: 0.7298 - loss: 0.6708 - val_accuracy: 0.7735 - val_loss: 0.5671
Epoch 3/100
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 63ms/step - accuracy: 0.8675 - loss: 0.3325 - val_accuracy: 0.9070 - val_loss: 0.3161
Epoch 4/100
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 63ms/step - accuracy: 0.9592 - loss: 0.1373 - val_accuracy: 0.9130 - val_loss: 0.3348
Epoch 5/100
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 69ms/step - accuracy: 0.9712 - loss: 0.0928 - val_accuracy: 0.9225 - val_loss: 0.2787
Epoch 6/100
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 67ms/step - accuracy: 0.9769 - loss: 0.0752 - val_accuracy: 0.9135 - val_loss: 0.2955
Epoch 7/10

In [15]:
# Evaluate on test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_acc:.4f}")

# Classification Report
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(y_test, y_pred))


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9038 - loss: 0.9405

Test Accuracy: 0.9050
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       581
           1       0.93      0.93      0.93       695
           2       0.79      0.81      0.80       159
           3       0.88      0.91      0.89       275
           4       0.88      0.82      0.85       224
           5       0.70      0.73      0.71        66

    accuracy                           0.91      2000
   macro avg       0.85      0.86      0.86      2000
weighted avg       0.91      0.91      0.91      2000



In [16]:
model.save("emotion_model.h5")



In [17]:
# Predict probabilities
val_probs = model.predict(X_val)
test_probs = model.predict(X_test)

# Get the predicted class (argmax to convert from probabilities to class index)
val_preds = np.argmax(val_probs, axis=1)
test_preds = np.argmax(test_probs, axis=1)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step


In [10]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Validation Set Evaluation
print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Classification Report:\n", classification_report(y_val, val_preds))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, val_preds))

# Test Set Evaluation
print("Test Accuracy:", accuracy_score(y_test, test_preds))
print("Test Classification Report:\n", classification_report(y_test, test_preds))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, test_preds))

Validation Accuracy: 0.9135
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95       550
           1       0.94      0.93      0.93       704
           2       0.81      0.84      0.83       178
           3       0.91      0.94      0.92       275
           4       0.91      0.79      0.85       212
           5       0.84      0.83      0.83        81

    accuracy                           0.91      2000
   macro avg       0.89      0.88      0.89      2000
weighted avg       0.91      0.91      0.91      2000

Validation Confusion Matrix:
 [[531   6   3   6   4   0]
 [ 12 652  28   7   3   2]
 [  3  21 150   2   1   1]
 [  7   3   3 259   3   0]
 [ 15   7   1  11 168  10]
 [  3   5   0   1   5  67]]
Test Accuracy: 0.905
Test Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.95       581
           1       0.93      0.93      0.9

In [12]:
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])

# Encode labels
y_train = label_encoder.transform(train_df['label'])
y_val = label_encoder.transform(val_df['label'])
y_test = label_encoder.transform(test_df['label'])

# When converting predictions back to readable labels:
results_df['label'] = label_encoder.inverse_transform(y_test)
results_df['predicted'] = label_encoder.inverse_transform(test_preds)


In [13]:
# Remove rows with NaN labels
test_df_clean = test_df.dropna(subset=['label'])
X_test_clean = pad_sequences(tokenizer.texts_to_sequences(test_df_clean['text']), maxlen=max_len)
y_test_clean = label_encoder.transform(test_df_clean['label'])

# Predict on clean set
test_preds_clean = np.argmax(model.predict(X_test_clean), axis=1)

# Evaluate
print("Clean Test Accuracy:", accuracy_score(y_test_clean, test_preds_clean))
print("Clean Test Classification Report:\n", classification_report(y_test_clean, test_preds_clean))


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
Clean Test Accuracy: 0.905
Clean Test Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.95       581
           1       0.93      0.93      0.93       695
           2       0.79      0.81      0.80       159
           3       0.88      0.91      0.89       275
           4       0.88      0.82      0.85       224
           5       0.70      0.73      0.71        66

    accuracy                           0.91      2000
   macro avg       0.85      0.86      0.86      2000
weighted avg       0.91      0.91      0.91      2000



In [19]:
# Get DataFrame with actual and predicted labels
test_df_clean['predicted'] = label_encoder.inverse_transform(test_preds_clean)
test_df_clean['actual'] = test_df_clean['label']

# Show misclassified
misclassified = test_df_clean[test_df_clean['predicted'] != test_df_clean['actual']]
# print(misclassified[['text', 'actual', 'predicted']].head(10))
print(test_df_clean[['text', 'actual', 'predicted']].head(100))


                                                 text  actual  predicted
0   im feeling rather rotten so im not very ambiti...       0          0
1           im updating my blog because i feel shitty       0          0
2   i never make her separate from me because i do...       0          0
3   i left with my bouquet of red and yellow tulip...       1          1
4     i was feeling a little vain when i did this one       0          0
..                                                ...     ...        ...
95  im feeling angry at someone i do something tho...       3          3
96  i love neglecting this blog but sometimes i fe...       2          2
97  i lay in bed feeling as though i were awaiting...       0          0
98    i feel my heart is tortured by what i have done       3          3
99  i was still feeling weepy and strung out so ma...       0          0

[100 rows x 3 columns]


In [20]:
from tensorflow.keras.models import load_model

# Load saved model
model = load_model("emotion_model.h5")



In [21]:
def predict_emotion(texts, tokenizer, model, max_len=100):
    if isinstance(texts, str):
        texts = [texts]

    # Preprocess input
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_len)

    # Predict
    preds = model.predict(padded)
    pred_labels = np.argmax(preds, axis=1)

    # Map labels
    label_map = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    return [label_map.get(label, "unknown") for label in pred_labels]

In [22]:
# Example texts
my_texts = [
    "I can't stop crying, I miss them so much.",
    "I just got a promotion! So excited!",
    "I hate being ignored.",
    "That was unexpected but wonderful!",
    "Why do I feel so anxious lately?"
]

# Predict emotions
predicted_emotions = predict_emotion(my_texts, tokenizer, model)
for txt, emotion in zip(my_texts, predicted_emotions):
    print(f"Text: {txt}\nPredicted Emotion: {emotion}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 376ms/step
Text: I can't stop crying, I miss them so much.
Predicted Emotion: surprise

Text: I just got a promotion! So excited!
Predicted Emotion: joy

Text: I hate being ignored.
Predicted Emotion: sadness

Text: That was unexpected but wonderful!
Predicted Emotion: joy

Text: Why do I feel so anxious lately?
Predicted Emotion: fear

