In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk

In [3]:
df = pd.read_csv('emotions.csv')

In [4]:
df.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [5]:
tfidf = TfidfVectorizer(max_features=10000)
X = tfidf.fit_transform(df['text'])
y = df['label']

In [6]:
emotion_words = {}
for emo in df['label'].unique():
    all_words = ' '.join(df[df['label'] == emo]['text']).split()
    common = [w for w, _ in Counter(all_words).most_common(100)]
    emotion_words[emo] = common

In [7]:
for emo, words in emotion_words.items():
    print(f"Top words for {emo}: {words[50:100]}")

#0 = Sadness
#1 = Joy
#2 = Love
#3 = Anger
#4 = Fear
#5 = Surprise

Top words for 4: ['bit', 'time', 'been', 'still', 'are', 'how', 'get', 'being', 'had', 'will', 'now', 'would', 'people', 'from', 'think', 'they', 'want', 'he', 'scared', 'afraid', 'her', 'anxious', 'them', 'going', 'there', 'm', 'nervous', 'ive', 'one', 'even', 'some', 'things', 'him', 'dont', 'uncomfortable', 'something', 'insecure', 'unsure', 'overwhelmed', 'weird', 'strange', 'too', 'go', 'we', 'pressured', 'vulnerable', 'who', 'terrified', 'reluctant', 'much']
Top words for 0: ['more', 'from', 'had', 'little', 'what', 'now', 'get', 'how', 'very', 'ive', 'would', 'people', 'still', 'will', 'even', 'want', 'think', 'they', 'her', 'he', 'one', 'life', 'an', 'bit', 'who', 'dont', 'them', 'much', 'some', 'him', 'its', 'no', 'there', 'things', 'something', 'm', 'we', 'way', 'day', 'going', 'make', 'too', 's', 'could', 'don', 'go', 'back', 'she', 'over', 'has']
Top words for 2: ['him', 'been', 'they', 'them', 'from', 'people', 'would', 'time', 'will', 'one', 'loved', 'up', 'who', 'loving'

In [8]:
def get_models():
    return {
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM_soft': SVC(C=1.0, kernel='rbf', probability=True),  # soft margin default C=1
        'SVM_hard': SVC(C=1e6, kernel='rbf', probability=True)  # approximate hard margin with large C
    }

In [9]:
models = get_models()

In [10]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_accuracies = {name: [] for name in models}
conf_matrices = {name: np.zeros((len(np.unique(y)),) * 2, dtype=int) for name in models}
f1_scores = {name: [] for name in models}

for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        acc = accuracy_score(y_test, preds)
        fold_accuracies[name].append(acc)

        conf_matrices[name] += confusion_matrix(y_test, preds, labels=np.unique(y))

        f1 = f1_score(y_test, preds, average=None, labels=np.unique(y))
        f1_scores[name].append(f1)

        print(f"Fold {fold}, {name} - Acc: {acc:.4f}")

In [None]:
for name in models:
    avg_acc = np.mean(fold_accuracies[name])
    print(f"\n{name} average accuracy: {avg_acc:.4f}")
    print("Confusion Matrix:")
    print(conf_matrices[name])
    print("Average F1 per class:")
    mean_f1 = np.mean(f1_scores[name], axis=0)
    for emo, score in zip(np.unique(y), mean_f1):
        print(f"{emo}: {score:.4f}")

In [None]:
plt.figure(figsize=(10, 6))
for name, accs in fold_accuracies.items():
    plt.plot(range(1, len(accs)+1), accs, marker='o', label=name)
plt.title('Model Accuracy per Fold')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.xticks(range(1, 11))
plt.legend()
plt.grid(True)
plt.show()

In [None]:
def predict_emotion(text, model, tfidf_vectorizer, preprocess_func):
    cleaned_text = preprocess_func(text)
    vectorized_text = tfidf_vectorizer.transform([cleaned_text])
    prediction = model.predict(vectorized_text)

    return prediction[0]

In [None]:
input_text = "I just got a promotion at work!"

predicted_emotion_rf = predict_emotion(input_text, models['RandomForest'], tfidf, preprocess_text)
print(f"Random Forest Prediction: {predicted_emotion_rf}")

predicted_emotion_svm_soft = predict_emotion(input_text, models['SVM_soft'], tfidf, preprocess_text)
print(f"Soft Margin SVM Prediction: {predicted_emotion_svm_soft}")

predicted_emotion_svm_hard = predict_emotion(input_text, models['SVM_hard'], tfidf, preprocess_text)
print(f"Hard Margin SVM Prediction: {predicted_emotion_svm_hard}")