# Emotion Detection

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/emotion_dataset_10k.csv")

In [3]:
df.head(5)

Unnamed: 0,id,timestamp,user,text,ticker,emotion_label
0,emo_0,2025-10-25T03:42:52.347823,user_641,$BAC strong earnings sad,BAC,sadness
1,emo_1,2025-02-24T20:34:52.347823,user_949,Bank of America Corporation profit warning CEO...,BAC,fear
2,emo_2,2025-04-29T13:46:52.347823,user_791,"Meta Platforms, Inc conference presentation #m...",META,neutral
3,emo_3,2025-07-09T22:45:52.347823,user_562,$GOOG declining sales excited,GOOG,joy
4,emo_4,2025-10-29T12:16:52.347823,user_1545,$AAPL company updates #earnings thrilled,AAPL,joy


## Rule-based emotion detection and classifier

1. We'll build a small emotion lexicon (words -> emotion) and implement a rule-based function.
2. Then we will train a TF-IDF + Logistic Regression classifier on the labelled emotion data and compare.


In [4]:
# small emotion lexicon (example)
emotion_words = {
    "joy": ["excited","thrilled","optimistic","happy","delighted","bullish"],
    "anger": ["furious","angry","outraged","frustrated","irritated"],
    "fear": ["worried","concerned","fearful","uneasy","anxious"],
    "sadness": ["disappointed","sad","down","depressed","sombre"],
    "surprise": ["surprised","shocked","astonished"]
}

In [5]:
# build reverse lookup
lex = {}
for emo, words in emotion_words.items():
    for w in words:
        lex[w] = emo

In [6]:
def rule_emotion(text):
    if not isinstance(text, str):
        return "neutral"
    text_l = text.lower()
    found = []
    for word, emo in lex.items():
        if word in text_l:
            found.append(emo)
    if found:
        # return most frequent found emotion
        return max(set(found), key=found.count)
    return "neutral"

In [7]:
# show rule-based outputs on sample
print("Rule-based outputs on first 5 rows:")
print(df['text'].head(5).apply(rule_emotion))

Rule-based outputs on first 5 rows:
0    sadness
1       fear
2    neutral
3        joy
4        joy
Name: text, dtype: object


In [8]:
# Train TF-IDF + Logistic Regression classifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

  import scipy.linalg  # noqa


In [9]:
df = df.dropna(subset=['text','emotion_label']).reset_index(drop=True)

In [10]:
X = df['text']
y = df['emotion_label']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
pipe = make_pipeline(
    TfidfVectorizer(max_features=5000, stop_words='english'),
    LogisticRegression(max_iter=1000)
)

In [13]:
pipe.fit(X_train, y_train)

In [14]:
y_pred = pipe.predict(X_test)

In [15]:
print("Emotion classifier accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

Emotion classifier accuracy: 0.966

Classification report:
               precision    recall  f1-score   support

       anger       1.00      1.00      1.00       304
        fear       1.00      1.00      1.00       394
         joy       1.00      1.00      1.00       501
     neutral       0.81      1.00      0.90       291
     sadness       1.00      0.78      0.87       303
    surprise       1.00      1.00      1.00       207

    accuracy                           0.97      2000
   macro avg       0.97      0.96      0.96      2000
weighted avg       0.97      0.97      0.97      2000



In [16]:
# Compare rule vs model for a small sample
sample_texts = X_test.sample(10, random_state=4).tolist()
for t in sample_texts:
    print("TEXT:", t)
    print("Rule:", rule_emotion(t))
    print("Model:", pipe.predict([t])[0])
    print("----")

TEXT: Alphabet Inc strong earnings #markets concerned
Rule: fear
Model: fear
----
TEXT: Meta Platforms, Inc shares traded #markets disappointed
Rule: sadness
Model: sadness
----
TEXT: $MSFT regulatory probe worried
Rule: fear
Model: fear
----
TEXT: $NVDA revenue growth ‚ö†Ô∏è
Rule: neutral
Model: neutral
----
TEXT: Amazon.com, Inc conference presentation #markets
Rule: neutral
Model: neutral
----
TEXT: $AAPL upbeat guidance ‚ö†Ô∏è thrilled
Rule: joy
Model: joy
----
TEXT: $BAC announces conference call delighted
Rule: joy
Model: joy
----
TEXT: JPMorgan Chase & Co. strong earnings guidance raised for next quarter irritated
Rule: anger
Model: anger
----
TEXT: $GOOG revenue miss üöÄ
Rule: neutral
Model: neutral
----
TEXT: $AAPL strategic acquisition thrilled
Rule: joy
Model: joy
----
