In [1]:
import pandas as pd

In [7]:
df_train = pd.read_csv("emotions_train.txt", delimiter=";", header=None, names=['comment', 'emotion'])
df_train.head()
df_train.value_counts('emotion')

emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
dtype: int64

In [9]:
df_train.shape
df_train.head()

Unnamed: 0,comment,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [10]:
emotion_map = {'joy': 0, 'sadness': 1, 'anger': 2, 'fear': 3, 'love': 4, 'surprise': 5}

In [11]:
df_train['emotion_num'] = df_train['emotion'].map(emotion_map)
df_train.head()

Unnamed: 0,comment,emotion,emotion_num
0,i didnt feel humiliated,sadness,1
1,i can go from feeling so hopeless to so damned...,sadness,1
2,im grabbing a minute to post i feel greedy wrong,anger,2
3,i am ever feeling nostalgic about the fireplac...,love,4
4,i am feeling grouchy,anger,2


In [16]:
df_test = pd.read_csv("emotions_test.txt", delimiter=";", header=None, names=['comment', 'emotion'])
df_test.head()
df_test.value_counts('emotion')
df_test['emotion_num'] = df_train['emotion'].map(emotion_map)
df_test.head()

Unnamed: 0,comment,emotion,emotion_num
0,im feeling rather rotten so im not very ambiti...,sadness,1
1,im updating my blog because i feel shitty,sadness,1
2,i never make her separate from me because i do...,sadness,2
3,i left with my bouquet of red and yellow tulip...,joy,4
4,i was feeling a little vain when i did this one,sadness,2


In [17]:
df_merged = pd.concat([df_train, df_test])
df_merged.head()

Unnamed: 0,comment,emotion,emotion_num
0,i didnt feel humiliated,sadness,1
1,i can go from feeling so hopeless to so damned...,sadness,1
2,im grabbing a minute to post i feel greedy wrong,anger,2
3,i am ever feeling nostalgic about the fireplac...,love,4
4,i am feeling grouchy,anger,2


In [18]:
df_merged.shape

(18000, 3)

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_merged['comment'],
    df_merged['emotion_num'],
    test_size=0.2,
    random_state=2022,
    stratify=df_merged['emotion_num']
)

print(X_train.shape)
print(X_test.shape)

(14400,)
(3600,)


In [25]:
y_train.value_counts()

0    4851
1    4168
2    1942
3    1738
4    1182
5     519
Name: emotion_num, dtype: int64

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(3,3))),
    ('rf', RandomForestClassifier())
])

clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(3, 3))),
                ('rf', RandomForestClassifier())])

In [27]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.62      0.52      1213
           1       0.57      0.31      0.40      1042
           2       0.20      0.45      0.28       485
           3       0.50      0.24      0.32       435
           4       0.49      0.09      0.15       295
           5       0.52      0.08      0.15       130

    accuracy                           0.40      3600
   macro avg       0.46      0.30      0.30      3600
weighted avg       0.46      0.40      0.38      3600



In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('nb', MultinomialNB())
])

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.93      0.69      1213
           1       0.64      0.80      0.71      1042
           2       0.87      0.21      0.33       485
           3       0.79      0.17      0.29       435
           4       0.85      0.06      0.11       295
           5       1.00      0.01      0.02       130

    accuracy                           0.60      3600
   macro avg       0.78      0.36      0.36      3600
weighted avg       0.69      0.60      0.53      3600



In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('rf', RandomForestClassifier())
])

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.92      0.79      1213
           1       0.82      0.79      0.80      1042
           2       0.84      0.67      0.75       485
           3       0.80      0.62      0.70       435
           4       0.80      0.53      0.63       295
           5       0.76      0.50      0.60       130

    accuracy                           0.76      3600
   macro avg       0.79      0.67      0.71      3600
weighted avg       0.77      0.76      0.76      3600



In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import pipeline

clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      1213
           1       0.84      0.81      0.82      1042
           2       0.83      0.73      0.77       485
           3       0.76      0.68      0.72       435
           4       0.77      0.58      0.66       295
           5       0.72      0.58      0.64       130

    accuracy                           0.78      3600
   macro avg       0.78      0.71      0.74      3600
weighted avg       0.79      0.78      0.78      3600



In [31]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 


#use this utility function to get the preprocessed text data
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [33]:
df_merged['processed_comment'] = df_merged['comment'].apply(preprocess)

In [35]:
df_merged.head()

Unnamed: 0,comment,emotion,emotion_num,processed_txt,processed_comment
0,i didnt feel humiliated,sadness,1,not feel humiliate,not feel humiliate
1,i can go from feeling so hopeless to so damned...,sadness,1,feel hopeless damn hopeful care awake,feel hopeless damn hopeful care awake
2,im grabbing a minute to post i feel greedy wrong,anger,2,m grab minute post feel greedy wrong,m grab minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,4,feel nostalgic fireplace know property,feel nostalgic fireplace know property
4,i am feeling grouchy,anger,2,feel grouchy,feel grouchy


In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_merged['processed_comment'],
    df_merged['emotion_num'],
    test_size=0.2,
    random_state=2022,
    stratify=df_merged['emotion_num']
)

print(X_train.shape)
print(X_test.shape)

(14400,)
(3600,)


In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import pipeline

clf = Pipeline([
    ('tfidf', CountVectorizer(ngram_range=(1,2))),
    ('rf', RandomForestClassifier())
])

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83      1213
           1       0.81      0.85      0.83      1042
           2       0.78      0.79      0.78       485
           3       0.79      0.71      0.74       435
           4       0.75      0.62      0.68       295
           5       0.71      0.58      0.64       130

    accuracy                           0.80      3600
   macro avg       0.77      0.73      0.75      3600
weighted avg       0.79      0.80      0.79      3600



In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import pipeline

clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.87      0.81      1213
           1       0.83      0.81      0.82      1042
           2       0.80      0.75      0.77       485
           3       0.74      0.72      0.73       435
           4       0.76      0.57      0.65       295
           5       0.63      0.49      0.55       130

    accuracy                           0.78      3600
   macro avg       0.75      0.70      0.72      3600
weighted avg       0.78      0.78      0.78      3600

