In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw corpus: a ";"-separated file without a header row, so the two
# column names ("text", "emotions") are supplied here.
df = pd.read_csv("train.txt", sep=";", header=None, names=["text", "emotions"])

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


# Data cleaning and preprocessing


In [4]:
# Count missing values per column.
df.isnull().sum()

text        0
emotions    0
dtype: int64

In [5]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(1)

In [6]:
# Remove duplicated rows, mutating `df` in place. The index is not reset, so
# the surviving rows keep their original labels.
df.drop_duplicates(inplace=True)

In [7]:
# Re-check: no duplicated rows should remain after the drop above.
df.duplicated().sum()

np.int64(0)

In [8]:
# List the distinct emotion labels present in the data.
df["emotions"].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [9]:
# Encode the emotion labels as integers.
# Ids are handed out in the order the labels are returned by `unique()`,
# starting from 0.
unique_emotions = df["emotions"].unique()
emotion_numbers = {label: idx for idx, label in enumerate(unique_emotions)}

# Swap each string label in the column for its integer id.
df["emotions"] = df["emotions"].map(emotion_numbers)

In [10]:
# Show the label -> integer id mapping that was applied to the "emotions" column.
emotion_numbers

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}

In [11]:
# Preview the frame now that "emotions" holds integer ids.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [12]:
# Normalise case: lower-case every entry of the "text" column.
df["text"] = df["text"].str.lower()

In [13]:
# Spot-check the text column after lower-casing.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [14]:
# removing punctuation
import string


# Deletion table for str.translate, built once instead of on every call:
# every character in string.punctuation maps to None (i.e. is removed).
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept as they are. Apostrophes are part of that set, so a contraction
    such as "it's" becomes "its".

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)


# Strip punctuation from every text, row by row.
df["text"] = df["text"].apply(remove_punctuation)

In [15]:
# Spot-check the text column after punctuation removal.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [16]:
# removing numbers from text
def remove_numbers(text):
    """Return ``text`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it; all other
    characters, including the whitespace around a removed number, are kept.
    """
    return "".join(ch for ch in text if not ch.isdigit())


# Drop digit characters from every text, row by row.
df["text"] = df["text"].apply(remove_numbers)

In [17]:
# Spot-check the text column after digit removal.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [18]:
# removing urls/links from text
import re


def remove_urls(text):
    """Return ``text`` with URL-like tokens deleted.

    A token is treated as a URL when it *starts* (at a word boundary) with
    "http" or "www" and is followed by at least one more non-whitespace
    character; the whole run up to the next whitespace is removed. The
    surrounding whitespace is left in place.

    The word-boundary anchor matters: without it the "www" alternative also
    matched inside ordinary words, e.g. "awwww" was cut down to "a".
    "https" needs no separate alternative because it already starts with
    "http".

    Args:
        text: The string to clean.

    Returns:
        A new string with the URL-like tokens removed.
    """
    return re.sub(r"\b(?:http|www)\S+", "", text)


# Remove URL-like tokens from every text, row by row.
df["text"] = df["text"].apply(remove_urls)

In [19]:
# Spot-check the text column after URL removal.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [20]:
# removing extra spaces from text
def remove_extra_spaces(text):
    """Collapse whitespace runs to one space and trim both ends of ``text``."""
    # Any run of whitespace (spaces, tabs, newlines, ...) becomes a single " ".
    collapsed = re.sub(r"\s+", " ", text)
    # A leading/trailing run is now at most one space; drop it.
    return collapsed.strip()


# Collapse repeated whitespace in every text, row by row.
df["text"] = df["text"].apply(remove_extra_spaces)

In [21]:
# Spot-check the text column after whitespace normalisation.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [22]:
# removing emojis from text
def remove_emojis(text):
    """Return ``text`` with every non-ASCII character removed.

    Despite the name this is not emoji-specific: any character outside the
    ASCII range (code point 128 and above) is dropped, which covers emojis
    but also accented letters and other non-ASCII symbols.
    """
    ascii_chars = [ch for ch in text if ord(ch) < 128]
    return "".join(ascii_chars)


# Drop non-ASCII characters (emojis included) from every text, row by row.
df["text"] = df["text"].apply(remove_emojis)

In [23]:
# Spot-check the text column after non-ASCII removal.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [24]:
# removing stopwords using nltk library
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [25]:
# Download the NLTK resources used below: the stop-word lists and the
# "punkt" / "punkt_tab" tokenizer data (presumably what word_tokenize loads;
# which of the two is needed depends on the installed NLTK version — confirm).
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yasho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yasho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yasho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [26]:
# English stop words as a set; remove_stopwords tests tokens against it with `in`.
eng_stopwords = set(stopwords.words("english"))
# NOTE(review): this displays the entire set, which makes for a very long cell output.
eng_stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [27]:
# Number of distinct English stop words.
len(eng_stopwords)

198

In [28]:
# Confirm the stop words are held in a set.
type(eng_stopwords)

set

In [29]:
# Full text of the row labelled 1, shown before stop-word removal for comparison.
df["text"].loc[1]

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [30]:
# tokenizing and removing stopwords
def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``eng_stopwords``.

    The text is split with ``word_tokenize``; each token is compared verbatim
    against the module-level ``eng_stopwords`` set, and the tokens that are
    not in it are joined back together with single spaces.
    """
    kept_tokens = [
        token for token in word_tokenize(text) if token not in eng_stopwords
    ]
    return " ".join(kept_tokens)


# Tokenize each text and drop its stop words, row by row.
df["text"] = df["text"].apply(remove_stopwords)

In [31]:
# Same row (label 1) after stop-word removal.
df["text"].loc[1]

'go feeling hopeless damned hopeful around someone cares awake'

In [32]:
# (rows, columns) of the cleaned frame.
df.shape

(15999, 2)

In [33]:
# Preview the fully cleaned frame that goes into modelling.
df.head()

Unnamed: 0,text,emotions
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [34]:
# Separate features and labels:
#   x - the cleaned text column (model input)
#   y - the integer-encoded emotion column (target)
x = df["text"]
y = df["emotions"]

In [35]:
# train-test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is reproducible.
# NOTE(review): the split is not stratified. If the emotion classes are
# unevenly represented, consider passing stratify=y so both splits keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [36]:
# Sizes of the training and test splits.
X_train.shape, X_test.shape

((12799,), (3200,))

In [37]:
# Inspect the training labels (integer emotion ids, still carrying the original row labels).
y_train

676      5
14180    0
7077     2
13004    5
10264    5
        ..
13418    4
5390     2
860      0
15796    5
7270     1
Name: emotions, Length: 12799, dtype: int64

## Model training using bag-of-words (BoW)


In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
# Bag-of-words vectorizer with CountVectorizer's default settings.
bow_vectorizer = CountVectorizer()

In [40]:
# Fit the vectorizer on the training texts only and encode them as a sparse
# count matrix.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_train_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 115772 stored elements and shape (12799, 13352)>

In [41]:
# Encode the test texts with the already-fitted vectorizer (transform only,
# no re-fitting), so the columns match those of X_train_bow.
X_test_bow = bow_vectorizer.transform(X_test)
X_test_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 27183 stored elements and shape (3200, 13352)>

In [42]:
# training a model using multinomial naive bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# Fit multinomial Naive Bayes (default settings) on the training count
# matrix, predict the test set and report accuracy.
nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)
y_pred = nb_model.predict(X_test_bow)
# `accuracy` is read again by the model-comparison cell ("Naive Bayes" entry).
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Naive Bayes model: {accuracy * 100:.2f}%")

Accuracy of the Naive Bayes model: 77.31%


In [43]:
# logistic regression classifier
from sklearn.linear_model import LogisticRegression
# Fit logistic regression on the training count matrix; max_iter is raised to
# 1000 and n_jobs=-1 is passed to use all cores.
lr_model = LogisticRegression(max_iter=1000, n_jobs=-1)
lr_model.fit(X_train_bow, y_train)
# NOTE: this rebinds `y_pred`, replacing the Naive Bayes predictions stored
# under the same name by the previous cell.
y_pred = lr_model.predict(X_test_bow)
# `accuracy2` is read again by the model-comparison cell ("Logistic Regression" entry).
accuracy2 = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Logistic Regression model: {accuracy2 * 100:.2f}%")

Accuracy of the Logistic Regression model: 89.59%


In [44]:
# random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Fit a 100-tree random forest (seeded with random_state=42) on the training
# count matrix, predict the test set and report accuracy.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_bow, y_train)
y_pred_rf = rf_model.predict(X_test_bow)
# `accuracy_rf` is read again by the model-comparison cell.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of the Random Forest model: {accuracy_rf * 100:.2f}%")

Accuracy of the Random Forest model: 88.88%


In [45]:
# k nearest neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
# Fit a 5-nearest-neighbours classifier on the training count matrix, predict
# the test set and report accuracy.
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn_model.fit(X_train_bow, y_train)
y_pred_knn = knn_model.predict(X_test_bow)
# `accuracy_knn` is read again by the model-comparison cell.
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy of the KNN model: {accuracy_knn * 100:.2f}%")

Accuracy of the KNN model: 55.06%


In [46]:
#  decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# Fit a single decision tree (seeded with random_state=42) on the training
# count matrix, predict the test set and report accuracy.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_bow, y_train)
y_pred_dt = dt_model.predict(X_test_bow)
# `accuracy_dt` is read again by the model-comparison cell.
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy of the Decision Tree model: {accuracy_dt * 100:.2f}%")

Accuracy of the Decision Tree model: 88.12%


In [47]:
# svm classifier
from sklearn.svm import SVC
# Fit a support vector classifier with a linear kernel on the training count
# matrix, predict the test set and report accuracy.
svm_model = SVC(kernel="linear", random_state=42)
svm_model.fit(X_train_bow, y_train)
y_pred_svm = svm_model.predict(X_test_bow)
# `accuracy_svm` is read again by the model-comparison cell.
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy of the SVM model: {accuracy_svm * 100:.2f}%")

Accuracy of the SVM model: 88.22%


In [48]:
# gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier
# Fit scikit-learn gradient boosting (defaults, seeded with random_state=42)
# on the training count matrix, predict the test set and report accuracy.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_bow, y_train)
y_pred_gb = gb_model.predict(X_test_bow)
# `accuracy_gb` is read again by the model-comparison cell.
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Accuracy of the Gradient Boosting model: {accuracy_gb * 100:.2f}%")

Accuracy of the Gradient Boosting model: 84.75%


In [49]:
# xgboost classifier
from xgboost import XGBClassifier
# Fit an XGBoost classifier (defaults apart from n_jobs=-1) on the training
# count matrix, predict the test set and report accuracy.
xgb_model = XGBClassifier(n_jobs=-1)
xgb_model.fit(X_train_bow, y_train)
y_pred_xgb = xgb_model.predict(X_test_bow)
# `accuracy_xgb` is read again by the model-comparison cell.
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy of the XGBoost model: {accuracy_xgb * 100:.2f}%")

Accuracy of the XGBoost model: 89.50%


In [50]:
# catboost classifier
from catboost import CatBoostClassifier
# Fit a CatBoost classifier on the training count matrix with verbose=0
# (presumably to silence its per-iteration training log — confirm), then
# predict the test set and report accuracy.
catboost_model = CatBoostClassifier(verbose=0)
catboost_model.fit(X_train_bow, y_train)
y_pred_catboost = catboost_model.predict(X_test_bow)
# `accuracy_catboost` is read again by the model-comparison cell.
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"Accuracy of the CatBoost model: {accuracy_catboost * 100:.2f}%")

Accuracy of the CatBoost model: 89.56%


In [51]:
# Collect the test accuracy of every model trained above, keyed by display name.
accuracies = {
    "Naive Bayes": accuracy,
    "Logistic Regression": accuracy2,
    "Random Forest": accuracy_rf,
    "KNN": accuracy_knn,
    "Decision Tree": accuracy_dt,
    "SVM": accuracy_svm,
    "Gradient Boosting": accuracy_gb,
    "XGBoost": accuracy_xgb,
    "CatBoost": accuracy_catboost
}

# One row per model, best accuracy first, with a fresh 0..n-1 index.
accuracies_df = (
    pd.DataFrame(accuracies.items(), columns=["Model", "Accuracy"])
    .sort_values(by="Accuracy", ascending=False)
    .reset_index(drop=True)
)

In [52]:
# Display the model ranking (accuracy as a fraction, highest first).
accuracies_df

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.895938
1,CatBoost,0.895625
2,XGBoost,0.895
3,Random Forest,0.88875
4,SVM,0.882188
5,Decision Tree,0.88125
6,Gradient Boosting,0.8475
7,Naive Bayes,0.773125
8,KNN,0.550625
