In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the training data: a semicolon-separated file read without a header row,
# whose two fields are named "text" and "emotions" here.
df = pd.read_csv("train.txt", sep=";", header=None, names=["text", "emotions"])

In [4]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


# data cleaning and preprocessing

In [5]:
df.isnull().sum()

text        0
emotions    0
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(1)

In [7]:
# Rebind instead of mutating in place so the cell does not silently alter a
# frame that earlier cells already displayed. ignore_index=True renumbers the
# surviving rows 0..n-1, so index labels keep matching row positions (the
# bag-of-words matrix built later is addressed by position).
df = df.drop_duplicates(ignore_index=True)

In [8]:
df.duplicated().sum()

np.int64(0)

In [9]:
df["emotions"].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [10]:
# Encode the emotion labels as integers.

# Distinct labels, in the order returned by Series.unique().
unique_emotions = df["emotions"].unique()

# Lookup table label -> integer code (0, 1, 2, ... following unique_emotions).
emotion_numbers = {label: code for code, label in enumerate(unique_emotions)}

# Swap every string label in the column for its integer code.
df["emotions"] = df["emotions"].map(emotion_numbers)

In [11]:
emotion_numbers

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}

In [12]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [13]:
# converting all text to lower case
df["text"] = df["text"].str.lower()

In [14]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [15]:
# removing punctuation
import string


# Translation table that deletes every character in string.punctuation. Built
# once here rather than inside the function, which is called once per row.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any character listed in
        ``string.punctuation``; every other character, including whitespace,
        is left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)


df["text"] = df["text"].apply(remove_punctuation)

In [16]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [17]:
# removing numbers from text
def remove_numbers(text):
    """Return ``text`` with every digit character removed.

    A character is dropped when ``str.isdigit`` is true for it; all other
    characters are kept in their original order.
    """
    return "".join(char for char in text if not char.isdigit())


df["text"] = df["text"].apply(remove_numbers)

In [18]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [19]:
# removing urls/links from text
import re


def remove_urls(text):
    """Strip URL-like tokens from ``text``.

    A token is removed when it begins, at a word boundary, with ``http`` or
    ``www`` and is followed by at least one non-whitespace character; the
    match runs up to the next whitespace. Anchoring on the word boundary keeps
    ordinary words that merely contain those letters (for example "awwww")
    intact. ``http`` already covers ``https``, so no separate alternative is
    needed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each matching token replaced by the empty string.
        Surrounding whitespace is not touched, so a removed token can leave
        two adjacent spaces behind.
    """
    return re.sub(r"\b(?:http|www)\S+", "", text)


df["text"] = df["text"].apply(remove_urls)

In [20]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [21]:
# removing extra spaces from text
def remove_extra_spaces(text):
    """Collapse whitespace runs in ``text`` to single spaces and trim the ends.

    Splitting on whitespace discards every run of whitespace characters along
    with any leading or trailing whitespace; the remaining pieces are then
    joined back together with exactly one space between them.
    """
    pieces = text.split()
    return " ".join(pieces)


df["text"] = df["text"].apply(remove_extra_spaces)

In [22]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [23]:
# removing emojis from text
def remove_emojis(text):
    """Return ``text`` restricted to its ASCII characters.

    Note that the filter is broader than the name suggests: every non-ASCII
    character is dropped, so accented letters and other non-ASCII symbols are
    removed together with emojis.
    """
    # Encoding with errors="ignore" silently skips anything outside ASCII.
    ascii_only = text.encode("ascii", errors="ignore")
    return ascii_only.decode("ascii")


df["text"] = df["text"].apply(remove_emojis)

In [24]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [25]:
# removing stopwords using nltk library
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [26]:
# download stopwords
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yasho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yasho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yasho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [27]:
eng_stopwords = set(stopwords.words("english"))
eng_stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [28]:
len(eng_stopwords)

198

In [29]:
type(eng_stopwords)

set

In [30]:
df["text"].loc[1]

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [31]:
# tokenizing and removing stopwords
def remove_stopwords(text):
    """Tokenize ``text`` and drop every token found in ``eng_stopwords``.

    The text is split with ``word_tokenize``; tokens that are not members of
    the module-level ``eng_stopwords`` collection are kept in their original
    order and joined back into one string with single spaces.
    """
    kept_tokens = [
        token for token in word_tokenize(text) if token not in eng_stopwords
    ]
    return " ".join(kept_tokens)


df["text"] = df["text"].apply(remove_stopwords)

In [32]:
df["text"].loc[1]

'go feeling hopeless damned hopeful around someone cares awake'

In [33]:
df.head()

Unnamed: 0,text,emotions
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


## implementing bag of words

In [34]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

# converting the text data into a list of documents
document = df["text"].tolist()

In [36]:
X = vectorizer.fit_transform(document)
print(X)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 144749 stored elements and shape (15999, 15041)>
  Coords	Values
  (0, 3585)	1
  (0, 4886)	1
  (0, 6369)	1
  (1, 5633)	1
  (1, 4890)	1
  (1, 6282)	1
  (1, 3175)	1
  (1, 6280)	1
  (1, 690)	1
  (1, 12271)	1
  (1, 1926)	1
  (1, 916)	1
  (2, 4886)	1
  (2, 6492)	1
  (2, 5688)	1
  (2, 8370)	1
  (2, 10046)	1
  (2, 5749)	1
  (2, 14897)	1
  (3, 4890)	1
  (3, 4525)	1
  (3, 8968)	1
  (3, 5015)	1
  (3, 7349)	1
  (3, 12618)	1
  :	:
  (15994, 1106)	1
  (15995, 4886)	1
  (15995, 12618)	1
  (15995, 9540)	1
  (15995, 13823)	1
  (15995, 13080)	1
  (15995, 3359)	1
  (15995, 14444)	1
  (15995, 13184)	1
  (15995, 12793)	1
  (15996, 4886)	1
  (15996, 5657)	1
  (15996, 12734)	1
  (15996, 9316)	1
  (15997, 4886)	1
  (15997, 6492)	1
  (15997, 7655)	1
  (15997, 5583)	1
  (15997, 11279)	1
  (15997, 2492)	1
  (15998, 4886)	1
  (15998, 7349)	1
  (15998, 7826)	1
  (15998, 12779)	1
  (15998, 10016)	1


In [38]:
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary = ", vocabulary)

Vocabulary =  ['aa' 'aaaaaaand' 'aaaaand' ... 'zum' 'zumba' 'zz']


In [45]:
# Materialize the sparse count matrix X as a dense NumPy array.
# NOTE(review): X is printed above as int64 with shape (15999, 15041), so the
# dense copy holds about 240 million values (~1.9 GB). Prefer working with the
# sparse X directly if memory is tight.
BOW = X.toarray()
print("Bag of Words = \n", BOW)
print("Shape of Bag of Words = ", BOW.shape)

Bag of Words = 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Shape of Bag of Words =  (15999, 15041)


numpy.ndarray