In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# The file has no header row: each line is "<text>;<emotion>", so the column
# names are supplied explicitly.
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)
df.head()



Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [3]:
## Checking for null values: count of missing entries per column
df.isna().sum()

text       0
emotion    0
dtype: int64

In [9]:
## Encode each unique emotion label as an integer id
# Series.unique() returns labels in order of first appearance, so ids are
# assigned in the order the emotions first occur in the file.
unique_emotions=df['emotion'].unique()
emotion_numbers={emotion:idx for idx,emotion in enumerate(unique_emotions)}

# Replace the string labels with their integer ids.
df['emotion']=df['emotion'].map(emotion_numbers)
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [11]:
## Lowercase every entry of the text column, overwriting the column in place
df['text']=df['text'].str.lower()
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [14]:
## removing punctuation
import string
# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call, because it never changes.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
    """Return ``txt`` with all punctuation characters removed.

    Args:
        txt: The string to clean.

    Returns:
        A copy of ``txt`` in which every character listed in
        ``string.punctuation`` has been deleted. All other characters,
        including whitespace, are left untouched.
    """
    return txt.translate(_PUNCTUATION_TABLE)

In [15]:
# apply() returns a new Series; it must be assigned back, otherwise the
# punctuation-free text is computed and then thrown away.
# NOTE(review): the HTML-tag cleaner in a later cell matches on '<' and '>',
# which this step deletes - consider running that cleaner before this one.
df['text']=df['text'].apply(remove_punc)
df['text']

0                                  i didnt feel humiliated
1        i can go from feeling so hopeless to so damned...
2         im grabbing a minute to post i feel greedy wrong
3        i am ever feeling nostalgic about the fireplac...
4                                     i am feeling grouchy
                               ...                        
15995    i just had a very brief time in the beanbag an...
15996    i am now turning and i feel pathetic that i am...
15997                       i feel strong and good overall
15998    i feel like this was such a rude comment and i...
15999    i know a lot but i feel so stupid because i ca...
Name: text, Length: 16000, dtype: object

In [16]:
## remove numbers
def remove_num(txt):
    """Return ``txt`` with the ASCII digits 0-9 removed.

    Args:
        txt: The string to clean.

    Returns:
        A new string containing every character of ``txt`` except the ten
        ASCII digits. Digits from other scripts are kept.
    """
    # join() builds the result in one pass instead of re-allocating the
    # string on every kept character.
    return "".join(ch for ch in txt if ch not in "0123456789")

# Run remove_num on every row and overwrite the text column with the result.
df['text']=df['text'].apply(remove_num)


In [18]:
## Remove urls and links
import re

def remove_url(text):
    """Strip URL-like tokens from ``text``.

    Any run of non-whitespace characters that starts with ``http`` or
    ``www`` is deleted, wherever it begins in the string. Whitespace around
    the removed token is left as it was.
    """
    url_pattern = re.compile(r'http\S+|www\S+|https\S+')
    return url_pattern.sub('', text)

# Run remove_url on every row and overwrite the text column with the result.
df['text']=df['text'].apply(remove_url)


In [19]:
## Remove html tags

def remove_html_tags(text):
    """Delete every ``<...>`` span from ``text``.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is preserved. A lone ``<`` with no closing ``>`` after it
    is left in place.
    """
    tag_pattern = r'<.*?>'
    without_tags = re.sub(tag_pattern, '', text)
    return without_tags

# Run remove_html_tags on every row and overwrite the text column with the result.
df['text']=df['text'].apply(remove_html_tags)

In [21]:
## remove emojis and special characters
def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character removed.

    This drops emojis, but also any other character outside the ASCII range,
    such as accented letters.

    Args:
        txt: The string to clean.

    Returns:
        A new string made of only the ASCII characters of ``txt``, in their
        original order.
    """
    # join() builds the result in one pass instead of re-allocating the
    # string on every kept character.
    return "".join(ch for ch in txt if ch.isascii())

# Run remove_emojis on every row and overwrite the text column with the result.
df['text']=df['text'].apply(remove_emojis)

In [31]:
## remove stopwords
import nltk

In [25]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [35]:
# Fetch the NLTK resources used by the following cells.
# 'stopwords' backs the stopwords.words('english') lookup.
nltk.download('stopwords')
# 'punkt' / 'punkt_tab' are presumably the tokenizer data that word_tokenize
# loads - TODO confirm which of the two the installed NLTK version needs.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [27]:
# English stopword list held as a set, which remove() tests tokens against.
stop_words = set(stopwords.words('english'))
 

In [28]:
# Sample row to compare against after stopword removal.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [29]:
def remove(txt):
    """Drop stopwords from ``txt``.

    The text is split with ``word_tokenize`` and every token found in the
    module-level ``stop_words`` set is discarded. The surviving tokens are
    joined back together with single spaces, in their original order.
    """
    kept_tokens = [token for token in word_tokenize(txt) if token not in stop_words]
    return " ".join(kept_tokens)
     

In [None]:
# Run the stopword filter on every row and overwrite the text column with the result.
df['text']=df['text'].apply(remove)

In [38]:
# Same sample row, now with stopwords stripped.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus used only to illustrate what a bag-of-words encoding looks like.
documents = ["I love pizza", "Pizza is the best", "I love pasta", "Pasta is great"]

# Learn the vocabulary and count term occurrences per document in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# One row per document, one column per vocabulary entry.
print("Vocabulary:", vectorizer.get_feature_names_out())
print(X.toarray())

Vocabulary: ['best' 'great' 'is' 'love' 'pasta' 'pizza' 'the']
[[0 0 0 1 0 1 0]
 [1 0 1 0 0 1 1]
 [0 0 0 1 1 0 0]
 [0 1 1 0 1 0 0]]


In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.20,
    random_state=42,
)


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [45]:
# Bag-of-words vectorizer with default settings; fitted on the training split in the next cell.
bow_vectorizer=CountVectorizer()


In [51]:
# Learn the vocabulary from the training split only, then encode the test
# split with that same fitted vectorizer.
X_train_bow=bow_vectorizer.fit_transform(X_train)
X_test_bow=bow_vectorizer.transform(X_test)
# Only the last expression of a cell is rendered, so show the test matrix here.
X_test_bow


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 26934 stored elements and shape (3200, 13357)>

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


In [53]:
# Multinomial Naive Bayes on the bag-of-words counts. fit() returns the
# estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_bow, y_train)
nb_model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [56]:
# Predicted emotion ids for the held-out bag-of-words matrix.
pred_nb=nb_model.predict(X_test_bow)

In [59]:
# Accuracy of the bag-of-words Naive Bayes model on the test split.
print(accuracy_score(y_test,pred_nb))

0.7678125


In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features: the vectorizer is fitted on the training split only and
# then reused unchanged to transform the test split.
# (The variable name's spelling is kept as-is, since other cells may refer to it.)
tfifd_vectorizer = TfidfVectorizer()
X_train_tfidf = tfifd_vectorizer.fit_transform(X_train)
X_test_tfidf = tfifd_vectorizer.transform(X_test)

# Multinomial Naive Bayes trained on the TF-IDF matrix; fit() returns the
# estimator, so it is assigned directly and then displayed.
nb2_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb2_model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [67]:
# Score the TF-IDF Naive Bayes model on the held-out split.
y_pred = nb2_model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred))

0.6609375


In [68]:
from sklearn.linear_model import LogisticRegression

In [69]:
# Logistic regression on the TF-IDF features, with the solver's iteration
# cap set to 1000. fit() returns the estimator, so it is assigned directly
# and then displayed.
logistic_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
logistic_model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [70]:
# Score the logistic regression model on the held-out TF-IDF matrix.
log_pred=logistic_model.predict(X_test_tfidf)
print(accuracy_score(y_test, log_pred))

0.8621875
