In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
nltk.download('wordnet') #For using WordNetLemmatizer
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/saikat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/saikat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
data=pd.read_csv("text_emotion.csv")
print("Dataset shape is:",data.shape)
data.head()

Dataset shape is: (40000, 4)


Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [4]:
sentiment_types=data["sentiment"].value_counts()
print("Types of sentiments are:\n",sentiment_types)

Types of sentiments are:
 sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64


In [5]:
#Changing the categorical variables into numeric form.
mapping={"neutral":0,"worry":1,"happiness":2,"sadness":3,"love":4,"surprise":5,"fun":6,"relief":7,"hate":8,"empty":9,"enthusiasm":10,"boredom":11,"anger":12}
data["sentiment"]=data["sentiment"].map(mapping)

#Verifying.
sentiment_types=data["sentiment"].value_counts()
print("Types of sentiments are:\n",sentiment_types)

Types of sentiments are:
 sentiment
0     8638
1     8459
2     5209
3     5165
4     3842
5     2187
6     1776
7     1526
8     1323
9      827
10     759
11     179
12     110
Name: count, dtype: int64


***Now we create a function to tokenize the tweets,remove stopwords(i.e frequently used insignificant words).***

In [6]:
def process_text(text:str):
    '''
    This function tokenizes the tweets, removes numeric char based words ,converts all the chars into lowercase and then lemmatizes the words.
    The reason we use lemmatization is ,it uses wordnet library to look up origin of a word.Like better/best will be converted to good.
    Here we should not use stop words as it will remove words like not/a/very ,since these kind of words which are important for sentiment classification.
    Note:Read about Lemmatization and Stemming and their difference .(Stemming just chops last section of the word)
    '''
    # stop_words=set(stopwords.words('english'))
    word_lemmatizer=WordNetLemmatizer()  #It's a class
    tokens=word_tokenize(text)
    tokens=[word for word in tokens if word.isalpha() ]  #and len(word)>=2
    tokens=[word.lower() for word in tokens ]
    # tokens=[word for word in tokens if word not in stop_words]
    tokens=[word_lemmatizer.lemmatize(word) for word in tokens ]

    return ' '.join(tokens)

#Veryfying the function
text="Hello! not  so Good Morning."
process_text(text)

'hello not so good morning'

In [7]:
#Applying the process_text function on the tweet column
data['processesed tweets']=data['content'].apply(process_text)
data.head()

Unnamed: 0,tweet_id,sentiment,author,content,processesed tweets
0,1956967341,9,xoshayzers,@tiffanylue i know i was listenin to bad habi...,tiffanylue i know i wa listenin to bad habit e...
1,1956967666,3,wannamama,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhh waitin on y...
2,1956967696,3,coolfunky,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday
3,1956967789,10,czareaquino,wants to hang out with friends SOON!,want to hang out with friend soon
4,1956968416,0,xkilljoyx,@dannycastillo We want to trade with someone w...,dannycastillo we want to trade with someone wh...


In [8]:
#Creating feature vector from text using Bag of Words with bigram model .
'''In unigram we just take the feature as occurance of a particular word .In ngram we take feature as the occurence of a sequence of n words ,which help us in getting some context.
For example I'm not feeling good sentence is taken as a bigram or trigram model manner then we get the  context of negation otherwise individual word occurence like in unigram does not 
mean anything.
Note:To know more read about N-gram model.
'''
bg_vectorizor=CountVectorizer(ngram_range=(1,2))
X_bow=bg_vectorizor.fit_transform(data['processesed tweets'])

#Label.
Y=data['sentiment']

#Creating feature vector from text using Tf-Idf with bigram model.
tf_vectorizor=TfidfVectorizer(ngram_range=(1,2))
X_tf=tf_vectorizor.fit_transform(data['processesed tweets'])

print(f'Bag of Words based feature vector shape:{X_bow.shape}\n TF-IDF based feature vector shape:{X_tf.shape}')

Bag of Words based feature vector shape:(40000, 243282)
 TF-IDF based feature vector shape:(40000, 243282)


In [10]:
#Train test split for vectors created using bag of words
X_bow_train,X_bow_test,Y_train,Y_test=train_test_split(X_bow,Y,test_size=0.25,random_state=42)
#Train test split for vector/features created using TF_IDF
X_tf_train,X_tf_test,Y_train,Y_test=train_test_split(X_tf,Y,test_size=0.25,random_state=42)

***Classifying using Logistic Regression***

In [13]:
#Fitting on extracted feature using bag of words.
#Note:Here since class imbalance exists so we set class weight to balanced,so that the algo can give more importance to minority class automatically.For more info check scikit-learn docs.
clf_bow_log=LogisticRegression(class_weight="balanced",solver="sag",multi_class="multinomial") 
clf_bow_log.fit(X_bow_train,Y_train)
y_pred_bow=clf_bow_log.predict(X_bow_test)
print("Bag of Words Accuracy:",accuracy_score(Y_test,y_pred_bow))
print(classification_report(Y_test,y_pred_bow))

#Fitting on TF-IDF extracted feature.
clf_tf_log=LogisticRegression(class_weight="balanced")
clf_tf_log.fit(X_tf_train,Y_train)
y_pred_tf=clf_tf_log.predict(X_tf_test)
print("TF-IDF accuracy is:",accuracy_score(Y_test,y_pred_tf))
print(classification_report(Y_test,y_pred_tf))



Bag of Words Accuracy: 0.3305
              precision    recall  f1-score   support

           0       0.39      0.50      0.44      2183
           1       0.35      0.33      0.34      2093
           2       0.34      0.34      0.34      1288
           3       0.32      0.32      0.32      1314
           4       0.43      0.43      0.43       941
           5       0.15      0.13      0.14       516
           6       0.13      0.11      0.12       421
           7       0.20      0.14      0.16       435
           8       0.29      0.28      0.29       332
           9       0.04      0.03      0.03       203
          10       0.01      0.00      0.01       204
          11       0.04      0.02      0.03        43
          12       0.00      0.00      0.00        27

    accuracy                           0.33     10000
   macro avg       0.21      0.20      0.20     10000
weighted avg       0.32      0.33      0.32     10000

TF-IDF accuracy is: 0.3046
              precisio

***Now Classifying using SVM with rbf kernel.***

In [11]:
clf_bow_svm=SVC(class_weight="balanced",random_state=42)
clf_bow_svm.fit(X_bow_train,Y_train)
y_pred_bow=clf_bow_svm.predict(X_bow_test)
print("Bag of Words Accuracy:",accuracy_score(Y_test,y_pred_bow))
print(classification_report(Y_test,y_pred_bow))

#Fitting on TF-IDF extracted feature.
clf_tf_svm=SVC(class_weight="balanced")
clf_tf_svm.fit(X_tf_train,Y_train)
y_pred_tf=clf_tf_svm.predict(X_tf_test)
print("TF-IDF accuracy is:",accuracy_score(Y_test,y_pred_tf))
print(classification_report(Y_test,y_pred_tf))

Bag of Words Accuracy: 0.3316
              precision    recall  f1-score   support

           0       0.38      0.45      0.41      2183
           1       0.31      0.46      0.37      2093
           2       0.32      0.34      0.33      1288
           3       0.31      0.29      0.30      1314
           4       0.46      0.37      0.41       941
           5       0.19      0.14      0.16       516
           6       0.14      0.09      0.11       421
           7       0.23      0.07      0.10       435
           8       0.39      0.19      0.26       332
           9       0.05      0.01      0.02       203
          10       0.03      0.00      0.01       204
          11       0.00      0.00      0.00        43
          12       0.00      0.00      0.00        27

    accuracy                           0.33     10000
   macro avg       0.22      0.19      0.19     10000
weighted avg       0.32      0.33      0.32     10000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TF-IDF accuracy is: 0.33
              precision    recall  f1-score   support

           0       0.30      0.66      0.41      2183
           1       0.33      0.38      0.35      2093
           2       0.38      0.26      0.31      1288
           3       0.35      0.23      0.28      1314
           4       0.51      0.34      0.41       941
           5       0.27      0.08      0.13       516
           6       0.10      0.02      0.03       421
           7       0.23      0.02      0.04       435
           8       0.45      0.14      0.21       332
           9       0.11      0.02      0.03       203
          10       0.00      0.00      0.00       204
          11       0.00      0.00      0.00        43
          12       0.00      0.00      0.00        27

    accuracy                           0.33     10000
   macro avg       0.23      0.17      0.17     10000
weighted avg       0.32      0.33      0.30     10000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
