In [83]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Reading Dataset 

In [84]:
# Load the emotion dataset with read_csv's default separator.
# (A delimiter=';' variant was considered by the author but is not enabled.)
emotions = pd.read_csv('Emotion.csv')
# Remember the source file name on the frame object itself.
emotions.dataframeName = 'Emotion.csv'

# Report the size of the loaded table.
nRow = emotions.shape[0]
nCol = emotions.shape[1]
print(f'There are {nRow} rows and {nCol} columns')

There are 5517 rows and 2 columns


In [85]:
# Expose the label column under the fixed name 'Sentiment', which
# CommentContainer.get_y reads. Position 1 is selected as a single Series
# (not the open-ended DataFrame slice iloc[:, 1:]) so the cell can be re-run:
# once 'Sentiment' has been appended, the open-ended slice would span two
# columns and the single-column assignment would raise.
# NOTE(review): assumes the label is the second column of Emotion.csv - confirm.
emotions['Sentiment'] = emotions.iloc[:, 1]

# Preview the data. Only a cell's last expression is rendered, so the preview
# lives at the end of the cell.
emotions.head(50)


# Data Class

In [86]:
class Emotions:
    """String constants naming the five emotion categories."""

    # NOTE(review): the predictions printed in this notebook are lower-case
    # ('joy', 'sadness'); these upper-case values will not compare equal to
    # them as-is - confirm the intended casing before using these constants.
    SADNESS = 'SADNESS'
    JOY = 'JOY'
    FEAR = 'FEAR'
    ANGER = 'ANGER'
    LOVE = 'LOVE'
    
class Comments:
    """A single comment: its text paired with its sentiment label."""

    def __init__(self, text, sentiment):
        """Store the given text and sentiment on the instance."""
        self.text, self.sentiment = text, sentiment
        
class CommentContainer:
    """Wrap a table of comments and expose its text and label columns.

    `comments` is any object indexable by the keys 'Text' and 'Sentiment'
    (in this notebook, a pandas DataFrame split).
    """

    def __init__(self, comments):
        """Keep a reference to the underlying comment table."""
        self.comments = comments

    def get_text(self):
        """Return the 'Text' column of the wrapped table."""
        return self.comments['Text']

    def get_x(self, vectorizer):
        """Return the text column passed through `vectorizer.transform`.

        Args:
            vectorizer: object exposing a `transform(documents)` method.

        Returns:
            Whatever `vectorizer.transform` returns for the text column.
        """
        # get_text must be *called*; passing the bound method itself would
        # hand the vectorizer a function object instead of the documents.
        return vectorizer.transform(self.get_text())

    def get_y(self):
        """Return the 'Sentiment' column of the wrapped table."""
        return self.comments['Sentiment']
    


# Training and testing Data

In [87]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
training_data, test_data = train_test_split(
    emotions,
    test_size=0.3,
    random_state=42,
)

# Wrap each split so text and labels are pulled out the same way.
train_container = CommentContainer(training_data)
test_container = CommentContainer(test_data)

# Text column (model input) and sentiment column (target) for each split.
train_x, train_y = train_container.get_text(), train_container.get_y()
test_x, test_y = test_container.get_text(), test_container.get_y()



# Bag of words

In [88]:
# Fit the TF-IDF vectorizer on the training text only and vectorise it.
vectorizer = TfidfVectorizer()
training_x_vectors = vectorizer.fit_transform(train_x)
# The held-out text is transformed with the already-fitted vectorizer (no refit).
test_x_vectors = vectorizer.transform(test_x)

# Show the first training row as a dense array.
first_training_row = training_x_vectors[0]
print(first_training_row.toarray())


[[0. 0. 0. ... 0. 0. 0.]]


# Classification

# Linear SVM

In [89]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf_svm = svm.SVC(kernel='linear').fit(training_x_vectors, train_y)

# Predicted label for the first held-out comment.
clf_svm.predict(test_x_vectors[0])

array(['sadness'], dtype=object)

# Logistic Regression


In [90]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf_log = LogisticRegression().fit(training_x_vectors, train_y)

# Predicted label for the first held-out comment.
clf_log.predict(test_x_vectors[0])


array(['joy'], dtype=object)

# Decision Tree (the classifier variable is still named `clf_gnb`)

In [91]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# NOTE: despite the `clf_gnb` name, the model trained here is a decision tree;
# GaussianNB is imported but not used in this cell. The name is kept because
# later cells refer to `clf_gnb`.
# random_state is pinned (same seed as the train/test split) so the fitted tree,
# and therefore its reported accuracy, is reproducible across runs.
clf_gnb = DecisionTreeClassifier(random_state=42)

clf_gnb.fit(training_x_vectors, train_y)

# Predicted label for the first held-out comment.
clf_gnb.predict(test_x_vectors[0])

array(['sadness'], dtype=object)

# Evaluation

In [92]:
# Mean accuracy on the held-out split, one line per model, printed in the order:
# linear SVM, `clf_gnb` (a DecisionTreeClassifier), logistic regression.
for fitted_model in (clf_svm, clf_gnb, clf_log):
    print(fitted_model.score(test_x_vectors, test_y))

0.803743961352657
0.8176328502415459
0.6986714975845411


# testing new data 

In [93]:
# Hand-written sentences used to spot-check the linear SVM.
test_set = [
    'very fun',
    'bad book do not buy',
    'horrible waste of time',
    'i dont have internet connection',
    'i hate you',
]
# Vectorise with the vectorizer fitted on the training text.
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)


array(['joy', 'sadness', 'sadness', 'sadness', 'sadness'], dtype=object)

In [94]:
# Hand-written sentences used to spot-check the logistic regression model.
test_set = [
    'very fun',
    'bad book do not buy',
    'horrible waste of time',
    'i dont have internet connection',
    'i hate you',
]
# Vectorise with the vectorizer fitted on the training text.
new_test = vectorizer.transform(test_set)

clf_log.predict(new_test)

array(['joy', 'sadness', 'sadness', 'joy', 'joy'], dtype=object)

In [95]:
# Hand-written sentences used to spot-check `clf_gnb`.
test_set = [
    'very fun',
    'bad book do not buy',
    'horrible waste of time',
    'i dont have internet connection',
    'i hate you',
]
# Vectorise with the vectorizer fitted on the training text.
new_test = vectorizer.transform(test_set)

clf_gnb.predict(new_test)

array(['joy', 'sadness', 'sadness', 'joy', 'joy'], dtype=object)