<h2>The required packages</h2>

In [4]:
import pandas as pd
import copy
import time

<h2>The data set</h2>
<p>Since the dataset was too big, I only took like first few</p>

In [5]:
reddit_data = pd.read_csv('Reddit_Data.csv').head(1862) # 1862
reddit_data = reddit_data.dropna()
twitter_data = pd.read_csv('Twitter_Data.csv').head(3260) #3260
twitter_data = twitter_data.dropna()
df = pd.DataFrame(data=list(reddit_data['clean_comment']),columns=['lines'])
df['category'] = list(reddit_data['category'])
twitter_data = twitter_data.rename(columns={'clean_text': 'lines'})
df = df.append(twitter_data,ignore_index=True)
df.to_csv('final_dataset.csv')
df

Unnamed: 0,lines,category
0,family mormon have never tried explain them t...,1.0
1,buddhism has very much lot compatible with chr...,1.0
2,seriously don say thing first all they won get...,-1.0
3,what you have learned yours and only yours wha...,0.0
4,for your own benefit you may want read living ...,1.0
...,...,...
5114,dear rahul\nwhat new minimum income scheme 720...,-1.0
5115,promises times support poorest poor comparison...,-1.0
5116,wah bhakt dont you know the difference between...,0.0
5117,well plz dont geberalise kolkata for bengal vi...,1.0


<h2>Naive Bayes Classifier class</h2>

In [17]:
class NaiveBayes:
    def __init__(self,df):
        self.words = []
        self.lines = list(df['lines'])
        self.sentiments = list(df['category'])
    def word_extraction(self):
        for line in self.lines:
            if(type(line) is not str): # some comments were actually of float type instead of string so I put this if block here to convert them to string and then train them
                i = self.lines.index(line)
                line = str(line)
                self.lines[i] = line
            w = line.strip().split()
            for x in w:
                if(self.words.count(x) == 0):
                    self.words.append(x)
    def train_model(self): # training the model
        self.word_extraction() # calling the word extractor function to extract words from the dataset
        positive = [] # count of times when the particular word is being used for positive sentiment
        negative = [] # count of times when the particular word is being used for negative sentiment
        neutral = [] # count of times when the particular word is being used for neutral sentiment
        total = [] # count of times the particular word is used in dataset
        for i in range(0,len(self.words)):
            positive.append(0)
            negative.append(0)
            neutral.append(0)
            for j in range(0,len(self.lines)):
                if(self.sentiments[j] == -1): # negative
                    negative[i] += self.lines[j].count(self.words[i])
                elif(self.sentiments[j] == 0): # neutral
                    neutral[i] += self.lines[j].count(self.words[i])
                elif(self.sentiments[j] == 1): # positive
                    positive[i] += self.lines[j].count(self.words[i])
            total.append(positive[i]+negative[i]+neutral[i])
        positive.append(sum(positive)) # count of total comments with positive sentiments
        neutral.append(sum(neutral)) # count of total comments with neutral sentiments
        negative.append(sum(negative)) # count of total comments with negative sentiments
        l = len(positive)
        total.append(positive[l-1]+negative[l-1]+neutral[l-1]) # total comments we are training (it will be a fixed value so I don't know why I still did it, haha I don't know how to put emojis in comment, so understand the meotions here "smile in pain")
        count = [negative, neutral, positive, total] # data for the lookup table data frame
        self.make_lookup(count)
        print('Model trained.')
    def make_lookup(self,count): # for creating a loopup table data frame
        col = copy.deepcopy(self.words)
        self.V = len(self.words)
        col.append('_TOTAL_')
        self.lookup = pd.DataFrame(count,columns=col,index=['NEGATIVE','NEUTRAL','POSITIVE','TOTAL'])
        self.lookup.to_csv('lookup-naive-bayes.csv')
    def extract_keywords(self,doc): # to extract the required keywords from the input sentence (the words that are already present in our lookup table)
        words = doc.strip().lower().split()
        keywords = []
        for i in range(0,len(words)):
            if(type(words[i]) is not str):
                words[i] = str(words[i])
            if(self.words.count(words[i]) != 0):
                keywords.append(words[i])
        return keywords
    def make_predictions(self,doc,alpha=0):
        keywords = self.extract_keywords(doc)
        #conditionProb = [1,1,1] # conditional probabilties
        P = [self.sentiments.count(-1), self.sentiments.count(0), self.sentiments.count(1)] # prior probabilites
        for i in range(0,3):
            P[i] /= len(self.sentiments)
        for k in keywords:
            # for negative
            P[0] *= (self.lookup[k][0] + alpha) / (self.lookup['_TOTAL_'][0] + (alpha*self.V))
            # for neutral
            P[1] *= (self.lookup[k][1] + alpha) / (self.lookup['_TOTAL_'][1] + (alpha*self.V))
            # for positive
            P[2] *= (self.lookup[k][2] + alpha) / (self.lookup['_TOTAL_'][2] + (alpha*self.V))
        res = [P]
        if(P[0] > P[1] and P[0] > P[2]):
            res.append('Negative')
        elif(P[1] > P[0] and P[1] > P[2]):
            res.append('Neutral')
        elif(P[2] > P[0] and P[2] > P[1]):
            res.append('Positive')
        else:
            res.append('Unpredictable')
        return res
    def make_predictions_multiline(self,doc,alpha=0):
        count = [0,0,0,0] # so it is like [ negative, neutral, positive, unpredictable]
        res = []
        X = doc.strip().split('\n')
        for x in X:
            y = self.make_predictions(x,alpha)
            # print(y[0],'\t',y[1])
            if(y[1] == 'Negative'):
                count[0] += 1
            elif(y[1] == 'Neutral'):
                count[1] += 1
            elif(y[1] == 'Positive'):
                count[2] += 1
            elif(y[1] == 'Unpredictable'):
                count[3] += 1
        self.print_percent(count)
    def print_percent(self,count):
        percent = []
        for c in count:
            percent.append((c*100)/sum(count))
        print('Results:\n')
        if(percent[3] < 50.0):
            print('Positive: ',percent[2])
            print('Negative: ',percent[0])
            print('Neutral: ',percent[1])
        else:
            print('Unpredictable: ',percent[3])

In [18]:
startTime = time.time()
model = NaiveBayes(df)
model.train_model()
endTime = time.time()
totalTime = endTime-startTime
print('Training time: ',totalTime)

Model trained.
Training time:  82.30496001243591


In [21]:
file = open('lyrics.txt','r')
X = file.read()
print(X)

How long have you been smiling?
It seems like it's been too long
Some days I don't feel like trying
So what the fuck are you on
Whoa, ohh
I think too much, we drink too much
Falling in love like it's just nothing
I want to know where do we go
When nothing's wrong
'Cause all the kids are depressed
Nothing ever makes sense
I'm not feeling alright
Staying up 'til sunrise
And hoping shit is okay
Pretending we know things
I don't know what happened
My natural reaction is that we're scared
Ohh-oh-oh
Noo-oo-oo
Ohh-oh-oh...
So I guess we're scared
Ohh-oh-oh, ohh-oh-oh


In [23]:
epoch = 10
for i in range(0,epoch):
    print('\n\nAlpha: ',i)
    model.make_predictions_multiline(X,alpha=i)



Alpha:  0
Results:

Positive:  81.81818181818181
Negative:  13.636363636363637
Neutral:  4.545454545454546


Alpha:  1
Results:

Positive:  81.81818181818181
Negative:  13.636363636363637
Neutral:  4.545454545454546


Alpha:  2
Results:

Positive:  86.36363636363636
Negative:  9.090909090909092
Neutral:  4.545454545454546


Alpha:  3
Results:

Positive:  86.36363636363636
Negative:  9.090909090909092
Neutral:  4.545454545454546


Alpha:  4
Results:

Positive:  90.9090909090909
Negative:  4.545454545454546
Neutral:  4.545454545454546


Alpha:  5
Results:

Positive:  90.9090909090909
Negative:  4.545454545454546
Neutral:  4.545454545454546


Alpha:  6
Results:

Positive:  90.9090909090909
Negative:  4.545454545454546
Neutral:  4.545454545454546


Alpha:  7
Results:

Positive:  95.45454545454545
Negative:  0.0
Neutral:  4.545454545454546


Alpha:  8
Results:

Positive:  95.45454545454545
Negative:  0.0
Neutral:  4.545454545454546


Alpha:  9
Results:

Positive:  95.45454545454545
Negati