In [1]:
import numpy as np
import math
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

In [2]:
dfTrain = pd.read_csv('question-4-train-features.csv')
dfTrainLabel = pd.read_csv('question-4-train-labels.csv')
dfTestFeature = pd.read_csv('question-4-test-features.csv')
dfTestLabel = pd.read_csv('question-4-test-labels.csv')

In [3]:
dfTrainLabel['neutral'].value_counts()

negative    7091
neutral     2616
positive    2004
Name: neutral, dtype: int64

In [4]:
class MultinomialNB:
    def __init__(self):
        self.p_of_positive = [] 
        self.p_of_negative = [] 
        self.p_of_neutral = [] 
        self.logPriorPos = 0
        self.logPriorNeg = 0
        self.logPriorNeu = 0
        self.nlarge_pos = 0
        self.nlarge_neg = 0
        self.nlarge_neu = 0
    def train(self,alpha,X,y):
    #CHANGING THE COLUMN NAMES FROM 0 TO 5721 FOR EASE OF OPERATION
        arr= np.arange(0,len(X.columns))
        ar1 = list(arr)
        X.columns = ar1
        #ALSO ADDING THE LABELS TO FEATURES FOR GROUPING THE DATA
        X['Labels'] = y
        
        #CALCULATING THE WORDS APPEARING IN GIVEN LABEL AND CALCULATING ALL OF THE WORDS IN A GIVEN LABEL
        sum_of_positive = X[X['Labels'] == "positive"].sum(numeric_only=True)
        sum_of_positive += alpha
        total_sum_of_positive = sum_of_positive.sum()
        self.nlarge_pos = sum_of_positive.nlargest(20)
        
        sum_of_negative = X[X['Labels'] == "negative"].sum(numeric_only=True)
        sum_of_negative += alpha
        total_sum_of_negative = sum_of_negative.sum()
        self.nlarge_neg = sum_of_negative.nlargest(20)

        sum_of_neutral = X[X['Labels'] == "neutral"].sum(numeric_only=True)
        sum_of_neutral += alpha
        total_sum_of_neutral = sum_of_neutral.sum()
        self.nlarge_neu = sum_of_neutral.nlargest(20)

        #CALCULATING THE PROBABILITY OF EACH WORD APPEARING FOR A GIVEN LABEL, THIS IS MY TRAINING DATA
        self.p_of_positive = sum_of_positive / (total_sum_of_positive + (alpha*len(X.columns)))
        
        self.p_of_negative = sum_of_negative / (total_sum_of_negative + (alpha*len(X.columns)))
        
        self.p_of_neutral = sum_of_neutral / (total_sum_of_neutral + (alpha*len(X.columns)))

        #CALCULATING THE PRIORS ACCORDING TO THE VALUE COUNT ABOVE
        self.logPriorPos = math.log(X['Labels'].value_counts()[2]/(X['Labels'].value_counts().sum()))
        self.logPriorNeg= math.log(X['Labels'].value_counts()[0]/(X['Labels'].value_counts().sum()))
        self.logPriorNeut= math.log(X['Labels'].value_counts()[1]/(X['Labels'].value_counts().sum()))
        
        #Droping dataframe to past form to further use
        X.drop(['Labels'], axis=1,inplace = True)
        
    def test(self, feat, label):
        #TESTING FOR POSITIVE, ESTIMATING THE PROBABILITIES OF BEING POSITIVE
        predict_pos = []
        pre_pos_df = pd.DataFrame()
        for index,row in feat.iterrows():
            indexx = 0
            for j in row:
                #0*log0 is 0 as the assignment wants
                if self.p_of_positive[indexx] == 0 and j == 0:
                    predict_pos.append(0)
                #a*log0 is -inf, and practically e^(-10000) is minus infinity
                elif self.p_of_positive[indexx] == 0 and j != 0:
                    predict_pos.append(-100000)
                else:
                    predict_pos.append((math.log(self.p_of_positive[indexx])*j ))
                indexx +=1
        pre_pos_df['prob'] = predict_pos
        
        #TESTING FOR NEUTRAL, ESTIMATING THE PROBABILITIES OF BEING NEUTRAL
        predict_neu = []
        pre_neu_df = pd.DataFrame()
        for index,row in feat.iterrows():
            indexx = 0
            for j in row:
                #0*log0 is 0 as the assignment wants
                if self.p_of_neutral[indexx] == 0 and j == 0:
                    predict_neu.append(0)
                #a*log0 is -inf, and practically e^(-10000) is minus infinity
                elif self.p_of_neutral[indexx] == 0 and j != 0:
                    predict_neu.append(-100000)
                else:
                    predict_neu.append((math.log(self.p_of_neutral[indexx])*j))
                indexx += 1
        pre_neu_df['Prob'] = predict_neu
        
        #TESTING FOR NEGATIVE, ESTIMATING THE PROBABILITIES OF BEING NEGATIVE FOR EACH WORD
        predict_neg = []
        pre_neg_df = pd.DataFrame()
        for index,row in feat.iterrows():
            indexx = 0
            for j in row:
                #0*log0 is 0 as the assignment wants
                if self.p_of_negative[indexx] == 0 and j == 0:
                    predict_neg.append(0)
                #a*log0 is -inf, and practically e^(-10000) is minus infinity
                elif self.p_of_negative[indexx] == 0 and j != 0:
                    predict_neg.append(-100000)
                else:
                    predict_neg.append((math.log(self.p_of_negative[indexx])*j ))
                indexx += 1   
        pre_neg_df['prob'] = predict_neg
        
        #SPLIT THE LIST ABOVE TO GET PROBABILITIES FOR EACH WORD
        pre_split_neg = np.split(pre_neg_df,2927)
        pre_split_pos = np.split(pre_pos_df,2927)
        pre_split_neu = np.split(pre_neu_df,2927)
        
        #STORING THE ESTIMATED PROBABILITIES OF LABELS FOR EACH TWEET THIS TIME WITH SUMMING 
        #THE PROBABILITY OF EACH WORD
        negative_prob = []
        for i in range(len(pre_split_neg)):
            negative_prob.append(pre_split_neg[i].sum())
            
        positive_prob = []
        for i in range(len(pre_split_pos)):
            positive_prob.append(pre_split_pos[i].sum())

        neutral_prob = []
        for i in range(len(pre_split_neu)):
            neutral_prob.append(pre_split_neu[i].sum())
            
        total_pos = np.array(positive_prob)
        total_pos += self.logPriorPos
        total_neg = np.array(negative_prob)
        total_neg += self.logPriorNeg
        total_neu = np.array(neutral_prob)
        total_neu += self.logPriorNeu
        #CREATING A DATAFRAME FOR PREDICTED LABELS
        self.dfScoreTable = pd.DataFrame()
        ScoreLabel = []
        for i in range(len(label)):
            if (total_neg[i][0] > total_neu[i][0] and total_neg[i][0] > total_pos[i][0]):
                ScoreLabel.append('negative')
            elif (total_neu[i][0] >= total_neg[i][0] and total_neu[i][0] >= total_pos[i][0]):
                ScoreLabel.append('neutral')
            else:
                ScoreLabel.append('positive')
                
        self.dfScoreTable['negative'] = ScoreLabel
        
        self.score = (self.dfScoreTable['negative'] == label['negative'])
        return self.score.value_counts(1)
    def get_20_most_used(self,vocabulary,label):
        if label == 'positive':
            for index, value in self.nlarge_pos.iteritems():
                print(vocabulary.loc[index][0] )
        elif label == 'negative':
            for index, value in self.nlarge_neg.iteritems():
                print(vocabulary.loc[index][0] )
        elif label == 'neutral':
            for index, value in self.nlarge_neu.iteritems():
                print(vocabulary.loc[index][0] )


In [5]:
mnb = MultinomialNB()

In [6]:
mnb.train(alpha = 1,X = dfTrain, y= dfTrainLabel)

In [7]:
mnb.test(dfTestFeature,dfTestLabel)

True     0.749573
False    0.250427
Name: negative, dtype: float64

In [8]:
mnb1 = MultinomialNB()
mnb1.train(alpha=0, X=dfTrain, y=dfTrainLabel)
mnb1.test(dfTestFeature,dfTestLabel)

True     0.717117
False    0.282883
Name: negative, dtype: float64

In [9]:
class BernoulliBC:
    def __init__(self):
        self.p_of_positive = [] 
        self.p_of_negative = [] 
        self.p_of_neutral = [] 
        self.ber_p_of_pos = []
        self.ber_p_of_neg = []
        self.ber_p_of_neu = []
        self.logPriorPos = 0
        self.logPriorNeg = 0
        self.logPriorNeu = 0
    def train(self,alpha,dfTrain):
        #CHANGING THE COLUMN NAMES FROM 0 TO 5721 FOR EASE OF OPERATION
        #CHANGING THE COLUMN NAMES FROM 0 TO 5721 FOR EASE OF OPERATION
        arr= np.arange(0,5722)
        ar1 = list(arr)
        dfTrain.columns = ar1
        #ALSO ADDING THE LABELS TO FEATURES FOR GROUPING THE DATA
        dfTrain['Labels'] = dfTrainLabel
        #CALCULATING THE WORDS APPEARING IN GIVEN LABEL AND CALCULATING ALL OF THE WORDS IN A GIVEN LABEL
        BernoulliTable = dfTrain[dfTrain <= 1]
        BernoulliTable.fillna(1, inplace = True)
        
        sum_of_positive = BernoulliTable[BernoulliTable['Labels'] == "positive"].sum(numeric_only=True)
        sum_of_negative = BernoulliTable[BernoulliTable['Labels'] == "negative"].sum(numeric_only=True)
        sum_of_neutral = BernoulliTable[BernoulliTable['Labels'] == "neutral"].sum(numeric_only=True)
        
        #CALCULATING THE PROBABILITY OF EACH WORD APPEARING FOR A GIVEN LABEL
        self.p_of_positive = sum_of_positive / dfTrain['Labels'].value_counts()[2]
        self.p_of_positive.fillna(0, inplace = True)

        self.p_of_negative = sum_of_negative / dfTrain['Labels'].value_counts()[0]
        self.p_of_negative.fillna(0, inplace = True)

        self.p_of_neutral = sum_of_neutral / dfTrain['Labels'].value_counts()[1]
        self.p_of_neutral.fillna(0 , inplace = True)

        #CALCULATING THE PRIORS ACCORDING TO THE VALUE COUNT
        self.logPriorPos = math.log(dfTrain['Labels'].value_counts()[2]/(dfTrain['Labels'].value_counts().sum()))
        self.logPriorNeg= math.log(dfTrain['Labels'].value_counts()[0]/(dfTrain['Labels'].value_counts().sum()))
        self.logPriorNeut= math.log(dfTrain['Labels'].value_counts()[1]/(dfTrain['Labels'].value_counts().sum()))
                
        #TO CALCULATE THE PROBABILITY 1-P
        for index, value in self.p_of_positive.items():
            self.ber_p_of_pos.append(1 - self.p_of_positive[index])
            
        for index, value in self.p_of_negative.items():
            self.ber_p_of_neg.append(1 - self.p_of_negative[index])
        
        for index, value in self.p_of_neutral.items():
            self.ber_p_of_neu.append(1 - self.p_of_neutral[index])

        dfTrain.drop(['Labels'], axis=1,inplace = True)
        
    def test(self,dfTestFeature, dfTestLabel):
        BernoulliTest = dfTestFeature[dfTestFeature <= 1]
        BernoulliTest.fillna(1, inplace = True)
        #TESTING FOR POSITIVE, ESTIMATING THE PROBABILITIES OF BEING POSITIVE
        predict_pos = []
        for index,row in BernoulliTest.iterrows():
            indexx = 0
            hold = 1
            for j in row:
                hold = hold *(self.p_of_positive[indexx]*j + ((1-j)*self.ber_p_of_pos[indexx]))  
                indexx +=1
            if hold != 0:
                predict_pos.append(math.log(hold) + self.logPriorPos)
            #practically e^(-10000) is minus infinity
            else : predict_pos.append(-10000)
        
        #TESTING FOR NEUTRAL, ESTIMATING THE PROBABILITIES OF BEING NEUTRAL
        predict_neu = []
        for index,row in BernoulliTest.iterrows():
            indexx = 0
            hold = 1
            for j in row:
                hold = hold * (self.p_of_neutral[indexx]*j + ((1-j)*self.ber_p_of_neu[indexx]))
                indexx += 1
            if hold != 0:
                predict_neu.append(math.log(hold) + self.logPriorNeut)
            #practically e^(-10000) is minus infinity
            else: predict_neu.append(-10000)
    
        
        #TESTING FOR NEGATIVE, ESTIMATING THE PROBABILITIES OF BEING NEGATIVE
        predict_neg = []
        for index,row in BernoulliTest.iterrows():
            indexx = 0
            hold = 1
            for j in row:
                hold = hold * (self.p_of_negative[indexx]*j + ((1-j)*self.ber_p_of_neg[indexx]))
                indexx += 1
            if hold != 0:
                predict_neg.append(math.log(hold) + self.logPriorNeg)
            #practically e^(-10000) is minus infinity
            else: predict_neg.append(-10000)
            
        #CREATING A DATAFRAME FOR PREDICTED LABELS
        dfScoreTable = pd.DataFrame()
        ScoreLabel = []
        for i in range(len(dfTestLabel)):
            if (predict_neg[i] > predict_neu[i] and predict_neg[i] > predict_pos[i]):
                ScoreLabel.append('negative')
            elif (predict_neu[i] >= predict_neg[i] and predict_neu[i] >= predict_pos[i]):
                ScoreLabel.append('neutral')
            else:
                ScoreLabel.append('positive')

        dfScoreTable['negative'] = ScoreLabel

        #COMPARING THE PREDICTED LABELS WITH REAL ONES AND PASS IT INTO ANOTHER OBJECT, COMPARING WILL 
        #RETURN TRUE OR FALSE AT EACH ROW
        score = dfScoreTable['negative'] == dfTestLabel['negative']
        return score.value_counts(1)

In [10]:
bnb = BernoulliBC()

In [11]:
bnb.train(alpha = 0,dfTrain = dfTrain)

In [12]:
bnb.test(dfTestFeature, dfTestLabel) 

True     0.640929
False    0.359071
Name: negative, dtype: float64

In [13]:
#FINDING THE MOST USED 20 WORDS IN TWEETS
dfVocabulary = pd.read_csv('question-4-vocab.txt',sep='\t',header= -1)
dfVocabulary.nlargest(20,columns = 1)

Unnamed: 0,0,1
16,flight,3948
572,@united,3890
2151,@usairways,2998
896,@americanair,2960
300,@southwestair,2455
301,@jetblue,2221
309,cancelled,1065
157,service,963
76,help,872
21,time,791


In [14]:
mnb.get_20_most_used(vocabulary = dfVocabulary, label='neutral')

@jetblue
@united
@southwestair
flight
@usairways
@virginamerica
flights
help
fleek
fleet's
dm
time
tomorrow
flying
cancelled
fly
change
today
travel
check


In [15]:
mnb.get_20_most_used(vocabulary = dfVocabulary, label='negative')

@united
flight
@usairways
@southwestair
@jetblue
cancelled
service
hours
hold
time
customer
help
delayed
plane
hour
flights
bag
gate
late
flightled


In [16]:
mnb.get_20_most_used(vocabulary = dfVocabulary, label='positive')

@southwestair
@jetblue
@united
flight
@usairways
great
@virginamerica
service
love
best
guys
customer
time
awesome
help
airline
amazing
today
fly
flying


In [3]:
mb = MultinomialNB()

In [4]:
mb.fit(dfTrain,dfTrainLabel)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [5]:
mb.score(dfTestFeature,dfTestLabel)

0.7529894089511445