## Machine learning with positivity and negativity scores

This notebook contains a Naive Bayes model trained on the positivity and negativity scores per review.

In [1]:
import pickle
import random
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from sklearn import naive_bayes, metrics
from itertools import chain
from math import log
from nltk import BigramAssocMeasures
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn 
import operator 

#importing the test and train dataset
with open('test_dicts.txt', 'rb') as file:
    test_lemmedreviews = pickle.load(file)
    
with open('training_dicts.txt', 'rb') as file:
    train_lemmedreviews = pickle.load(file)

In [2]:
mapping = {'NOUN':wn.NOUN, "ADJ": wn.ADJ, 'VERB' : wn.VERB, "ADV": wn.ADV}

def find_senti(lst):
    """Finds the positivity and negativity score per review"""
    avg_review = []
    for sublist in lst:
        avg_positivity, avg_negativity, total_words = 0, 0, 0
        #find total num words in each class for calculating the average
        total_words = len(sublist)
        for word in sublist:
            #split item into individual word and lemma 
            trunc_word = word.split("-")
            if trunc_word[1] not in mapping:
                continue
            #get the right part of speech from predefined mapping
            new_pos = mapping[trunc_word[1]]
            if len(list(swn.senti_synsets(trunc_word[0], pos = new_pos))) > 0:
                scores = swn.senti_synset(trunc_word[0] + "."+ new_pos + ".01")
                #find senti-wordnet's positivity and negativity rating for each word in each class  
                avg_positivity += scores.pos_score()
                avg_negativity += scores.neg_score() 
        avg_tupl = ((avg_positivity/total_words), (avg_negativity/total_words))
        avg_review.append(avg_tupl)
                
                

    return avg_review

In [3]:
#For all reviews in the training set find their positivity and negativity scores
all_scores=[]
for i in range(1,6): 
    all_scores += find_senti(train_lemmedreviews[i])
 
#Create the label vector
labelsVec = np.zeros((sum([len(v) for v in train_lemmedreviews.values()])))
docId = 0
for score in range(1, 6):
    for rev in train_lemmedreviews[score]:
        labelsVec[docId] = score
        docId += 1

In [5]:
#Create the model based on the positivity and negativity scores
clf = naive_bayes.MultinomialNB()
clf.fit(all_scores, labelsVec)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
#Create the test scores and the golden standard
all_scores_test=[]
for i in range(1,6): 
    all_scores_test += find_senti(test_lemmedreviews[i])   

goldStandard = np.zeros((sum([len(v) for v in test_lemmedreviews.values()])))
docId = 0
for score in range(1, 6):
    for rev in test_lemmedreviews[score]:
        goldStandard[docId] = score
        docId += 1

In [8]:
#Create a prediction
predicted = clf.predict(all_scores_test)

In [9]:
# accuracy
print("accuracy:", metrics.accuracy_score(predicted, goldStandard))

# precision, recall and f-measure
print("precision:", metrics.precision_score(predicted, goldStandard, average='macro'))
print("recall:", metrics.recall_score(predicted, goldStandard, average='macro'))
print("f1-measure:", metrics.f1_score(predicted, goldStandard, average='macro'))

accuracy: 0.6115
precision: 0.2
recall: 0.1223
f1-measure: 0.15178405212534907


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
