We thought that finding the co-occurrences of words in the review with negative or positive words from the corpus would be a good way to find out exactly which parts of the purchased product make it good or bad. For example, in a hot sauce review, it would be helpful for manufacturers and consumers alike to understand if it's the flavor, packaging, or something else that make it popular or unpopular. We could use this both to be able to predict the number of stars that an Amazon review will recieve and to indicate to customers what the best parts of a product are.

In [3]:
import nltk
import pickle
import random

with open('training_dicts.txt', 'rb') as file:
    lemmedreviews = pickle.load(file)
    
#make lists for each class in training set 
one_star = lemmedreviews[1]
two_star = lemmedreviews[2]
three_star = lemmedreviews[3]
four_star = lemmedreviews[4]
five_star = lemmedreviews[5]

In [4]:
#find co-occurences 
from collections import Counter

def find_cooccs(lst):
    span = 3
    cooccs_stem_surface = Counter()

    for sentence in lst:
        for i,w in enumerate(sentence):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(sentence)))) 
            for cw in [sentence[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_stem_surface:
                    cooccs_stem_surface[(w, cw)] += 1
    
    print(cooccs_stem_surface.most_common(10))
    return cooccs_stem_surface 

#apply to every class 
one_cooccs = find_cooccs(one_star)
two_cooccs = find_cooccs(two_star)
three_cooccs = find_cooccs(three_star)
four_cooccs = find_cooccs(four_star)
five_cooccs = find_cooccs(five_star)

[(('taste-NOUN', 'like-ADP'), 88), (('not-NEGATION', 'would-VERB'), 73), (('not-NEGATION', 'product-NOUN'), 70), (('not-NEGATION', 'like-ADP'), 65), (('taste-NOUN', 'not-NEGATION'), 51), (('not-NEGATION', 'even-ADV'), 46), (('not-NEGATION', 'buy-VERB'), 36), (('not-NEGATION', 'good-ADJ'), 32), (('taste-VERB', 'like-ADP'), 29), (('not-NEGATION', 'worth-NOUN'), 27)]
[(('like-ADP', 'taste-NOUN'), 66), (('like-ADP', 'not-NEGATION'), 56), (('not-NEGATION', 'taste-NOUN'), 54), (('would-VERB', 'not-NEGATION'), 37), (('like-ADP', 'coffee-NOUN'), 37), (('not-NEGATION', 'good-ADJ'), 35), (('not-NEGATION', 'product-NOUN'), 25), (('not-NEGATION', 'flavor-NOUN'), 24), (('like-ADP', 'really-ADV'), 23), (('not-NEGATION', 'worth-NOUN'), 22)]
[(('taste-NOUN', 'like-ADP'), 88), (('would-VERB', 'not-NEGATION'), 65), (('not-NEGATION', 'taste-NOUN'), 61), (('not-NEGATION', 'like-ADP'), 60), (('coffee-NOUN', 'not-NEGATION'), 58), (('not-NEGATION', 'good-ADJ'), 53), (('not-NEGATION', 'really-ADV'), 50), (('o

Find the most "polarized" combinations of words per class

In [26]:
#perform sentiment analysis
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn 
import operator 
import collections

#create mapping for proper sentiment analysis 
mapping = {'NOUN':wn.NOUN, "ADJ": wn.ADJ, 'VERB' : wn.VERB}

def find_vals(lst):
    #make synsets out of tuples
    list_cooccs = lst
    word1, word2 = "", ""
    positive_score, negative_score, positive_score2, negative_score2 = 0,0,0,0
    #dictionary to store tuples of co-occurences and their positivity 
    wordvals = {}

    #give items new mapping for use with senti_synsets
    for tup in list_cooccs:
        #get individual words out of tuple
        w1 = tup[0].split("-")
        w2 = tup[1].split("-")
    
        #check if first word in tuple is contained within mapping
        if w1[1] in mapping.keys() and mapping[w1[1]] == 'a':
            #extract part of speech
            one_pos = mapping[w1[1]]
            #extract the word itself 
            word1 = w1[0]
        
            #test that synsets exist 
            if len(list(swn.senti_synsets(word1, pos = one_pos))) > 0:
                scores = swn.senti_synset(word1 + "."+ one_pos + ".01")
                #calculate positive and negative scores for word 1 
                positive_score = scores.pos_score()
                negative_score = scores.neg_score()
            
        #repeat for second word in tuple         
        if w2[1] in mapping.keys() and mapping[w2[1]] == 'n':
            #extract part of speech 
            two_pos = mapping[w2[1]]
            word2 = w2[0]
        
            if len(list(swn.senti_synsets(word2, pos = two_pos))) > 0:
                scores2 = swn.senti_synset(word2 + "."+ two_pos + ".01")
                #calculate positive and negative scores for word 1 
                positive_score2 = scores2.pos_score()
                negative_score2 = scores2.neg_score() 
            
        #store tuples and positivity values in a dictionary
        tupl = (word1, word2)
        wordvals[tupl] = (positive_score + negative_score + positive_score2 + negative_score2)

    #sort the dictionary of positive and negative values from largest to smallest 
    sorted_wordvals = list(reversed(sorted(wordvals.items(), key=operator.itemgetter(1))))
    #sorted_dict = collections.OrderedDict(reversed(sorted_wordvals))
    return sorted_wordvals

one_vals = find_vals(one_cooccs)
two_vals = find_vals(two_cooccs)
three_vals = find_vals(three_cooccs)
four_vals = find_vals(four_cooccs)
five_vals = find_vals(five_cooccs)


In [28]:
print(one_vals[10:20])
print(two_vals[10:20])
print(three_vals[10:20])
print(four_vals[10:20])
print(five_vals[10:20])

[(('good', 'love'), 1.375), (('solid', 'mxl'), 1.25), (('solid', 'beg'), 1.25), (('solid', 'quality'), 1.25), (('different', 'find'), 1.25), (('different', 'health'), 1.25), (('good', 'help'), 1.25), (('diet', 'buy'), 1.25), (('healthy', 'buy'), 1.25), (('good', 'worthy'), 1.25)]
[(('best', 'love'), 1.375), (('good', 'love'), 1.375), (('good', 'something'), 1.375), (('delicious', 'love'), 1.375), (('iced', 'help'), 1.25), (('different', 'love'), 1.25), (('different', 'wellness'), 1.25), (('good', 'help'), 1.25), (('positive', 'truth'), 1.25), (('different', 'find'), 1.25)]
[(('good', 'love'), 1.375), (('happy', 'hand'), 1.375), (('responsible', 'health'), 1.25), (('necessary', 'health'), 1.25), (('good', 'orangina'), 1.25), (('nice', 'fun'), 1.25), (('healthy', 'nature'), 1.25), (('old', 'mom'), 1.25), (('major', 'advantage'), 1.25), (('vegetable', 'wish'), 1.25)]
[(('nice', 'passion'), 1.375), (('nice', 'skype'), 1.375), (('resemble', 'love'), 1.375), (('best', 'know'), 1.375), (('nic

We're going to find the PLMI of the co-occurences per class to use as a feature for classification 

In [30]:
#plmi measure
from itertools import chain
from math import log


def ppmi(o_11, r_1, c_1, n):
    """
    Positive Pointwise Mutual Information (Church & Hanks, 1990)
    
    PMI is also available in NLTK:
    from nltk.metrics import BigramAssocMeasures
    print BigramAssocMeasures.pmi(8, (15828, 4675), 14307668)
    """
    observed = o_11
    expected = (r_1*c_1)/n 
    res = log(observed/expected,2)
    return max(0, res)

def plmi(o_11, r_1, c_1, n):
    """
    Positive Local Mutual Information, useful for leveraging the 
    low-frequency bias of the PPMI
    """
    res = o_11 * ppmi(o_11, r_1, c_1, n)
    return res

def find_plmi(cooccs_dict, lst):
    N = len(cooccs_dict.values())
    plmis_stem_surface = Counter()
    stemmed_frequencies = Counter(chain(*lst))


    for k,v in cooccs_dict.items():
        plmis_stem_surface[k] = plmi(v, stemmed_frequencies[k[0]], stemmed_frequencies[k[1]], N)

    print(plmis_stem_surface.most_common(10))
    return plmis_stem_surface

In [31]:
one_plmi = find_plmi(one_cooccs, one_star)
two_plmi =  find_plmi(two_cooccs, two_star)
three_plmi = find_plmi(three_cooccs, three_star)
four_plmi = find_plmi(four_cooccs, four_star)
five_plmi = find_plmi(five_cooccs, five_star)

[(('taste-NOUN', 'like-ADP'), 498.21421039440526), (('not-NEGATION', 'would-VERB'), 328.187444872234), (('not-NEGATION', 'product-NOUN'), 315.9453669160056), (('not-NEGATION', 'like-ADP'), 245.1104691952661), (('not-NEGATION', 'even-ADV'), 206.80891232686838), (('taste-NOUN', 'not-NEGATION'), 203.19398438910855), (('wolfgang-NOUN', 'puck-NOUN'), 174.8994910957734), (('customer-NOUN', 'service-NOUN'), 169.3327182410485), (('bad-ADJ', 'ever-ADV'), 168.73227297299397), (('taste-VERB', 'like-ADP'), 165.09501887758555)]
[(('like-ADP', 'taste-NOUN'), 346.7954123964976), (('like-ADP', 'not-NEGATION'), 223.73486055771255), (('not-NEGATION', 'taste-NOUN'), 223.2276969716608), (('like-ADP', 'coffee-NOUN'), 199.5862918162832), (('would-VERB', 'not-NEGATION'), 159.49068237603507), (('not-NEGATION', 'good-ADJ'), 158.6901697871882), (('hot-ADJ', 'chocolate-NOUN'), 152.92386027809886), (('bake-VERB', 'lay-NOUN'), 146.25097065400752), (('not-NEGATION', 'worth-NOUN'), 134.97016198705478), (('like-ADP',

We can also use the average positivity and negativity of each class to use as features as well.

In [55]:
#find average positivity + negativity for each class
def find_senti(lst):
    avg_positivity = 0
    avg_negativity = 0
    total_words = 0
    
    for sublist in lst:
        #find total num words in each class for calculating the average
        total_words += len(sublist)
        for word in sublist:
            #split item into individual word and lemma 
            trunc_word = word.split("-")
            #TODO: handle negation!! this is a band-aid  
            if trunc_word[1] not in mapping:
                continue
            #get the right part of speech from predefined mapping
            new_pos = mapping[trunc_word[1]]
            if len(list(swn.senti_synsets(trunc_word[0], pos = new_pos))) > 0:
                scores = swn.senti_synset(trunc_word[0] + "."+ new_pos + ".01")
                #find senti-wordnet's positivity and negativity rating for each word in each class  
                avg_positivity += scores.pos_score()
                avg_negativity += scores.neg_score() 
                

    return (avg_positivity/total_words)*100, (avg_negativity/total_words)*100

In [56]:
find_senti(one_star)

(3.6588333878351866, 3.4843157292347944)

In [57]:
find_senti(five_star)

(5.295702755816919, 2.74709716617411)