We thought that finding the co-occurrences of words in a review with negative or positive words from the corpus would be a good way to find out exactly which aspects of the purchased product make it good or bad. For example, in a hot sauce review, it would be helpful for manufacturers and consumers alike to understand whether it is the flavor, the packaging, or something else that makes the product popular or unpopular.

In [23]:
import nltk
import pickle


# Read back the pickled review data. Despite the .txt extension the file is
# opened in binary mode and decoded with pickle.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# files produced by this project.
with open('lemmatized.txt', mode='rb') as file:
    lemmedreviews = pickle.load(file)

# Show the first five entries as a quick sanity check.
print(lemmedreviews[0:5])

[['bought-NOUN', 'several-ADJ', 'vitality-NOUN', 'can-VERB', 'dog-NOUN', 'food-NOUN', 'product-NOUN', 'found-NOUN', 'good-ADJ', 'product-NOUN', 'look-NOUN', 'like-ADP', 'stew-NOUN', 'process-VERB', 'meat-NOUN', 'smell-NOUN', 'labrador-NOUN', 'finicky-NOUN', 'appreciates-NOUN', 'product-NOUN', 'well-ADV'], ['product-NOUN', 'arrive-VERB', 'label-VERB', 'jumbo-ADJ', 'salt-VERB', 'peanut-NOUN', 'actually-ADV', 'small-ADJ', 'size-VERB', 'not-NEGATION', 'not-NEGATION', 'sure-NOUN', 'error-NOUN', 'vendor-NOUN', 'intend-VERB', 'represent-NOUN', 'product-NOUN'], ['confection-NOUN', 'around-ADP', 'pillowy-NOUN', 'citrus-NOUN', 'gelatin-NOUN', 'nut-NOUN', 'case-NOUN', 'cut-NOUN', 'tiny-ADJ', 'square-NOUN', 'liberally-ADV', 'coat-VERB', 'powder-VERB', 'tiny-ADJ', 'mouthful-NOUN', 'not-NEGATION', 'not-NEGATION', 'highly-ADV', 'recommend-NOUN', 'yummy-NOUN', 'familiar-ADJ', 'story-NOUN', 'treat-NOUN', 'seduces-NOUN', 'edmund-NOUN', 'sell-VERB', 'brother-NOUN', 'sister-NOUN'], ['look-VERB', 'secret-N

In [22]:
# Find co-occurrences: count how often two different tokens appear within
# `span` positions of each other inside the same review.
from collections import Counter
span = 3
cooccs_stem_surface = Counter()


for review in lemmedreviews:
    review_len = len(review)
    for pos, word in enumerate(review):
        # Window bounds, clipped to the start and end of the review.
        lo = max(pos - span, 0)
        hi = min(pos + span + 1, review_len)
        # Neighbour positions: everything left of `pos`, then everything
        # right of it (the token itself is excluded).
        neighbour_positions = [*range(lo, pos), *range(pos + 1, hi)]
        for j in neighbour_positions:
            neighbour = review[j]
            if neighbour == word:
                continue
            # If the reversed pair is already a key, do not start a second
            # entry for the same two tokens in the opposite order.
            if (neighbour, word) in cooccs_stem_surface:
                continue
            cooccs_stem_surface[(word, neighbour)] += 1

# Ten most frequent pairs.
print(cooccs_stem_surface.most_common(10))

[(('kettle-NOUN', 'chip-NOUN'), 109), (('not-NEGATION', 'like-ADP'), 101), (('chip-NOUN', 'potato-NOUN'), 90), (('not-NEGATION', 'chip-NOUN'), 69), (('taste-NOUN', 'not-NEGATION'), 61), (('brand-NOUN', 'chip-NOUN'), 53), (('would-VERB', 'not-NEGATION'), 52), (('not-NEGATION', 'good-ADJ'), 51), (('like-ADP', 'taste-NOUN'), 48), (('kettle-NOUN', 'brand-NOUN'), 45)]


In [147]:
#perform sentiment analysis
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn 

#TODO:
#what is the mapping for ADP?
#how to handle negation?

# Translate the POS suffix attached to each token (the part after "-") into
# the corresponding WordNet POS constant. Only NOUN, ADJ and VERB have an
# entry; any other suffix is absent from this table.
mapping = dict(NOUN=wn.NOUN, ADJ=wn.ADJ, VERB=wn.VERB)

# Materialise the Counter's keys, i.e. the (token, token) tuples, as a list.
list_cooccs = [*cooccs_stem_surface]

# Starts empty; nothing in this block fills it.
wordvals = dict()

#give items new mapping for use with senti_synsets
for tup in list_cooccs:
    w1 = tup[0].split("-")
    w2 = tup[1].split("-")
    
    if w1[1] in mapping.keys():
        new_pos = mapping[w1[1]]
        word = w1[0]
        #w1_test = swn.senti_synsets(word, pos = new_pos)
        if len(list(swn.senti_synsets(word, pos = new_pos))) > 0:
            scores = swn.senti_synset(word+"."+new_pos+".01")
            positive_score = scores.pos_score()
            negative_score = scores.neg_score()
                