We thought that finding the co-occurrences of words in the review with negative or positive words from the corpus would be a good way to find out exactly which parts of the purchased product make it good or bad. For example, in a hot sauce review, it would be helpful for manufacturers and consumers alike to understand if it's the flavor, packaging, or something else that make it popular or unpopular.

In [1]:
import nltk
import pickle


with open('lemmatized.txt', 'rb') as file:
    lemmedreviews = pickle.load(file)
        
        
print(lemmedreviews[:5])

[['bought-NOUN', 'several-ADJ', 'vitality-NOUN', 'can-VERB', 'dog-NOUN', 'food-NOUN', 'product-NOUN', 'found-NOUN', 'good-ADJ', 'product-NOUN', 'look-NOUN', 'like-ADP', 'stew-NOUN', 'process-VERB', 'meat-NOUN', 'smell-NOUN', 'labrador-NOUN', 'finicky-NOUN', 'appreciates-NOUN', 'product-NOUN', 'well-ADV'], ['product-NOUN', 'arrive-VERB', 'label-VERB', 'jumbo-ADJ', 'salt-VERB', 'peanut-NOUN', 'actually-ADV', 'small-ADJ', 'size-VERB', 'not-NEGATION', 'not-NEGATION', 'sure-NOUN', 'error-NOUN', 'vendor-NOUN', 'intend-VERB', 'represent-NOUN', 'product-NOUN'], ['confection-NOUN', 'around-ADP', 'pillowy-NOUN', 'citrus-NOUN', 'gelatin-NOUN', 'nut-NOUN', 'case-NOUN', 'cut-NOUN', 'tiny-ADJ', 'square-NOUN', 'liberally-ADV', 'coat-VERB', 'powder-VERB', 'tiny-ADJ', 'mouthful-NOUN', 'not-NEGATION', 'not-NEGATION', 'highly-ADV', 'recommend-NOUN', 'yummy-NOUN', 'familiar-ADJ', 'story-NOUN', 'treat-NOUN', 'seduces-NOUN', 'edmund-NOUN', 'sell-VERB', 'brother-NOUN', 'sister-NOUN'], ['look-VERB', 'secret-N

In [2]:
#find co-occurences 
from collections import Counter
span = 3
cooccs_stem_surface = Counter()


for sentence in lemmedreviews:
    for i,w in enumerate(sentence):
        #check all co-occurring words within a 3 word span 
        span_range = list(range(max(i- span, 0), i)) 
        span_range.extend(range(i+1, min(i + span + 1, len(sentence)))) 
        for cw in [sentence[idx] for idx in span_range]:
            if cw != w and (cw, w) not in cooccs_stem_surface:
                cooccs_stem_surface[(w, cw)] += 1
    
print(cooccs_stem_surface.most_common(10))

[(('kettle-NOUN', 'chip-NOUN'), 109), (('not-NEGATION', 'like-ADP'), 101), (('chip-NOUN', 'potato-NOUN'), 90), (('not-NEGATION', 'chip-NOUN'), 69), (('taste-NOUN', 'not-NEGATION'), 61), (('brand-NOUN', 'chip-NOUN'), 53), (('would-VERB', 'not-NEGATION'), 52), (('not-NEGATION', 'good-ADJ'), 51), (('like-ADP', 'taste-NOUN'), 48), (('kettle-NOUN', 'brand-NOUN'), 45)]


In [12]:
#perform sentiment analysis
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn 

#TODO:
#what is the mapping for ADP?
#handle negation
#graphing/visual analysis 

#create mapping for proper sentiment analysis 
mapping = {'NOUN':wn.NOUN, "ADJ": wn.ADJ, 'VERB' : wn.VERB}

#make synsets out of tuples
list_cooccs = list(cooccs_stem_surface)

#dictionary to store tuples of co-occurences and their positivity 
wordvals = {}

#give items new mapping for use with senti_synsets
for tup in list_cooccs:
    #get individual words out of tuple
    w1 = tup[0].split("-")
    w2 = tup[1].split("-")
    
    #check if first word in tuple is contained within mapping
    if w1[1] in mapping.keys():
        #extract part of speech
        one_pos = mapping[w1[1]]
        #extract the word itself 
        word1 = w1[0]
        
        #test that synsets exist 
        if len(list(swn.senti_synsets(word1, pos = one_pos))) > 0:
            scores = swn.senti_synset(word1 + "."+ one_pos + ".01")
            #calculate positive and negative scores for word 1 
            positive_score = scores.pos_score()
            negative_score = scores.neg_score() 
            
    #repeat for second word in tuple         
    if w2[1] in mapping.keys():
        #extract part of speech 
        two_pos = mapping[w2[1]]
        word2 = w2[0]
        
        if len(list(swn.senti_synsets(word2, pos = two_pos))) > 0:
            scores2 = swn.senti_synset(word2 + "."+ two_pos + ".01")
            #calculate positive and negative scores for word 1 
            positive_score2 = scores2.pos_score()
            negative_score2 = scores2.neg_score() 
            
    #store word that co-occurs with the more polar word of the tuple in dict 
    if (positive_score + negative_score) > (positive_score2 + negative_score2):
        #find if the word is more negative or more positive 
        if positive_score > negative_score:
            #store word with score of other word in the tuple 
            wordvals[word2] = positive_score 
        else:
            wordvals[word2] = (-1 * negative_score)
            
    elif (positive_score + negative_score) < (positive_score2 + negative_score2):
        if positive_score > negative_score:
            wordvals[word1] = positive_score2 
        else:
            wordvals[word1] = (-1 * negative_score2)

{'bought': -0.25, 'can': -0.25, 'dog': -0.25, 'food': -0.375, 'product': 0.125, 'found': -0.0, 'look': -0.0, 'process': -0.75, 'meat': -0.0, 'smell': -0.25, 'arrive': -0.25, 'label': 0.5, 'salt': -0.0, 'peanut': -0.0, 'size': -0.0, 'sure': -0.125, 'vendor': -0.5, 'intend': -0.125, 'represent': -0.625, 'pillowy': -0.125, 'citrus': -0.0, 'gelatin': -0.125, 'case': -0.0, 'cut': 0.5, 'tiny': -0.0, 'mouthful': 0.375, 'recommend': -0.0, 'yummy': -0.125, 'story': -0.375, 'treat': -0.0, 'seduces': 0.25, 'ingredient': -0.0, 'robitussin': -0.0, 'believe': -0.875, 'addition': 0.125, 'root': -0.125, 'beer': -0.0, 'great': 0.5, 'taffy': -0.25, 'assortment': -0.0, 'delivery': -0.0, 'get': -0.25, 'hair': -0.25, 'order': 0.25, 'pound': -0.125, 'many': -0.25, 'bit': 0.25, 'much': 0.125, 'piece': 0.75, 'flavor': 0.5, 'candy': -0.125, 'wrap': -0.625, 'stuck': -0.0, 'happen': -0.125, 'serve': -0.0, 'party': -0.0, 'everyone': -0.125, 'soft': 0.375, 'would': -0.125, 'sprout': -0.0, 'cat': -0.125, 'eat': -0.