In [None]:
'''
This script runs a number of experiments on the dataset, as described below; mainly
sentiment analysis of the contexts (sentences) in which the provided terms appear.
'''

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
# Load the pickled review table used by every cell below.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
tagged_reviews = pd.read_pickle(
    "feature_extracted_reviews.pkl"
)

# Feature terms to look for; they are matched as plain substrings of the review text.
elems = [
    " chair",
    " table",
    " sofa",
    " couch",
    " art",
    " window",
    " decor ",
    " wall",
]

In [None]:
# 1. what words immediately precede feature words?
def _preceding_word(review, elem):
    """Return the word right before the first occurrence of `elem` in `review`.

    The result keeps a single leading space (e.g. " wooden"), which is the
    form the counters have always used as keys. Returns None when `elem` is
    absent or when nothing precedes it.
    """
    idx = review.find(elem)
    if idx <= 0:
        # Not found (-1) or the review starts with the term: no preceding word.
        return None
    # Drop the space(s) separating the preceding word from the feature term.
    before = review[:idx].rstrip(" ")
    if not before:
        return None
    # rfind gives -1 when the preceding word opens the review, so the slice
    # starts at 0 instead of wrapping around to the end of the string.
    return " " + before[before.rfind(" ") + 1:]


freqs = []

for elem in elems:
    # regex=False: the terms are literal substrings, same as review.find above.
    rel_reviews = tagged_reviews.text[tagged_reviews.text.str.contains(elem, regex=False)]
    pre_words = [_preceding_word(review, elem) for review in rel_reviews]

    # One Counter per term, in the same order as elems.
    c = Counter(word for word in pre_words if word is not None)
    freqs.append(c)

In [None]:
# 2. what is the specific context (sentence) in which the key features are being mentioned?

from nltk.tokenize import sent_tokenize

# contexts[i] holds every sentence that contains elems[i].
contexts = []
for elem in elems:
    # Only tokenize reviews that mention the term at all.
    mentions_elem = tagged_reviews.text.str.contains(elem)
    contexts.append([
        sentence
        for review in tagged_reviews.text[mentions_elem]
        for sentence in sent_tokenize(review.decode('utf8'))
        if elem in sentence
    ])

In [None]:
# concatenates the generated contexts into a dataframe where the columns are the words investigated
# One Series per term, paired with its sentence list via zip (contexts is
# ordered like elems). dtype=object keeps empty columns from defaulting to float.
context_columns = [
    pd.Series(elem_contexts, name=elem, dtype=object)
    for elem, elem_contexts in zip(elems, contexts)
]

# Single concat instead of growing the frame inside a loop; columns of
# different lengths are aligned on the row index and padded with NaN.
contexts_df = pd.concat(context_columns, axis=1)

contexts_df.to_pickle("./feature_contexts.pkl")

In [None]:
# Share of the sentences collected for one feature term that also mention a second term.
def _share(hits, total):
    """Return hits/total as a float, or nan when there is nothing to divide by."""
    return hits / float(total) if total else float("nan")


# contexts is ordered like elems, so look the positions up instead of hard-coding 0/5/6.
chair_contexts = contexts[elems.index(" chair")]
window_contexts = contexts[elems.index(" window")]
decor_contexts = contexts[elems.index(" decor ")]

count = sum(1 for line in chair_contexts if " plastic" in line)
# NOTE(review): this used to count " light" over the chair sentences while
# dividing by the number of window sentences; counting over the window
# sentences is assumed to be the intent -- confirm.
count1 = sum(1 for line in window_contexts if " light" in line)

print(_share(count, len(chair_contexts)))
print(_share(count1, len(window_contexts)))
print(len(decor_contexts))

In [None]:
# Dump every sentence stored at position 7 of contexts, each followed by a separator line.
for sentence in contexts[7]:
    print(sentence)
    print("-------")

In [None]:
# make a list of businesses that mention the selected word
mentions_window = tagged_reviews.text.str.contains(" window")
places = list(tagged_reviews.business_name[mentions_window])

# How many matching reviews each business has.
c = Counter(places)
display(c)

In [None]:
# produce the average sentiment of the contexts where the selected word appears
# the word itself isn't removed from the sentence when analyzing

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

neg = []
pos = []
neu = []
com = []
for sentence in contexts_df[" window"].dropna():
    #print sentence 
    ss = sid.polarity_scores(sentence)
    neg.append(ss["neg"])
    pos.append(ss["pos"])
    neu.append(ss["neu"])
    com.append(ss["compound"])
    
print "negative", np.mean(neg)
print "positive", np.mean(pos)
print "neutral", np.mean(neu)
print "compound", np.mean(com)

In [None]:
# produce the average sentiment of all the sentences in all the reviews
# this can be used as a baseline instead of 0 because the dataset is biased
neg = []
pos = []
neu = []
com = []
for review in tagged_reviews.text:
    #print sentence 
    sentences = sent_tokenize(review.decode('utf8'))
    for sentence in sentences:
        #if " the" in sentence:
        ss = sid.polarity_scores(sentence)
        neg.append(ss["neg"])
        pos.append(ss["pos"])
        neu.append(ss["neu"])
        com.append(ss["compound"])
    
print "negative", np.mean(neg)
print "positive", np.mean(pos)
print "neutral", np.mean(neu)
print "compound", np.mean(com)

In [None]:
# same process for the word 'wonderful'
neg = []
pos = []
neu = []
com = []
for review in tagged_reviews.text[tagged_reviews.text.str.contains(" wonderful")]:
    #print sentence 
    sentences = sent_tokenize(review.decode('utf8'))
    for sentence in sentences:
        if " wonderful" in sentence:
            ss = sid.polarity_scores(sentence)
            neg.append(ss["neg"])
            pos.append(ss["pos"])
            neu.append(ss["neu"])
            com.append(ss["compound"])
    
print "negative", np.mean(neg)
print "positive", np.mean(pos)
print "neutral", np.mean(neu)
print "compound", np.mean(com)

In [None]:
# same process for the word 'plastic'
neg = []
pos = []
neu = []
com = []
for review in tagged_reviews.text[tagged_reviews.text.str.contains(" plastic ")]:
    #print sentence 
    sentences = sent_tokenize(review.decode('utf8'))
    for sentence in sentences:
        if " plastic " in sentence:
            ss = sid.polarity_scores(sentence)
            neg.append(ss["neg"])
            pos.append(ss["pos"])
            neu.append(ss["neu"])
            com.append(ss["compound"])
    
print "negative", np.mean(neg)
print "positive", np.mean(pos)
print "neutral", np.mean(neu)
print "compound", np.mean(com)

In [None]:
# list of terms to be investigated in a similar process
# Terms are matched as plain substrings further down.
feature_words = [
    " chair",
    " table",
    " sofa",
    " couch",
    " art",
    " window",
    " decor ",
    " wall",
    " natural light",
    " water",
    " fountain",
    " stairs",
    " staircase",
    " internet",
    " local art",
    " music",
    " big table",
    " small table",
]
adjectives = [
    " cozy",
    " comfy",
    " comfortable",
    " clean",
    " dirty",
    " nois",  # presumably a stem covering "noise"/"noisy" -- confirm
    " quiet",
]

# Nouns first, then adjectives.
keywords = feature_words + adjectives

In [None]:
# find the average sentiment of all the sentences in the corpus that contained each
# of the terms provided above
compound_scores = []

for word in keywords:
    reviews_with_word = tagged_reviews.text[tagged_reviews.text.str.contains(word)]
    # Compound score of every sentence (not whole review) that contains the term.
    com = [
        sid.polarity_scores(sentence)["compound"]
        for review in reviews_with_word
        for sentence in sent_tokenize(review.decode('utf8'))
        if word in sentence
    ]
    compound_scores.append(np.mean(com))

display(compound_scores)

In [None]:
# plot these sentiments
# One horizontal bar per keyword, in keyword order.
y_pos = list(range(len(keywords)))

plt.barh(y_pos, compound_scores)
plt.yticks(y_pos, keywords)
plt.xlabel("compound score")
plt.ylabel("word")

# Red reference line at a hard-coded value; presumably the corpus-wide mean
# compound score printed by the baseline cell -- confirm.
plt.axvline(x=0.2809892771562327, color='red')
plt.xlim(-0.3, 0.7)
plt.show()

In [None]:
# control analysis: in this test, we follow the same earlier procedures, except that we
# replace the terms with neutral ones according to whether they are nouns or adjectives
# This is to cancel the effect of the word itself on the sentiment when we care more about the context
def _neutral_replacement(word, placeholder):
    """Return `placeholder` followed by whatever trailing whitespace `word` carries.

    Terms such as " decor " end with a space; substituting a bare placeholder
    for them would glue the placeholder onto the following word.
    """
    trailing = word[len(word.rstrip()):]
    return placeholder + trailing


def _control_compound_mean(word, placeholder):
    """Mean compound score of the sentences containing `word`, scored after
    each occurrence of `word` has been replaced by `placeholder`.
    """
    replacement = _neutral_replacement(word, placeholder)
    com = [
        sid.polarity_scores(sentence.replace(word, replacement))["compound"]
        for review in tagged_reviews.text[tagged_reviews.text.str.contains(word)]
        for sentence in sent_tokenize(review.decode('utf8'))
        if word in sentence
    ]
    return np.mean(com)


# Same order as keywords (feature_words, then adjectives).
compound_scores1 = []

# nouns
for word in feature_words:
    compound_scores1.append(_control_compound_mean(word, " chair"))

# adjectives
for word in adjectives:
    compound_scores1.append(_control_compound_mean(word, " blue"))

display(compound_scores1)

In [None]:
# plot the control analysis
# One horizontal bar per keyword, in keyword order.
y_pos = list(range(len(keywords)))

# Plot the control scores (compound_scores1), not the original compound_scores.
plt.barh(y_pos, compound_scores1)
plt.ylabel("word")
plt.xlabel("compound score")

plt.yticks(y_pos, keywords)
# Red reference line at a hard-coded value; presumably the corpus-wide mean
# compound score printed by the baseline cell -- confirm.
plt.axvline(x=0.2809892771562327, color='red')
plt.xlim(-0.3, 0.7)

plt.show()

In [None]:
# merge both of the earlier plots + the number of reviews that mention each term
# (these are review counts, not sentence counts)
counts = [
    tagged_reviews.text[tagged_reviews.text.str.contains(word)].count()
    for word in keywords
]

print(counts)

y_pos = list(range(len(keywords)))

fig, ax = plt.subplots()
width = 0.35

# Two bars per keyword: original scores, and control scores shifted right by one bar width.
p1 = ax.bar(y_pos, compound_scores, width)
p2 = ax.bar(np.add(y_pos, width), compound_scores1, width)

ax.set_title('Average sentiment of sentences containing key words')
# Vertical bars: the terms run along x and the sentiment along y.
ax.set_xlabel('words')
ax.set_ylabel('sentiment')

ax.legend((p1[0], p2[0]), ('Original', 'Replaced'))
plt.xticks(y_pos, keywords, rotation='vertical')
# Red reference line at a hard-coded value; presumably the corpus-wide mean
# compound score printed by the baseline cell -- confirm.
plt.axhline(y=0.2809892771562327, color='red')

plt.show()

# Second figure: number of matching reviews per keyword, with a reference line at 20.
plt.scatter(y_pos, counts)
plt.xticks(y_pos, keywords, rotation='vertical')
plt.axhline(y=20)
plt.show()

In [None]:
# produces a list of all the words that co-occured with the selected word with counts
from nltk.tokenize import word_tokenize

all_words = []

for word in [" window"]:
    for review in tagged_reviews.text[tagged_reviews.text.str.contains(word)]:
        for sentence in sent_tokenize(review.decode('utf8')):
            if word in sentence:
                # Extend in place: rebuilding the list with `+` copied every
                # previously collected token on each sentence.
                all_words.extend(word_tokenize(sentence))

# Token -> number of occurrences across the matching sentences.
c = Counter(all_words)
display(c)


In [None]:
# exclude stopwords and punctuation from the previous product
from nltk.corpus import stopwords
import string

# Build the lookup sets once instead of re-reading the stopword list per token.
english_stopwords = set(stopwords.words('english'))
punctuation_chars = set(string.punctuation)

# Most frequent tokens first.
for word, freq in c.most_common():
    # The stopword list is lower-case, so compare case-insensitively ("The", "I").
    is_stopword = word.lower() in english_stopwords
    # A token counts as punctuation when every character is punctuation; this
    # also drops multi-character tokens such as "..." or "''".
    is_punctuation = all(ch in punctuation_chars for ch in word)
    if not is_stopword and not is_punctuation:
        print("%s %s" % (word, freq))

In [None]:
# count of reviews containing the selected word
mentions_water = tagged_reviews.text.str.contains(" water")
tagged_reviews.text[mentions_water].count()