In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import nltk
from apyori import apriori
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet 
from nltk.corpus import sentiwordnet as swn
from collections import Counter
from textblob import TextBlob

### Import TripAdvisor dataset (currently one csv file from each city)

In [2]:
filepath = ['reviews/Chicago_Illinois_1.csv', 'reviews/Las_Vegas_Nevada_1.csv', \
            'reviews/New_York_City_New_York_1.csv', 'reviews/San_Francisco_California_1.csv']

data = pd.DataFrame()
for path in filepath:
    df = pd.read_csv(path)
    df['city'] = path.split('/')[1]
    data = data.append(df)
data.shape #247,637 rows, 19 columns

(247637, 19)

In [3]:
#data.isnull().sum()
del data['neighborhood'] #remove column given 80+% null

In [4]:
#Make sure every sentence ends in a period and every review is split into sentences
#for cases such as {961, 869, 871, 809, 717, 494, 911, 720, 818, 340, 634, 310, 442, 990}
#data['review_body'] = data['review_body']+'.'
data['review_body'] = [x.split('.') for x in data['review_body']]

In [5]:
import random
data_sample = data.sample(frac=0.01, replace=False,random_state=1)
len(data_sample)

2476

### Import LIWC scores for each csv

In [6]:
filepath_LIWC = ['reviews/LIWC_scores_Chicago_Illinois_1.csv', 'reviews/LIWC_scores_Las_Vegas_Nevada_1.csv', \
            'reviews/LIWC_scores_New_York_City_New_York_1.csv', 'reviews/LIWC_scores_San_Francisco_California_1.csv']
consciousness = pd.DataFrame()
for path in filepath_LIWC:
    d = pd.read_csv(path)
    d['city'] = path.split('/')[1]
    consciousness = consciousness.append(d)

consciousness.shape

(247637, 45)

In [7]:
# Drop redundant columns from dataframe
consciousness.drop(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
                   'R'], axis=1, inplace=True)
consciousness_sample = consciousness.sample(frac=0.1, replace=False,random_state=1)
consciousness_sample.head()

Unnamed: 0,affect,anger,anx,cause,certain,city,cogproc,compare,differ,discrep,...,negemo,percept,posemo,quant,relativ,sad,see,space,tentat,time
7472,7.46,0.0,0.0,1.49,0.0,LIWC_scores_Chicago_Illinois_1.csv,10.45,2.99,2.99,2.99,...,0.0,4.48,7.46,5.97,5.97,0.0,2.99,5.97,0.0,0.0
3833,5.94,0.0,0.0,0.99,1.98,LIWC_scores_New_York_City_New_York_1.csv,8.91,,2.97,1.98,...,0.99,3.96,2.97,,18.81,0.0,1.98,9.9,0.99,7.92
13976,10.53,0.0,0.0,0.0,3.51,LIWC_scores_Las_Vegas_Nevada_1.csv,3.51,,0.0,0.0,...,0.0,0.0,10.53,,14.04,0.0,0.0,7.02,0.0,5.26
20253,11.39,0.0,0.0,0.63,1.27,LIWC_scores_New_York_City_New_York_1.csv,5.7,,1.27,0.0,...,0.63,2.53,10.76,,15.19,0.63,0.63,8.23,1.27,6.33
52744,9.0,0.0,0.0,0.95,0.95,LIWC_scores_Las_Vegas_Nevada_1.csv,2.84,,0.47,0.0,...,0.0,3.79,9.0,,17.06,0.0,3.32,9.95,0.47,5.21


### Read in NRC Emotion Lexicon

In [8]:
with open('NRC-Emotion-Lexicon-Wordlevel-v0.92.csv', 'r') as f:
    df = pd.read_csv(f, delimiter='\t', header=0, names=['word', 'emotion', 'relation'])

# Create lists of emotion words
#(anger, fear, anticipation, trust, surprise, sadness, joy, and disgust)
anticipation = list(df[(df['emotion']=='anticipation') & (df['relation']!=0)]['word'])
anger = list(df[(df['emotion']=='anger') & (df['relation']!=0)]['word'])
fear = list(df[(df['emotion']=='fear') & (df['relation']!=0)]['word'])
trust = list(df[(df['emotion']=='trust') & (df['relation']!=0)]['word'])
surprise = list(df[(df['emotion']=='surprise') & (df['relation']!=0)]['word'])
sadness = list(df[(df['emotion']=='sadness') & (df['relation']!=0)]['word'])
joy = list(df[(df['emotion']=='joy') & (df['relation']!=0)]['word'])
disgust = list(df[(df['emotion']=='disgust') & (df['relation']!=0)]['word'])
neg_NRC = list(df[(df['emotion']=='negative') & (df['relation']!=0)]['word'])
pos_NRC = list(df[(df['emotion']=='positive') & (df['relation']!=0)]['word'])

### Pre-process Text

In [9]:
# Tokenize and tag each sentence in review with part of speech tags.
all_sent = []
for item in data_sample['review_body']:
    tagged_sent = []
    for sent in item:
        tokenized = nltk.word_tokenize(sent)
        tagged=nltk.pos_tag(tokenized)
        tagged_sent.append(tagged)
    all_sent.append(tagged_sent)

### Extract features
Feature details: https://docs.google.com/document/d/1xbN-xf9eeJsqsfqzynQt1KcV51cy0TLiaX3lAP8evBE/edit?usp=sharing 

In [12]:
reviews = []
sentences = []
# Initiate empty list for subjectivity features
nouns = []
adj = []
comparatives = []
superlatives = []
adverbs = []
word_count = []
ne = []
tense = []
polarity = []
dt = []

# Initiate empty list for trauma narrative features
past_tense = []
first_person_plural = []
third_person_prn=[]
negative_words = []
positive_words = []
add_narrative_feat = []
narr_words=[]
# Initiate empty list for TextBlob scores
subjectivity = []
polarity2 = []

# Initiate empty list for NRC Emotion / Plutchik’s scale of emotion
tot_anger = []
tot_fear = []
tot_anticipation = []
tot_trust = []
tot_surprise = []
tot_sadness = []
tot_joy = []
tot_disgust = []
tot_neg_NRC = []
tot_pos_NRC = []
word_ant = []
word_anger = []
word_fear = []
word_trust = []
word_surprise = []
word_sad = []
word_joy = []
word_disgust = []

untagged_sent = []
count=0

In [13]:
for rev in all_sent:
    review_no = data_sample.index.values[count]
    count+=1
    #print(rev)
    for sent in rev:
        if len(sent)==0:
            pass
        else:
            #print(sent)
            reviews.append(review_no)
            # Append sentence
            sentences.append(sent)
            # Get word count
            word_count.append(len(sent))

            counts = Counter(x[1] for x in sent)
            total_nouns = counts['NN'] + counts['NNS'] + counts['NNP'] + counts['NNPS']
            nouns.append(total_nouns)

            #print('total nouns: ' + str(total_nouns))

            total_adj = counts['JJ'] + counts['JJR'] + counts['JJS']
            adj.append(total_adj)
            #print('total adj: ' + str(total_adj))

            total_comparatives = counts['JJR']
            comparatives.append(total_comparatives)
            #print('total comparatives: ' + str(total_comparatives))

            total_superlatives = counts['JJS']
            superlatives.append(total_superlatives)
            #print('total superlatives: ' + str(total_superlatives))

            total_adv = counts['RB']+ counts['RBR'] + counts['RBS']
            adverbs.append(total_adv)
            #print('total adv: ' + str(total_adv))
            
            # Get named entities
            ne_tree =  nltk.ne_chunk(sent)
            ne_list = list(ne_tree)
            counts_ne = Counter(list(ne_list[i])[0][1] for i in range(len(ne_list)) \
                                if type(ne_list[i]) is nltk.tree.Tree)
            total_ne = counts_ne['NNP']
            ne.append(total_ne)
            
            count_future = counts['MD']
            #print('future words: ' + str(count_future))
            count_present = counts['VBP'] + counts['VBG'] + counts['VBZ']
            #print('present words: ' + str(count_present))
            count_past = counts['VBD'] + counts ['VBN']
            past_tense.append(count_past)
            #print('past words: ' + str(count_past))
            # Get max count from future, present, and past, and use as tense of sentence.
            tense_count = max(count_future, count_present, count_past)
            #print('max: ' + str(tense_count))
            if tense_count == count_future:
                tense.append('future')
                #print('this sentence is written in future tense')
            elif tense_count == count_present:
                tense.append('present')
                #print('this sentence is written in present tense')
            else:
                tense.append('past')
                #print('this sentence is written in past tense')
            
            # CD represents cardinal numbers
            count_dt = counts['CD']
            dt.append(count_dt)
            #print('total digits: ' + str(count_dt))
            
            count_fp = Counter(x[0] for x in sent if x[0] in ['we','us','our'])
            first_person_plural.append(sum(count_fp.values()))
            #print('total first person pronoun words: ' + str(sum(count_fp.values())))
            
            count_tp = Counter(x[0] for x in sent if x[0] in ['he','she','it','him','her','his','hers','its','they',\
                                                              'them','their','theirs'])            
            third_person_prn.append(sum(count_tp.values()))
            
            count_add_narr = Counter(x[0] for x in sent if x[0] in ['say','tell','said','told','saying','telling',\
                                                                   'then', 'after', 'before','initially', 'first',\
                                                                   'next', 'while', 'during', 'finally', 'eventually',\
                                                                   'end', 'start'])
            add_narrative_feat.append(sum(count_add_narr.values()))
            narr_words.append(count_add_narr.keys()) 
            # Determine polarity of sentence. Count the number of words from SentiWordNet 
            # (having either nonzero positive polarity score or nonzero negative polarity score) 
            # present in a sentence
            sent_pos = 0
            sent_neg = 0
            for word in sent:
                #print(word)
                pos_list = []
                neg_list = []
                # Get word synonyms
                syns = wordnet.synsets(word[0])
                if syns == []:
                    pass
                else:
                    # Append all polarity scores for each word and take the max positive and negative
                    # to use as overall word polarity. 
                    for s in syns:
                        #print(s.name)
                        swn_synset =swn.senti_synset(s.name())
                        pos_list.append(swn_synset.pos_score())
                        neg_list.append(swn_synset.neg_score())
                        #print('Positive: ' + str(pos_list))
                        #print('Negative: ' + str(neg_list))
                    word_pos = max(pos_list)
                    #print('Positive max: ' + str(word_pos))
                    word_neg = max(neg_list)
                    #print('Negative max: ' + str(word_neg))
                    if word_pos > word_neg:
                        sent_pos +=1
                    if word_neg > word_pos:
                        sent_neg +=1
            negative_words.append(sent_neg)
            positive_words.append(sent_pos)
            # Sum up number of positive and negative words in sentence
            total_polarity = sent_pos+sent_neg
            polarity.append(total_polarity)
            
            # NRC Emotions: anger, fear, anticipation, trust, surprise, sadness, joy, and disgust
            count_ant = Counter(x[0] for x in sent if x[0] in anticipation)
            tot_anticipation.append(sum(count_ant.values()))
            word_ant.append(count_ant.keys())
            count_anger = Counter(x[0] for x in sent if x[0] in anger)
            tot_anger.append(sum(count_anger.values()))
            word_anger.append(count_anger.keys())
            count_fear = Counter(x[0] for x in sent if x[0] in fear)
            tot_fear.append(sum(count_fear.values()))
            word_fear.append(count_fear.keys())
            count_trust = Counter(x[0] for x in sent if x[0] in trust)
            tot_trust.append(sum(count_trust.values()))
            word_trust.append(count_trust.keys())
            count_surprise = Counter(x[0] for x in sent if x[0] in surprise)
            tot_surprise.append(sum(count_surprise.values()))
            word_surprise.append(count_surprise.keys())
            count_sad = Counter(x[0] for x in sent if x[0] in sadness)
            tot_sadness.append(sum(count_sad.values()))
            word_sad.append(count_sad.keys())
            count_joy = Counter(x[0] for x in sent if x[0] in joy)
            tot_joy.append(sum(count_joy.values()))
            word_joy.append(count_joy.keys())
            count_disgust = Counter(x[0] for x in sent if x[0] in disgust)
            tot_disgust.append(sum(count_disgust.values()))
            word_disgust.append(count_disgust.keys())
            count_neg = Counter(x[0] for x in sent if x[0] in neg_NRC)
            tot_neg_NRC.append(sum(count_neg.values()))
            count_pos = Counter(x[0] for x in sent if x[0] in pos_NRC)
            tot_pos_NRC.append(sum(count_pos.values()))
            
            # Use TextBlob to sentence-level subjectivity and polarity scores
            # Each word in the lexicon has scores for:
            # 1) polarity: negative vs. positive    (-1.0 => +1.0)
            # 2) subjectivity: objective vs. subjective (+0.0 => +1.0)
            new_sent = " ".join([a for a, b in sent])
            text = TextBlob(new_sent)
            untagged_sent.append(new_sent)
            subjectivity.append(text.sentiment.subjectivity)
            polarity2.append(text.sentiment.polarity)
            
            
print(count)

2476


In [14]:
# Create a dataframe of sentence-level records
setup = {'Reviews':reviews, 'Sentence':sentences, 'Untagged': untagged_sent, \
         'Word_Count': word_count, 'Nouns': nouns, 'Adjectives':adj, \
         'Comparatives': comparatives, 'Superlatives':superlatives, 'Adverbs': adverbs,\
         'Subjectivity': subjectivity, 'Tense': tense, 'Digits': dt, \
         'Polarity': polarity, 'Polarity2': polarity2, 'First Person': first_person_plural,     
         'Named Entities': ne, 'Third Person': third_person_prn, 'Past Tense': past_tense, \
         'Narrative Seq': add_narrative_feat, 'Narr Words': narr_words, 'Neg_Words': negative_words, 'Anger':tot_anger, \
         'Anger Words': word_anger, 'Anticipation':tot_anticipation, 'Ant Words': word_ant, 'Fear':tot_fear,\
         'Fear Words': word_fear, 'Trust': tot_trust, 'Trust Words': word_trust, 'Surprise':tot_surprise, \
         'Surprise Words': word_surprise, 'Sad': tot_sadness, 'Sad Words': word_sad, 'Joy':tot_joy, \
         'Joy Words': word_joy,'Disgust': tot_disgust, 'Disgust Words': word_disgust, 'neg_NRC':tot_neg_NRC, \
         'pos_NRC':tot_pos_NRC
}
opinions = pd.DataFrame(setup)    

In [92]:
#opinions.to_csv('opinions_NRC_sample', sep=',')

In [114]:
#opinions = pd.read_csv('opinions_NRC_sample')
#opinions.head()

In [15]:
opinions = opinions.drop_duplicates(subset=['Reviews'], keep='first')

In [16]:
# Aggregate sentence-level dataframe back into review-level
aggregation_functions = {'Adjectives': 'sum', 'Adverbs': 'sum', 'Comparatives': 'sum', 'Superlatives': 'sum',
                        'Digits': 'sum', 'Nouns': 'sum', 'Polarity': 'sum', 'Polarity2': 'mean',
                        'Subjectivity': 'mean', 'Tense': lambda col: ', '.join(col), 'Word_Count': 'sum',
                        'First Person': 'sum', 'Named Entities': 'sum', 'Third Person': 'sum', 'Past Tense': 'sum',
                        'Narrative Seq': 'sum', 'Narr Words': 'sum','Neg_Words': 'sum', 'Anger':'sum', 'Anger Words': 'sum',
                         'Anticipation':'sum', 'Ant Words': 'sum', 'Fear':'sum', 'Fear Words': 'sum', 
                         'Trust': 'sum', 'Trust Words': 'sum', 'Surprise':'sum', 'Surprise Words': 'sum',
                         'Sad': 'sum', 'Sad Words': 'sum', 'Joy': 'sum', 'Joy Words': 'sum', 'Disgust': 'sum', 
                         'Disgust Words': 'sum', 'neg_NRC':'sum', 'pos_NRC':'sum'}
df1 = opinions.groupby(opinions['Reviews']).agg(aggregation_functions)

In [17]:
data_sample = data_sample[~data_sample.index.duplicated(keep='first')]
consciousness_sample = consciousness_sample[~consciousness_sample.index.duplicated(keep='first')]

In [18]:
len(data_sample)

2438

In [19]:
len(df1)

2438

In [None]:
# Make values percentages
df2 = df1[['Adjectives','Adverbs', 'Comparatives', 'Superlatives', 'Digits', 'Nouns', 'Polarity',
           'Named Entities', 'First Person', 'Third Person', 'Past Tense', 'Neg_Words', 'Narrative Seq', 
           'Anger', 'Anticipation', 'Fear', 'Trust', 'Surprise', 'Sad', 'Joy', 'Disgust',
           'neg_NRC', 'pos_NRC']].div(df1.Word_Count, axis=0)
df2['Tense'] = df1['Tense']
df2['Polarity2'] = df1['Polarity2']
df2['Subjectivity'] = df1['Subjectivity']
df2['Word_Count'] = df1['Word_Count']
df2['Ant Words'] = df1['Ant Words']
df2['Anger Words'] = df1['Anger Words']
df2['Fear Words'] = df1['Fear Words']
df2['Trust Words'] = df1['Trust Words']
df2['Surprise Words'] = df1['Surprise Words']
df2['Sad Words'] = df1['Sad Words']
df2['Joy Words'] = df1['Joy Words']
df2['Disgust Words'] = df1['Disgust Words']
df2['Narr Words'] = df1['Narr Words']
df2.head()

In [21]:
# create column for Narrative to denote presence of narrative (if sum of % of
# past tense, third person pronoun and narrative feature variables > 0.5, then 1 for narrative)
df2['Narrative'] = [1 if (df2.iloc[i]['Past Tense']+df2.iloc[i]['Third Person']+df2.iloc[i]['Narrative Seq'])>=0.5
                    else 0 for i in range(len(df2))]
df2['Tense'] = [Counter(i.split(', ')).most_common()[0][0] for i in df2['Tense']]
df2['affect'] = consciousness_sample['affect']*.01
df2['posemo'] = consciousness_sample['posemo']*.01
df2['negemo'] = consciousness_sample['negemo']*.01
df2['Trauma'] = [1 if (df2.iloc[i]['Narrative']>0) and (df2.iloc[i]['Neg_Words']>0) 
                 and (df2.iloc[i]['First Person']>0) else 0 for i in range(len(df2))]
df2['anx'] = consciousness_sample['anx']*.01*.01
df2['anger'] = consciousness_sample['anger']*.01
df2['sad'] = consciousness_sample['sad']*.01 #[:8088]
df2['cogproc'] = consciousness_sample['cogproc']*.01 #[:8088]
df2['insight'] = consciousness_sample['insight']*.01 #[:8088]
df2['cause'] = consciousness_sample['cause']*.01 #[:8088]
df2['discrep'] = consciousness_sample['discrep']*.01 #[:8088]
df2['tentat'] = consciousness_sample['tentat']*.01 #[:8088]
df2['certain'] = consciousness_sample['certain']*.01 #[:8088]
df2['differ'] = consciousness_sample['differ']*.01 #[:8088]
df2['Rating'] = data_sample['rating'] #[:8088]
df2['Helpful'] = data_sample['helpful_vote'] #[:8088]
df2['city'] = data_sample['city']

In [22]:
# Remove rows with rating of 3
df2 = df2[df2.Rating != 30]
df2.shape

(2164, 54)

In [23]:
df2['Rating'] = df2['Rating'].map({50: 1, 40: 1, 20: 0, 10: 0})
df2['Tense'] = df2['Tense'].map({'past': -1, 'present': 2, 'future': 3})
df2['city'] = df2['city'].map({'Chicago_Illinois_1.csv': 1, 'New_York_City_New_York_1.csv': 2, 
                              'San_Francisco_California_1.csv': 3, 'Las_Vegas_Nevada_1.csv': 4})

In [24]:
# Create spatial and temporal columns as stated in What Happens in Vegas

consciousness_sample['spatial'] = [2 if (consciousness_sample.iloc[s]['space'] > 0) & (consciousness_sample.iloc[s]['percept'] > 0) 
                            else 1 if (consciousness_sample.iloc[s]['space'] > 0) & (consciousness_sample.iloc[s]['percept'] == 0) 
                            else 0 for s in range(len(consciousness_sample))]
consciousness_sample['temporal'] = [2 if (consciousness_sample.iloc[s]['time'] > 0) & (consciousness_sample.iloc[s]['cause'] > 0) 
                            else 1 if (consciousness_sample.iloc[s]['time'] > 0) | (consciousness_sample.iloc[s]['cause'] > 0) 
                            else 0 for s in range(len(consciousness_sample))]

In [25]:
df2['spatial'] = consciousness_sample['spatial']
df2['temporal'] = consciousness_sample['temporal']

In [26]:
print('Number of positive reviews: ' + str(len(df2[df2['Rating'] == 1])))
print('Number of negative reviews: ' + str(len(df2[df2['Rating'] == 0])))
print('% of positive: ' + str(round((len(df2[df2['Rating'] == 1])/(len(df2[df2['Rating'] == 1])+len(df2[df2['Rating'] == 0])))*100,2)))

Number of positive reviews: 1883
Number of negative reviews: 281
% of positive: 87.01


In [27]:
print('Chicago reviews: ' + str(len(df2[df2['city'] == 1])))
print('NYC reviews: ' + str(len(df2[df2['city'] == 2])))
print('SF reviews: ' + str(len(df2[df2['city'] == 3])))
print('Las Vegas reviews: ' + str(len(df2[df2['city'] == 4])))

Chicago reviews: 536
NYC reviews: 534
SF reviews: 534
Las Vegas reviews: 560


In [28]:
df2.to_csv('Chi_NYC_LV_features_NRC', sep=',')

## Train Model

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.utils import resample

In [30]:
y = df2['Rating']
#'Adjectives', 'Adverbs', 'Comparatives', 'Superlatives', 'Digits',
       #'Nouns', 'Polarity', 'Named Entities', 'Tense', 'Past Tense', 'Neg_Words', 'Narrative', 
    # 'posemo', 'negemo','Trauma', 'anx', 'anger', 'sad','insight', 'cause', 'discrep', 'tentat', 'certain', 'differ' 
X = df2[['Polarity2','Subjectivity', 'Word_Count', 'Named Entities',
       'affect', 'cogproc','Helpful', 'city', 'Trauma','spatial', 'temporal', 'Third Person', 'Past Tense']]

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
print(metrics.classification_report(y_test, y_pred))


Accuracy of logistic regression classifier on test set: 0.89
[[ 16  68]
 [  6 560]]
             precision    recall  f1-score   support

          0       0.73      0.19      0.30        84
          1       0.89      0.99      0.94       566

avg / total       0.87      0.89      0.86       650



### Add NRC

In [32]:
y1 = df2['Rating']
#'Adjectives', 'Adverbs', 'Comparatives', 'Superlatives', 'Digits',
       #'Nouns', 'Polarity', 'Named Entities', 'Tense', 'Past Tense', 'Neg_Words', 'Narrative', 
    # 'posemo', 'negemo','Trauma', 'anx', 'anger', 'sad','insight', 'cause', 'discrep', 'tentat', 'certain', 'differ' 
X1 = df2[['Polarity2','Subjectivity', 'Word_Count', 'Named Entities',
       'affect', 'cogproc','Helpful', 'city', 'Trauma','spatial', 'temporal', 'Third Person', 'Past Tense', 'Anger', 'Anticipation', 'Fear',\
       'Trust', 'Surprise', 'Sad', 'Joy','Disgust']]

In [33]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.3, random_state=0, stratify=y)
logreg1 = LogisticRegression()
logreg1.fit(X_train1, y_train1)
y_pred1 = logreg1.predict(X_test1)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg1.score(X_test1, y_test1)))
confusion_matrix = metrics.confusion_matrix(y_test1, y_pred1)
print(confusion_matrix)
print(metrics.classification_report(y_test1, y_pred1))


Accuracy of logistic regression classifier on test set: 0.89
[[ 16  68]
 [  6 560]]
             precision    recall  f1-score   support

          0       0.73      0.19      0.30        84
          1       0.89      0.99      0.94       566

avg / total       0.87      0.89      0.86       650



### Upsampled

In [34]:
from sklearn.utils import resample
# Separate majority and minority classes
df_majority = df2[df2.Rating==1]
df_minority = df2[df2.Rating==0]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df2[df2['Rating'] == 1]),    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.Rating.value_counts()

1    1883
0    1883
Name: Rating, dtype: int64

In [122]:
df_upsampled.to_csv('Chi_NYC_LV_features_NRC_upsampled', sep=',')

In [35]:
#Scale data to determine feature importance
y2 = df_upsampled.Rating
X2 = df_upsampled[['Polarity2','Subjectivity', 'Word_Count', 'Named Entities',
       'affect', 'cogproc','Helpful', 'city', 'Trauma','spatial', 'temporal', 'Third Person', 'Past Tense']]
#scaler = RobustScaler()
#scaler.fit(X1) 
#X_scaled1 = pd.DataFrame(scaler.transform(X1),columns = X1.columns)
#X_scaled1.head()

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=0)
logreg2 = LogisticRegression()
logreg2.fit(X_train2, y_train2)
y_pred2 = logreg2.predict(X_test2)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg2.score(X_test2, y_test2)))
confusion_matrix1 = metrics.confusion_matrix(y_test2, y_pred2)
print(confusion_matrix1)
print(metrics.classification_report(y_test2, y_pred2))


Accuracy of logistic regression classifier on test set: 0.76
[[438 128]
 [139 425]]
             precision    recall  f1-score   support

          0       0.76      0.77      0.77       566
          1       0.77      0.75      0.76       564

avg / total       0.76      0.76      0.76      1130



### Add NRC

In [36]:
#Scale data to determine feature importance
y2 = df_upsampled.Rating
X2 = df_upsampled[['Polarity2','Subjectivity', 'Word_Count', 'Named Entities',
       'affect', 'cogproc','Helpful', 'city', 'Trauma','spatial', 'temporal', 'Third Person', 'Past Tense', 'Anger', 'Anticipation', 'Fear',\
       'Trust', 'Surprise', 'Sad', 'Joy','Disgust']]
#scaler = RobustScaler()
#scaler.fit(X1) 
#X_scaled1 = pd.DataFrame(scaler.transform(X1),columns = X1.columns)
#X_scaled1.head()

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=0)
logreg2 = LogisticRegression()
logreg2.fit(X_train2, y_train2)
y_pred2 = logreg2.predict(X_test2)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg2.score(X_test2, y_test2)))
confusion_matrix1 = metrics.confusion_matrix(y_test2, y_pred2)
print(confusion_matrix1)
print(metrics.classification_report(y_test2, y_pred2))


Accuracy of logistic regression classifier on test set: 0.77
[[438 128]
 [132 432]]
             precision    recall  f1-score   support

          0       0.77      0.77      0.77       566
          1       0.77      0.77      0.77       564

avg / total       0.77      0.77      0.77      1130



## Look at Incorrects

In [37]:
results_df = X_test2
results_df['y_test']= y_test2
results_df['y_pred']=y_pred2
incorrects = results_df[results_df['y_test']!=results_df['y_pred']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [38]:
data_sample2 = data.sample(frac=0.01, replace=False,random_state=1)

In [39]:
incorrects['review_body']=np.nan
incorrects['Anger Words']=np.nan
incorrects['Ant Words'] =np.nan
incorrects['Fear Words']=np.nan
incorrects['Trust Words']=np.nan
incorrects['Surprise Words'] =np.nan
incorrects['Joy Words'] =np.nan
incorrects['Disgust Words'] =np.nan
incorrects['Sad Words'] =np.nan

for i in incorrects.index.values:
    incorrects['review_body'].loc[i] = str(data_sample2['review_body'].loc[i])
    incorrects['Anger Words'].loc[i] = str(df2['Anger Words'].loc[i])
    incorrects['Ant Words'].loc[i] = str(df2['Ant Words'].loc[i])
    incorrects['Fear Words'].loc[i] = str(df2['Fear Words'].loc[i])
    incorrects['Trust Words'].loc[i] = str(df2['Trust Words'].loc[i])
    incorrects['Surprise Words'].loc[i] = str(df2['Surprise Words'].loc[i])
    incorrects['Joy Words'].loc[i] = str(df2['Joy Words'].loc[i])
    incorrects['Disgust Words'].loc[i] = str(df2['Disgust Words'].loc[i])
    incorrects['Sad Words'].loc[i] = str(df2['Sad Words'].loc[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [41]:
results_df['review_body']=np.nan
results_df['Anger Words']=np.nan
results_df['Ant Words'] =np.nan
results_df['Fear Words']=np.nan
results_df['Trust Words']=np.nan
results_df['Surprise Words'] =np.nan
results_df['Joy Words'] =np.nan
results_df['Disgust Words'] =np.nan
results_df['Sad Words'] =np.nan
for i in results_df.index.values:
    results_df['review_body'].loc[i] = str(data_sample2['review_body'].loc[i])
    results_df['Anger Words'].loc[i] = str(df2['Anger Words'].loc[i])
    results_df['Ant Words'].loc[i] = str(df2['Ant Words'].loc[i])
    results_df['Fear Words'].loc[i] = str(df2['Fear Words'].loc[i])
    results_df['Trust Words'].loc[i] = str(df2['Trust Words'].loc[i])
    results_df['Surprise Words'].loc[i] = str(df2['Surprise Words'].loc[i])
    results_df['Joy Words'].loc[i] = str(df2['Joy Words'].loc[i])
    results_df['Disgust Words'].loc[i] = str(df2['Disgust Words'].loc[i])
    results_df['Sad Words'].loc[i] = str(df2['Sad Words'].loc[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [101]:
false_positives = incorrects[(incorrects['y_test']==0) & (incorrects['y_pred']==1)]
false_negatives = incorrects[(incorrects['y_test']==1) & (incorrects['y_pred']==0)]
true_positives = results_df[(results_df['y_test']==1) & (results_df['y_pred']==1)]
true_negatives = results_df[(results_df['y_test']==0) & (results_df['y_pred']==0)]

In [44]:
false_positives.to_csv('false_positives_NRC_upsampled', sep=',')
false_negatives.to_csv('false_negatives_NRC_upsampled', sep=',')
true_positives.to_csv('true_positives_NRC_upsampled', sep=',')
true_negatives.to_csv('true_negatives_NRC_upsampled', sep=',')

In [102]:
print(len(true_negatives))
true_negatives = true_negatives[~true_negatives.index.duplicated(keep='first')]
print(len(true_negatives))

438
194


In [110]:
print(len(false_positives))
false_positives = false_positives[~false_positives.index.duplicated(keep='first')]
print(len(false_positives))

128
58


### Emotion Words in True Negative and Positive Reviews

In [105]:
#'Anger Words','Ant Words', 'Fear Words', 'Trust Words', 'Surprise Words', 'Joy Words',
                #'Disgust Words', 'Sad Words'
print('Anger Words in True Negative Reviews')
for i in true_negatives.index.values:
    if true_negatives.loc[i]['Anger Words']!='dict_keys([])':
        print(true_negatives.loc[i]['Anger Words'].replace("dict_keys([","").replace("])",""))

print('Anticipation Words in True Negative Reviews')
for i in true_negatives.index.values:
    if true_negatives.loc[i]['Ant Words']!='dict_keys([])':
        print(true_negatives.loc[i]['Ant Words'].replace("dict_keys([","").replace("])",""))
        
print('Disgust Words in True Negative Reviews')
for i in true_negatives.index.values:
    if true_negatives.loc[i]['Disgust Words']!='dict_keys([])':
        print(true_negatives.loc[i]['Disgust Words'].replace("dict_keys([","").replace("])",""))

print('Sad Words in True Negative Reviews')
for i in true_negatives.index.values:
    if true_negatives.loc[i]['Sad Words']!='dict_keys([])':
        print(true_negatives.loc[i]['Sad Words'].replace("dict_keys([","").replace("])",""))

Anger Words in True Negative Reviews
'hot'
'disgusting', 'smell'
'terrible'
'bad', 'treat', 'money'
'bad', 'warp'
'disappointed', 'noisy'
'hot'
'noisy'
'terrible'
'bad'
'treat'
'force', 'fee'
'stolen'
'money'
'broken', 'stolen', 'cash'
'hit', 'fee', 'greed'
'awful'
'shoddy'
'bad', 'terrible'
'disappointed'
'disappointed'
'bad'
'misleading'
'disappointed'
'terrible'
'bad'
'disappointed'
'cutting'
'noisy'
'horrible'
'fee', 'ridiculous'
'hot'
'horrible', 'money'
'bad'
'stolen'
'money'
'noisy', 'bad'
'fee'
'smell', 'horrible'
'horrible'
'mad'
'ruined'
'remove'
'bad'
'money'
'wireless', 'row'
'bad'
'terrible'
Anticipation Words in True Negative Reviews
'renovation'
'time'
'god'
'share'
'friendly'
'treat', 'money'
'advance', 'arrive'
'responsive'
'pay', 'advance'
'time'
'time'
'excited', 'arrive'
'spa', 'celebrating', 'time', 'perfect'
'time'
'honeymoon'
'good'
'vacation'
'treat', 'mother'
'pretty'
'time'
'star'
'time'
'pay'
'money'
'cash'
'time'
'medical'
'ready'
'share', 'opportunity'
'arr

In [77]:
#'Anger Words','Ant Words', 'Fear Words', 'Trust Words', 'Surprise Words', 'Joy Words',
                #'Disgust Words', 'Sad Words'
print('Trust Words in True Positive Reviews')
for i in true_positives.index.values:
    if true_positives.loc[i]['Trust Words']!='dict_keys([])':
        print(true_positives.loc[i]['Trust Words'].replace("dict_keys([","").replace("])",""))

print('Surprise Words in True Positive Reviews')
for i in true_positives.index.values:
    if true_positives.loc[i]['Surprise Words']!='dict_keys([])':
        print(true_positives.loc[i]['Surprise Words'].replace("dict_keys([","").replace("])",""))
        
print('Joy Words in True Positive Reviews')
for i in true_positives.index.values:
    if true_positives.loc[i]['Joy Words']!='dict_keys([])':
        print(true_positives.loc[i]['Joy Words'].replace("dict_keys([","").replace("])",""))

Trust Words in True Positive Reviews
'attentive'
'center'
'personal'
'good'
'recommend'
'clean', 'excellent', 'helpful', 'professional', 'fairly'
'clean'
'attentive'
'clean'
'fully'
'lovely', 'clean'
'friendly', 'efficient', 'wonderful'
'friend'
'perfect', 'money'
'pleasant'
'friendly'
'honest'
'wonderful', 'attentive'
'center'
'heritage'
'good'
'interior', 'thoughtful', 'friendly'
'clean'
'recommend'
'oasis'
'helpful', 'professional'
'loyalty', 'perfect'
'honest'
'excellent'
'shopping'
'genuine'
'good'
'honeymoon'
'shopping', 'food'
'clean', 'friendly', 'helpful'
'excellent'
'favorite'
'excellent'
'clean'
'perfect', 'friendly'
'share'
'good'
'helpful', 'friendly'
'shopping'
'perfect', 'friendly'
'wonderful'
'smile'
'good'
'excellent'
'shopping'
'lovely', 'excellent'
'clean'
'perfect'
'clean'
'shopping'
'perfect'
'pretty', 'good'
'found', 'excellent'
'pleasant', 'friendly', 'helpful'
'food'
'friendly'
'attentive'
'excellent', 'money'
'friendly'
'level'
'clean'
'friendly'
'shopping'
'en

### Sample False Positive

In [112]:
for i in false_positives.index.values[:5]:
    print('Review #: ' + str(i))
    print(false_positives.loc[i]['review_body'])
    print('\n')

Review #: 33891
['I booked this hotel because it was cheaper than others, and reviews were ok', 'The registration took nearly an hour (there is a computer for a automatic registration but nobody informed me)', 'There are 6 elevators to the floor but the Hotel was full and the elevarors were not enough, the first time we waited patiently for a elevator (we had valises), fortunately we were in a low floor so, after that, we (as many others) used the stairs', 'The pool is realatively small, it was so full that you bearly could move arround', 'The room was ok (although the icebox was noisy)We got a deal that included breakfast, we thought we would have a calm breakfast in a dining room of a hotel but at the registration, we got informed there is no room, we had to go to one specific coffee shop and spend no more than 9$ (not a good deal because now we had to go to that shop while there are many others nearby)Summary -Not recommend this placeIf I came again I will spend more money and enjoy

### Sample False Negative

In [113]:
for i in false_negatives.index.values[:5]:
    print('Review #: ' + str(i))
    print(false_negatives.loc[i]['review_body'])
    print('\n')

Review #: 13772
['We used to live not far from the Marcel and I am in its neighborhood almost weekly, but had never noticed the hotel before my husband made a reservation there for a birthday weekend', " I was actually a little apprehensive when he told me where he had reserved the room, because I couldn't imagine what the hotel would be like", ' There was no need to worry', ' The experience from check in to check out was great', ' The front desk staff was friendly and efficient', ' The room was spotless and surprisingly spacious for Manhattan', ' The modern style of both the room and the lobby was appealing', ' There was even complementary coffee in an upstairs guest lounge and a continental breakfast available in the morning for a separate fee', " Although we opted to eat at a favorite breakfast destination in the neighborhood (Bluebell Café, on Third Avenue and 22nd Street), the hotel's option looked good", " We will definitely return in the future, as the location is convenient to 

### Value Comparisons btw True/False + -

In [106]:
from statistics import mean 
print('False+ | Polarity2: ' + str(mean(false_positives.Polarity2)) + ' True |' +str(mean(true_negatives.Polarity2)))
print('False+ | Subjectivity: ' + str(mean(false_positives.Subjectivity))+ ' True |' +str(mean(true_negatives.Subjectivity)))
print('False+ | Word_Count: ' + str(mean(false_positives.Word_Count))+ ' True |' +str(mean(true_negatives.Word_Count)))
print('False+ | Helpful: ' + str(mean(false_positives.Helpful))+ ' True |' +str(mean(true_negatives.Helpful)))
print('False+ | affect: ' + str(mean(false_positives.affect))+ ' True |' +str(mean(true_negatives.affect)))
print('False+ | cogproc: ' + str(mean(false_positives.cogproc))+ ' True |' +str(mean(true_negatives.cogproc)))
print('False+ | Trauma: ' + str(mean(false_positives.Trauma))+ ' True |' +str(mean(true_negatives.Trauma)))
print('False+ | spatial: ' + str(mean(false_positives.spatial))+ ' True |' +str(mean(true_negatives.spatial)))
print('False+ | temporal: ' + str(mean(false_positives.temporal))+ ' True |' +str(mean(true_negatives.temporal)))
print('False+ | Anger: ' + str(mean(false_positives.Anger))+ ' True |' +str(mean(true_negatives.Anger)))
print('False+ | Anticipation: ' + str(mean(false_positives.Anticipation))+ ' True |' +str(mean(true_negatives.Anticipation)))
print('False+ | Trust: ' + str(mean(false_positives.Trust))+ ' True |' +str(mean(true_negatives.Trust)))
print('False+ | Surprise: ' + str(mean(false_positives.Surprise))+ ' True |' +str(mean(true_negatives.Surprise)))
print('False+ | Sad: ' + str(mean(false_positives.Sad))+ ' True |' +str(mean(true_negatives.Sad)))
print('False+ | Joy: ' + str(mean(false_positives.Joy))+ ' True |' +str(mean(true_negatives.Joy)))
print('False+ | Disgust: ' + str(mean(false_positives.Disgust))+ ' True |' +str(mean(true_negatives.Disgust)))
print('False+ | Fear: ' + str(mean(false_positives.Fear))+ ' True |' +str(mean(true_negatives.Fear)))
            

False+ | Polarity2: 0.3185039569805195 True |-0.10390411850463396
False+ | Subjectivity: 0.4803929642406205 True |0.3857907127237024
False+ | Word_Count: 14 True |20
False+ | Helpful: 20 True |11
False+ | affect: 0.0546375 True |0.045620103092783505
False+ | cogproc: 0.1025796875 True |0.09956494845360825
False+ | Trauma: 0 True |0
False+ | spatial: 1 True |1
False+ | temporal: 1 True |1
False+ | Anger: 0.003037052266081871 True |0.017211995633945106
False+ | Anticipation: 0.02483077202191785 True |0.019008265004702207
False+ | Trust: 0.03538472189632535 True |0.01637514573586462
False+ | Surprise: 0.016319369792695257 True |0.006145783411439398
False+ | Sad: 0.002441454586701864 True |0.015270775933676506
False+ | Joy: 0.029703574537062665 True |0.013831640934790813
False+ | Disgust: 0.0012335526315789473 True |0.019013842023626937
False+ | Fear: 0.005343599885129491 True |0.012988083678996414


In [107]:
print('False- | Polarity2: ' + str(mean(false_negatives.Polarity2))+ ' True |' +str(mean(true_positives.Polarity2)))
print('False- | Subjectivity: ' + str(mean(false_negatives.Subjectivity))+ ' True |' +str(mean(true_positives.Subjectivity)))
print('False- | Word_Count: ' + str(mean(false_negatives.Word_Count))+ ' True |' +str(mean(true_positives.Word_Count)))
print('False- | Helpful: ' + str(mean(false_negatives.Helpful))+ ' True |' +str(mean(true_positives.Helpful)))
print('False- | affect: ' + str(mean(false_negatives.affect))+ ' True |' +str(mean(true_positives.affect)))
print('False- | cogproc: ' + str(mean(false_negatives.cogproc))+ ' True |' +str(mean(true_positives.cogproc)))
print('False- | Trauma: ' + str(mean(false_negatives.Trauma))+ ' True |' +str(mean(true_positives.Trauma)))
print('False- | spatial: ' + str(mean(false_negatives.spatial))+ ' True |' +str(mean(true_positives.spatial)))
print('False- | temporal: ' + str(mean(false_negatives.temporal))+ ' True |' +str(mean(true_positives.temporal)))
print('False- | Anger: ' + str(mean(false_negatives.Anger))+ ' True |' +str(mean(true_positives.Anger)))
print('False- | Anticipation: ' + str(mean(false_negatives.Anticipation))+ ' True |' +str(mean(true_positives.Anticipation)))
print('False- | Trust: ' + str(mean(false_negatives.Trust))+ ' True |' +str(mean(true_positives.Trust)))
print('False- | Surprise: ' + str(mean(false_negatives.Surprise))+ ' True |' +str(mean(true_positives.Surprise)))
print('False- | Sad: ' + str(mean(false_negatives.Sad))+ ' True |' +str(mean(true_positives.Sad)))
print('False- | Joy: ' + str(mean(false_negatives.Joy))+ ' True |' +str(mean(true_positives.Joy)))
print('False- | Disgust: ' + str(mean(false_negatives.Disgust))+ ' True |' +str(mean(true_positives.Disgust)))
print('False- | Fear: ' + str(mean(false_negatives.Fear))+ ' True |' +str(mean(true_positives.Fear)))
            

False- | Polarity2: 0.030926157739266505 True |0.45260317134573774
False- | Subjectivity: 0.2445374188752165 True |0.6102787830994125
False- | Word_Count: 18 True |16
False- | Helpful: 10 True |18
False- | affect: 0.05787575757575757 True |0.09004953703703704
False- | cogproc: 0.09607121212121213 True |0.07933402777777777
False- | Trauma: 0 True |0
False- | spatial: 1 True |1
False- | temporal: 1 True |1
False- | Anger: 0.004755669105717281 True |0.0022862478577837363
False- | Anticipation: 0.021757516973061646 True |0.02855271164296416
False- | Trust: 0.015951096406668357 True |0.04114897861434589
False- | Surprise: 0.012525183356726612 True |0.01451659588388738
False- | Sad: 0.008381353693547713 True |0.004980185046805211
False- | Joy: 0.013792447953588947 True |0.045059468014049135
False- | Disgust: 0.003536604240829593 True |0.0013586385645043368
False- | Fear: 0.008246246133899968 True |0.0016556714100557355
