In [None]:
'''
This script adds tags to reviews that contain words from a provided,
categorized dictionary. It generates a file containing a table with an
additional boolean column for each tag.
The file also includes a few queries that compute useful statistics from
the generated tags.
'''

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [135]:
# Inputs: the keyword dictionary (its columns are used as the tag names
# further down) and the pickled table of filtered reviews.
# NOTE(review): read_pickle can execute arbitrary code while loading; only
# open pickle files that were produced by a trusted source.
build_features = pd.read_csv("build_features.csv")
all_reviews = pd.read_pickle("filtered_Yelp_reviews.pkl")

# The keyword search below is a plain, case-sensitive substring test, so the
# review text is lowercased once up front.
all_reviews["text"] = all_reviews.text.str.lower()
    
# add spaces and remove "*"
# this step works in place of Regular Expressions
def _to_search_pattern(term):
    """Turn one dictionary entry into the substring searched for in a review.

    A leading space is always added. An entry containing "*" has every "*"
    removed and gets no trailing space; any other entry gets a trailing space
    as well.
    """
    pattern = " " + term
    if "*" in pattern:
        return pattern.replace("*", "")
    return pattern + " "


# Each column is rewritten as a whole instead of cell-by-cell through
# build_features[feature][i] = ..., which is chained indexing (it may write to
# a temporary copy). na_action="ignore" leaves missing cells untouched wherever
# they occur, so the columns no longer need their NaNs to be at the end.
# Note: running this cell again without reloading the CSV pads the terms again.
for feature in list(build_features.columns):
    build_features[feature] = build_features[feature].map(
        _to_search_pattern, na_action="ignore"
    )

In [170]:
# create a new dataframe for tagged reviews and their tags as columns
cols = ["business_name", "text", "stars", "categories"] + build_features.columns.tolist()

# Review metadata, renamed to the output column names. The table is built in
# one step instead of appending one single-row frame per review: appending in
# a loop copies the whole table on every iteration, and DataFrame.append no
# longer exists in pandas 2.x. reset_index(drop=True) gives the rows a fresh
# 0..n-1 index.
rel_reviews = (
    all_reviews[["name", "text", "stars_y", "categories"]]
    .rename(columns={"name": "business_name", "stars_y": "stars"})
    .reset_index(drop=True)
)

# One boolean column per tag: True when at least one (non-missing) term of
# that tag occurs as a substring of the review text.
for feature in build_features.columns:
    terms = build_features[feature].dropna().tolist()
    rel_reviews[feature] = [
        any(term in text for term in terms) for text in rel_reviews["text"]
    ]

# Fix the column order: metadata first, then the tags in dictionary order.
rel_reviews = rel_reviews[cols]

rel_reviews.to_pickle("./feature_extracted_reviews.pkl")
rel_reviews.to_csv("./feature_extracted_reviews.csv")

In [None]:
# Render the tagged-review table: the four metadata columns followed by one
# boolean column per tag.
display(rel_reviews)

In [156]:
# print the counts of each category
# print(...) with a single argument gives the same output on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
for feature in build_features.columns:
    print(feature)
    print(rel_reviews[feature].value_counts())

light
False    7981
True      838
Name: light, dtype: int64
materials
False    5079
True     3740
Name: materials, dtype: int64
icons
False    8566
True      253
Name: icons, dtype: int64
temp
False    7643
True     1176
Name: temp, dtype: int64
ambiance
False    6990
True     1829
Name: ambiance, dtype: int64
noise
False    7091
True     1728
Name: noise, dtype: int64


In [None]:
# print all reviews of the selected category
# The "== True" comparison is kept on purpose: it yields a proper boolean mask
# even if the tag column is stored with object dtype.
# print(...) with a single argument works on both Python 2 and Python 3.
for line in rel_reviews.text[rel_reviews.icons == True]:
    print(line)
    print("--------")

In [None]:
# Full rows (text, metadata and tag columns) of every review tagged "icons".
icons_mask = rel_reviews["icons"] == True
rel_reviews.loc[icons_mask]

In [200]:
# print which of the keywords in the dictionary appeared in a selected review
# The review is looked up once, by row label 46 and by column name; "text" is
# the column that sits at position 1 of the table, which the previous
# positional lookup relied on.
selected_text = rel_reviews.loc[46, "text"]

# Report every (non-missing) term of the "temp" tag found in that review.
for keyword in build_features.temp.dropna():
    if keyword in selected_text:
        print(keyword)

 smell


In [None]:
# print all reviews that include a term from the stress dictionary
stress_terms = pd.read_csv("./stress.csv")

# NOTE(review): iterating over a DataFrame yields its column labels, so the
# terms tested here are the header entries of stress.csv, not the values in
# its cells. Confirm that the file really stores its terms in the header row.
stress = [
    any(term in line for term in stress_terms)
    for line in all_reviews.text
]

# One row per review, in review order, with the flag in column 0.
stress_df = pd.DataFrame(stress)

# stress_df carries a fresh 0..n-1 index that need not match the index of
# all_reviews, so the flags are applied by position (.values) rather than by
# label alignment.
for line in all_reviews.text[stress_df[0].values]:
    print(line)
    print("--------")

In [186]:
# How many reviews are flagged by the stress dictionary and also tagged "light".
stress_and_light = stress_df[0].eq(True) & rel_reviews.light.eq(True)
stress_and_light.value_counts()

False    8812
True        7
dtype: int64

In [187]:
# Count the reviews whose text contains " communit"
# (the substring form of the pattern '* communit*').
has_communit = rel_reviews["text"].str.contains(" communit")
has_communit.value_counts()

False    8545
True      274
Name: text, dtype: int64

In [188]:
# Count the reviews whose text contains " creativ"
# (the substring form of the pattern '* creativ*').
has_creativ = rel_reviews["text"].str.contains(" creativ")
has_creativ.value_counts()

False    8730
True       89
Name: text, dtype: int64

In [201]:
# Count the reviews whose text contains " clean"
# (the substring form of the pattern '* clean*').
has_clean = rel_reviews["text"].str.contains(" clean")
has_clean.value_counts()

False    8054
True      765
Name: text, dtype: int64

In [202]:
# Count the reviews whose text contains " dirty"
# (the substring form of the pattern '* dirty*').
has_dirty = rel_reviews["text"].str.contains(" dirty")
has_dirty.value_counts()

False    8693
True      126
Name: text, dtype: int64

In [203]:
# Count the reviews whose text contains " gross"
# (the substring form of the pattern '* gross*').
has_gross = rel_reviews["text"].str.contains(" gross")
has_gross.value_counts()

False    8769
True       50
Name: text, dtype: int64

In [215]:
# compare the frequency of the 'noise' category in Libraries vs other spaces
# na=False makes a review without a category string count as "not a library";
# without it the mask would hold NaN for such rows and could not be inverted
# with "~".
is_library = rel_reviews.categories.str.contains("Libraries", na=False)
# Explicit boolean mask for the tag, so the grouping of "&" and "==" no longer
# depends on operator precedence.
is_noisy = rel_reviews.noise == True

display((is_library & is_noisy).value_counts())
display((~is_library & is_noisy).value_counts())

False    7828
True      991
dtype: int64

False    8082
True      737
dtype: int64

In [205]:
# How many reviews belong to a business whose categories mention "Libraries".
library_mask = rel_reviews["categories"].str.contains("Libraries")
library_mask.value_counts()

False    5838
True     2981
Name: categories, dtype: int64

In [216]:
# relevance of 'noise' in Libraries vs other spaces, normalized
# The shares are computed from the table instead of re-typing counts read off
# earlier outputs (previously 991.0/2981 and 737.0/5838), so they stay correct
# when the data or the dictionary changes.
is_library = rel_reviews.categories.str.contains("Libraries", na=False)
is_noisy = rel_reviews.noise == True

# The mean of a boolean mask is the fraction of True values; float() turns the
# result into a plain Python float for display.
display(float(is_noisy[is_library].mean()))
display(float(is_noisy[~is_library].mean()))

0.33243877893324386

0.1262418636519356

In [218]:
# For the reviews whose text contains " clean", count how often each
# categories string occurs.
mentions_clean = rel_reviews["text"].str.contains(" clean")
rel_reviews.loc[mentions_clean, "categories"].value_counts()

Public Services & Government;Libraries                                                                                                                                                                                                    154
Libraries;Public Services & Government                                                                                                                                                                                                    135
Food;Internet Cafes;Comfort Food;Bakeries;Restaurants;Coffee & Tea;Juice Bars & Smoothies;Local Flavor;Cafes;Desserts;Breakfast & Brunch                                                                                                   45
Internet Cafes;Coffee & Tea;Shaved Ice;Burgers;Sandwiches;Breakfast & Brunch;Restaurants;Food;Juice Bars & Smoothies;Ice Cream & Frozen Yogurt                                                                                             27
Coffee & Tea;Bakeries;Sandwiches;Internet Cafes;