In [54]:
import pandas as pd
import json
import config as cf
from tqdm import tqdm
import random
from pandas.io.json import json_normalize
from nltk.corpus import stopwords
from collections import Counter
# from collections import OrderedDict
from nltk import word_tokenize
import math
from odict import odict
import os

In [None]:
def log(x):
    """Return the natural logarithm of ``x``, or 0 when ``math.log`` rejects it.

    ``math.log`` raises ``ValueError`` for zero and negative numbers; that
    error is swallowed and 0 is returned instead. Any other exception
    (for example ``TypeError`` for a non-numeric argument) still propagates.
    """
    try:
        result = math.log(x)
    except ValueError:
        # x is outside the domain of the logarithm.
        result = 0
    return result

In [None]:
def notEnglish(s):
    """Flag every string in ``s`` that contains at least one non-ASCII character.

    Returns a list of booleans, one per element of ``s`` and in the same
    order: ``True`` when the UTF-8 bytes of the string cannot be decoded as
    ASCII, ``False`` otherwise.
    """
    def _has_non_ascii(text):
        # A failed ASCII decode of the UTF-8 bytes means a non-ASCII character.
        try:
            text.encode(encoding='utf-8').decode('ascii')
        except UnicodeDecodeError:
            return True
        return False

    return [_has_non_ascii(t) for t in s]

In [None]:
# Load the first three chunks of review.json into `reviews`.
dataPath = cf.ROOT_PATH + cf.DATA_PATH
reviewPath = dataPath + 'review.json'
# Rows per chunk; an int because it is passed to read_json as chunksize.
max_records = int(1e5)
# With lines=True and a chunksize, read_json yields DataFrames lazily.
df = pd.read_json(reviewPath, lines=True, chunksize=max_records)
print('next step')
review_chunks = []
try:
    # zip() consults range() first, so iteration stops after three chunks
    # without parsing a fourth one just to throw it away.
    for i, df_chunk in tqdm(zip(range(3), df), total=3):
        review_chunks.append(df_chunk)
        print(i)
except ValueError:
    # Best effort: keep whatever chunks were parsed before the failure.
    print('\nSome messages in the file cannot be parsed')
# Concatenate once at the end. ignore_index guarantees unique row labels no
# matter how the individual chunks were indexed; the label-based drops later
# in the notebook rely on that.
reviews = pd.concat(review_chunks, ignore_index=True) if review_chunks else pd.DataFrame()

In [None]:
# Keep only reviews whose text is pure ASCII (notEnglish flags the others).
# The filter is positional: exactly the flagged rows are removed. A label-based
# drop() would also remove unflagged rows that share an index label with a
# flagged one, which can happen after concatenating chunks.
keep_ascii = [not flagged for flagged in notEnglish(reviews['text'])]
reviews = reviews.loc[keep_ascii]

In [None]:
# Load the first three chunks of user.json into `users`
# (chunk size: max_records rows, defined with the review loader).
df = pd.read_json(dataPath + 'user.json', lines=True, chunksize=max_records)
print('next step')
user_chunks = []
try:
    # zip() consults range() first, so iteration stops after three chunks
    # without parsing a fourth one just to throw it away.
    for i, df_chunk in tqdm(zip(range(3), df), total=3):
        user_chunks.append(df_chunk)
        print(i)
except ValueError:
    # Best effort: keep whatever chunks were parsed before the failure.
    print('\nSome messages in the file cannot be parsed')
# Concatenate once at the end; ignore_index guarantees unique row labels.
users = pd.concat(user_chunks, ignore_index=True) if user_chunks else pd.DataFrame()

In [None]:
# Attach each author's profile statistics to their reviews (default inner join
# on user_id). Column names present in both frames receive pandas' default
# suffixes: _x for the review side, _y for the user side.
user_stat_cols = ['user_id', 'cool', 'useful', 'funny', 'fans', 'review_count']
reviews = reviews.merge(users[user_stat_cols], on='user_id')

In [None]:
# user_score: sum of the log-counts of the author's useful/funny/cool votes and
# fans, divided by the author's review_count.
# NOTE(review): a review_count of 0 would produce NaN/inf here - confirm it
# cannot occur in the merged data.
user_log_total = (
    reviews['useful_y'].apply(log)
    + reviews['funny_y'].apply(log)
    + reviews['cool_y'].apply(log)
    + reviews['fans'].apply(log)
)
reviews['user_score'] = user_log_total / reviews['review_count']

In [None]:
# review_score: sum of the log-counts of the votes the review itself received
# (the _x columns come from the review side of the merge).
review_vote_cols = ['useful_x', 'cool_x', 'funny_x']
review_log_total = reviews[review_vote_cols[0]].apply(log)
for vote_col in review_vote_cols[1:]:
    review_log_total = review_log_total + reviews[vote_col].apply(log)
reviews['review_score'] = review_log_total

In [None]:
# Discard reviews for which both the user score and the review score are
# exactly zero.
no_signal = (reviews['user_score'] == 0) & (reviews['review_score'] == 0)
reviews = reviews.drop(reviews.loc[no_signal].index)

In [None]:
# Synthesise one score per aspect for every review: a value drawn uniformly
# between `stars` and `stars * 9/4 - 29/4`, scaled by the combined
# review/user score.
# random.uniform() is called once per row. Calling it with whole Series draws a
# single random number for the entire column, which would make every aspect a
# fixed linear function of `stars`.
# NOTE(review): no random seed is set, so results differ between runs.
aspects = ['food', 'price', 'service', 'ambience', 'misc']
score_weight = reviews['review_score'] + reviews['user_score']
for aspect in aspects:
    sampled = reviews['stars'].apply(
        lambda stars: random.uniform(stars, stars * 9/4 - 29/4)
    )
    reviews[aspect] = sampled * score_weight

In [None]:
# Remove the raw vote columns and intermediate scores now that the aspect
# columns have been derived from them.
drop_cols = ['date', 'useful_x', 'funny_x', 'cool_x', 'useful_y', 'cool_y', 'funny_y', 'user_id', 'fans', 'review_count', 'user_score', 'review_score']
# axis is passed by keyword: pandas 2.x no longer accepts it positionally.
reviews = reviews.drop(drop_cols, axis=1)

In [None]:
# For every state, join each business file found under
# business_consumer/<state> with the review columns and write the combined
# result to <state>_business.json.
states = ['AZ', 'NC', 'NV', 'IL', 'OH', 'PA', 'WI']
# Review columns carried over to the per-state files (loop-invariant).
leftcols = ['business_id', 'text', 'food', 'price', 'service', 'ambience', 'misc']
review_subset = reviews[leftcols]
for state in states:
    statePath = dataPath + 'business_consumer/' + state
    state_frames = []
    for subdir, _, files in tqdm(os.walk(statePath)):
        for file in files:
            if 'business-ids.json' not in file:
                # os.walk() lists file names relative to `subdir`, which is
                # not statePath once the walk descends into sub-directories.
                business = pd.read_json(os.path.join(subdir, file))
                state_frames.append(business.merge(review_subset, on='business_id'))
    if state_frames:
        # Write once per state with every file included. Writing inside the
        # loop would overwrite the output and keep only the last file.
        business = pd.concat(state_frames, ignore_index=True)
        business.to_json(dataPath + '%s_business.json' % state, orient='records')

In [None]:
# Replace each review text in the per-state files with a word -> count dict of
# its lower-cased, alphabetic, non-stopword tokens. The file is rewritten in place.
s = set(stopwords.words('english'))
for state in states:
    stateBusiness = dataPath + '%s_business.json' % state
    business = pd.read_json(stateBusiness)
    word_counts = []
    for txt in tqdm(business['text']):
        tokens = word_tokenize(txt.lower())
        kept = [w for w in tokens if w not in s and w.isalpha()]
        word_counts.append(dict(Counter(kept)))
    business['text'] = word_counts
    business.to_json(stateBusiness, orient='records')

In [3]:
# Load 'Tempe_business_processed.json' from the data directory into `business`.
# NOTE(review): no visible cell writes this file - presumably it is produced
# outside this part of the notebook; confirm.
dataPath = cf.ROOT_PATH + cf.DATA_PATH
file = dataPath + 'Tempe_business_processed.json'
business = pd.read_json(file)

In [19]:
# Columns used as the group-by key when `dropped` is aggregated.
commonCols = ['address', 'business_id', 'city', 'latitude', 'longitude', 'name', 'postal_code', 'review_count', 'stars_x', 'state']
# Columns selected from `business` into `sliced`.
dictCols = ['business_id', 'text', 'hours', 'categories', 'stars_y']
# Columns removed from `business` to form `dropped`.
dropCols = ['attributes', 'is_open', 'text', 'hours', 'categories', 'stars_y']

In [20]:
# Keep only the dictCols columns of `business`.
sliced = business[dictCols]

In [21]:
# Everything in `business` except the dropCols columns.
# axis is passed by keyword: pandas 2.x no longer accepts it positionally.
dropped = business.drop(dropCols, axis=1)

In [22]:
# Move the review text (with its business id and star rating) into its own
# frame and remove the text column from `sliced`.
text = sliced[['business_id', 'text', 'stars_y']]
# axis is passed by keyword: pandas 2.x no longer accepts it positionally.
sliced = sliced.drop('text', axis=1)

In [23]:
# Collapse `dropped` to one row per distinct commonCols combination, summing
# every remaining column, then turn the group keys back into ordinary columns.
dropped = dropped.groupby(commonCols).sum().reset_index()

In [24]:
# Split reviews by rating: 4 stars and above are "good", 2 and below are "bad".
is_good = text['stars_y'] >= 4.0
is_bad = text['stars_y'] <= 2.0
goodText = text.loc[is_good]
badText = text.loc[is_bad]

In [25]:
# Find businesses that only have mid-range reviews (strictly between 2 and 4
# stars), i.e. ids that appear in `between` but in neither goodText nor badText.
allId = text['business_id']
badId = badText['business_id']
goodId = goodText['business_id']
between = text[(text['stars_y'] < 4.0) & (text['stars_y'] > 2.0)]
beId = between['business_id']
# pd.concat replaces Series.append, which was removed in pandas 2.0.
ratedIds = pd.concat([goodId, badId])
dropIds = beId[~beId.isin(ratedIds)]

In [26]:
# Remove the businesses listed in dropIds from every working frame.
def _without_drop_ids(frame):
    """Return the rows of ``frame`` whose business_id is not in dropIds."""
    return frame[~frame['business_id'].isin(dropIds)]

dropped = _without_drop_ids(dropped)
text = _without_drop_ids(text)
sliced = _without_drop_ids(sliced)

In [27]:
# calculate stars
stars = text.groupby(['business_id', 'stars_y'])['stars_y'].size().unstack(fill_value=0)
stars = stars.reset_index()
stars[[1,2,3,4,5]] = stars[[1,2,3,4,5]].divide(stars.sum(1), 0) * 100

In [28]:
def counter(df):
    """Merge the word-count mappings stored in ``df['text']`` into one dict.

    Each element of ``df['text']`` must be something ``dict()`` accepts (a
    mapping or an iterable of key/value pairs) whose values are counts. The
    result maps every key to the sum of its counts; keys whose total is not
    positive are left out.

    Counts are accumulated in place. Summing Counter objects with ``sum()``
    rebuilds the running Counter on every addition, which is quadratic in the
    number of rows.
    """
    totals = Counter()
    for word_counts in df['text']:
        totals.update(dict(word_counts))
    # Counter addition keeps only positive totals; mirror that here.
    return {word: total for word, total in totals.items() if total > 0}

In [73]:
def orderDict(x, top_n=20):
    """Return the ``top_n`` highest-valued entries of mapping ``x`` as a dict.

    Parameters
    ----------
    x : mapping
        Keys mapped to comparable values (e.g. word -> count).
    top_n : int, optional
        Maximum number of entries to keep; must be non-negative. Defaults to 20.

    Returns
    -------
    dict
        At most ``top_n`` items of ``x``, inserted in descending order of
        value. Entries with equal values keep their original relative order
        because the sort is stable.
    """
    ranked = sorted(x.items(), key=lambda t: t[1], reverse=True)
    return dict(ranked[:top_n])

In [29]:
# For the good and the bad reviews separately: merge the word counts of each
# business with counter(), then keep only its top words via orderDict().
# After reset_index() the merged dicts live in the column labelled 0.
def _top_words_by_business(review_subset):
    """Return one row per business_id with its top-word dict in column 0."""
    per_business = review_subset.groupby('business_id').apply(counter).reset_index()
    per_business[0] = per_business[0].apply(orderDict)
    return per_business

goodText = _top_words_by_business(goodText)
badText = _top_words_by_business(badText)

In [30]:
# Reduce `sliced` to one row per business (last entry of each column within
# the group) and remove the per-review star rating.
sliced = sliced.groupby('business_id').last().reset_index()
# axis is passed by keyword: pandas 2.x no longer accepts it positionally.
sliced = sliced.drop('stars_y', axis=1)

In [31]:
# put everything together
temp = pd.concat([sliced, goodText])
temp = temp.groupby('business_id').last().reset_index()
temp = temp.rename(index=str, columns={0: "good words"})
temp = pd.concat([temp, badText])
temp = temp.groupby('business_id').last().reset_index()
temp = temp.rename(index=str, columns={0: "bad words"})
temp = pd.concat([temp, stars]).groupby('business_id').last().reset_index()
temp = pd.concat([dropped, temp]).groupby('business_id').last().reset_index()

In [32]:
# Write the combined table to test.json in the data directory, one JSON object per row.
temp.to_json(dataPath + 'test.json', orient = 'records')

In [33]:
# Display the combined per-business table.
temp

Unnamed: 0,business_id,address,ambience,bad words,categories,city,food,good words,hours,latitude,...,price,review_count,service,stars_x,state,1,2,3,4,5
0,-0Sgh0QlUKVsWosCWJzGqQ,"681 E Apache Blvd, Ste 104",-22.759590,"{'boyfriend': 1, 'always': 2, 'liked': 1, 'dom...","[Fast Food, American (New), Restaurants, Pizza]",Tempe,16.590120,"{'decent': 2, 'changes': 1, 'seem': 1, 'happen...","{'Monday': '10:00-0:00', 'Tuesday': '10:00-0:0...",33.414346,...,-38.708960,42.0,-19.968329,2.0,AZ,65.000000,15.000000,5.000000,5.000000,10.000000
1,-4g68Hwm892_KPUuW5g1_Q,840 E Southern,-10.992440,"{'heard': 2, 'people': 2, 'really': 7, 'rave':...","[Food Delivery Services, Fast Food, Restaurant...",Tempe,29.436708,"{'awesome': 1, 'always': 7, 'deliver': 1, 'foo...","{'Monday': '10:00-22:00', 'Tuesday': '10:00-22...",33.393322,...,-27.379331,37.0,-8.124609,2.5,AZ,22.222222,14.814815,14.814815,29.629630,18.518519
2,-8QlV3b_9H4BAh6LgMIr1g,"1845 E Broadway Rd, Ste 101",179.965777,"{'place': 22, 'may': 1, 'night': 1, 'service':...","[Restaurants, Indian, Vegetarian, Buffets]",Tempe,355.249147,"{'buffet': 96, 'great': 77, 'deal': 7, 'mango'...","{'Monday': '17:00-22:00', 'Tuesday': '17:00-22...",33.406601,...,108.919278,406.0,192.399458,4.5,AZ,9.793814,3.608247,11.855670,22.680412,52.061856
3,-Gy0BAMgRN4sGlY7theqxQ,"2020 E Elliot Rd, Suite 101",144.855146,"{'looked': 1, 'forward': 1, 'trying': 1, 'new'...","[Pizza, Fast Food, Restaurants]",Tempe,215.468020,"{'finally': 3, 'broke': 1, 'check': 2, 'mod': ...","{'Monday': '10:30-22:00', 'Tuesday': '10:30-22...",33.349310,...,116.234077,77.0,149.864052,4.5,AZ,0.000000,3.773585,18.867925,24.528302,52.830189
4,-IZvuqxekWEvJqDw308daQ,"1340 E Broadway Rd, Ste 105",59.773122,"{'hard': 2, 'deep': 2, 'fry': 1, 'frozen': 1, ...","[Mexican, Breakfast & Brunch, Fast Food, Fast ...",Tempe,195.678372,"{'easy': 2, 'drive': 2, 'house': 1, 'quick': 5...","{'Monday': '6:00-0:00', 'Tuesday': '6:00-0:00'...",33.408298,...,4.687507,163.0,69.413526,3.5,AZ,15.315315,9.009009,15.315315,28.828829,31.531532
5,-cBQKodqi77Q0vk-9iDvQA,680 S Mill Ave,20.275405,"{'stop': 1, 'friends': 7, 'awesome': 1, 'pre':...","[Restaurants, American (Traditional), Bars, Ni...",Tempe,217.364427,"{'mill': 23, 'ave': 7, 'bar': 30, 'deep': 1, '...","{'Monday': '11:00-2:00', 'Tuesday': '11:00-2:0...",33.423812,...,-59.609441,170.0,34.255864,3.5,AZ,16.239316,12.820513,23.076923,26.495726,21.367521
6,-gcS8S7aiHTwYROMC5QP5A,1401 West Southern Ave,26.437185,"{'restaurant': 4, 'really': 3, 'nice': 2, 'lot...","[Restaurants, Delis, Fast Food, Pizza, America...",Tempe,64.745252,"{'definitely': 3, 'fan': 2, 'schlotzsky': 6, '...","{'Monday': '6:00-21:00', 'Tuesday': '6:00-21:0...",33.392400,...,10.910019,53.0,29.154558,3.0,AZ,16.216216,24.324324,16.216216,29.729730,13.513514
7,-s_U95xEEmJu5GK-srqpUA,"1801 E Baseline Rd, Ste 104",126.250577,"{'probably': 3, 'got': 10, 'worst': 5, 'cook':...","[Buffets, Indian, Restaurants]",Tempe,240.966379,"{'great': 36, 'selection': 5, 'variety': 20, '...","{'Monday': '17:00-21:30', 'Tuesday': '17:00-22...",33.377403,...,79.753547,240.0,134.387913,3.5,AZ,14.705882,7.352941,11.764706,30.147059,36.029412
8,-tcJmqzfaeEnpFMAelB7bA,"414 S Mill Ave, Ste 117",125.393061,"{'main': 1, 'reasons': 1, 'order': 9, 'often':...","[Mediterranean, Turkish, Restaurants]",Tempe,225.131746,"{'great': 35, 'lunch': 12, 'spot': 6, 'mill': ...","{'Monday': '9:00-22:00', 'Tuesday': '11:00-22:...",33.425965,...,84.966611,156.0,132.467999,3.5,AZ,5.940594,9.900990,14.851485,37.623762,31.683168
9,-wUKgQXAMC4H5hiBOgpT6g,1415 West Elliot Road,3.863981,"{'location': 3, 'needs': 1, 'little': 2, 'help...","[Tex-Mex, Fast Food, Restaurants, Mexican]",Tempe,33.441012,"{'taco': 5, 'bell': 4, 'never': 1, 'given': 1,...","{'Monday': '7:00-22:00', 'Tuesday': '7:00-22:0...",33.349062,...,-8.124289,20.0,5.962021,3.0,AZ,31.250000,18.750000,25.000000,12.500000,12.500000


In [None]:
# NOTE(review): `testBad` is not defined in any visible cell, and `test` is only
# assigned further down as a plain dict. This looks like leftover scratch work
# that fails on a fresh kernel - confirm and remove.
a = pd.concat([test, testBad])

In [76]:
# NOTE(review): repeats the orderDict() step already applied to goodText[0]
# when goodText was aggregated; looks like a leftover experiment.
goodText[0] = goodText[0].apply(orderDict)

In [63]:
# Scratch: sort the word counts stored at goodText[0][0] by count, descending,
# into an odict (the same sort orderDict() performs).
a = odict(sorted(goodText[0][0].items(), key=lambda t: t[1], reverse=True))

In [71]:
# Scratch: copy the first 20 items of `a` into `test`
# (the same truncation orderDict() performs).
test = {}
i = 0
for key, value in a.items():
    # Stop once 20 items (i = 0..19) have been copied.
    if i > 19:
        break
    test[key] = value
    i += 1

In [77]:
# Inspect the top-word dict stored in column 0 of goodText at index label 10.
goodText[0][10]

{'asada': 5,
 'breakfast': 8,
 'burrito': 12,
 'burritos': 5,
 'carne': 5,
 'cheese': 4,
 'chorizo': 4,
 'food': 9,
 'fresh': 8,
 'friendly': 5,
 'go': 7,
 'good': 5,
 'great': 8,
 'green': 6,
 'hot': 6,
 'made': 4,
 'one': 9,
 'options': 4,
 'place': 10,
 'try': 5}