In [2]:
import pandas as pd
import json
import config as cf
from tqdm import tqdm
import random
from pandas.io.json import json_normalize
from nltk.corpus import stopwords
from collections import Counter
# from collections import OrderedDict
from nltk import word_tokenize
import math
from odict import odict
import os

In [None]:
def log(x):
    """Return the natural logarithm of ``x``, or 0 when ``math.log`` rejects it.

    ``math.log`` raises ``ValueError`` for zero and negative numbers; those
    inputs yield 0 here instead of propagating the error. Any other exception
    (for example ``TypeError`` on non-numeric input) is not caught.
    """
    try:
        result = math.log(x)
    except ValueError:
        result = 0
    return result

In [None]:
def notEnglish(s):
    """Flag every item of ``s`` that contains at least one non-ASCII character.

    Each item is encoded to UTF-8 and decoded as ASCII; a failing decode marks
    the item ``True``. Returns a list of booleans in the same order as ``s``.
    """
    def _has_non_ascii(t):
        # Only the ASCII decode failure is treated as "not English"; any other
        # error (e.g. a non-string item) propagates to the caller.
        try:
            t.encode(encoding='utf-8').decode('ascii')
        except UnicodeDecodeError:
            return True
        return False

    return [_has_non_ascii(t) for t in s]

In [None]:
# Load the first few chunks of the review dump (one JSON object per line).
dataPath = cf.ROOT_PATH + cf.DATA_PATH
reviewPath = dataPath + 'review.json'
max_records = 100000  # rows per chunk; an int because chunksize is a row count
max_chunks = 3        # the original loop kept chunks 0, 1 and 2
df = pd.read_json(reviewPath, lines=True, chunksize=max_records)
print('next step')
review_chunks = []
try:
    for df_chunk in tqdm(df):
        review_chunks.append(df_chunk)
        # Stop right after the last wanted chunk so no extra chunk is parsed
        # just to be thrown away.
        if len(review_chunks) >= max_chunks:
            break
except ValueError:
    # Best effort: keep whatever chunks were parsed before the bad record.
    print ('\nSome messages in the file cannot be parsed')
# Concatenate once instead of growing the frame inside the loop.
reviews = pd.concat(review_chunks) if review_chunks else pd.DataFrame()

In [None]:
# Remove every review whose text contains non-ASCII characters.
non_english_mask = notEnglish(reviews['text'])
non_english_index = reviews[non_english_mask].index
reviews = reviews.drop(non_english_index)

In [None]:
# Load the first few chunks of the user dump (one JSON object per line).
max_user_chunks = 3  # the original loop kept chunks 0, 1 and 2
df = pd.read_json(dataPath + 'user.json', lines=True, chunksize=int(max_records))
print('next step')
user_chunks = []
try:
    for df_chunk in tqdm(df):
        user_chunks.append(df_chunk)
        # Stop right after the last wanted chunk so no extra chunk is parsed
        # just to be thrown away.
        if len(user_chunks) >= max_user_chunks:
            break
except ValueError:
    # Best effort: keep whatever chunks were parsed before the bad record.
    print ('\nSome messages in the file cannot be parsed')
# Concatenate once instead of growing the frame inside the loop.
users = pd.concat(user_chunks) if user_chunks else pd.DataFrame()

In [None]:
# Attach each reviewer's profile statistics to their reviews. Columns present
# on both sides (useful / funny / cool) come back with pandas' default
# `_x` (review) and `_y` (user) suffixes, which later cells rely on.
user_cols = ['user_id', 'cool', 'useful', 'funny', 'fans', 'review_count']
reviews = reviews.merge(users[user_cols], on='user_id')

In [None]:
# Reviewer score: summed logs of the user's vote counts and fan count,
# divided by the number of reviews the user has written.
user_log_total = (
    reviews['useful_y'].apply(log)
    + reviews['funny_y'].apply(log)
    + reviews['cool_y'].apply(log)
    + reviews['fans'].apply(log)
)
reviews['user_score'] = user_log_total / reviews['review_count']

In [None]:
# Review score: summed logs of the votes the individual review received.
review_log_total = (
    reviews['useful_x'].apply(log)
    + reviews['cool_x'].apply(log)
    + reviews['funny_x'].apply(log)
)
reviews['review_score'] = review_log_total

In [None]:
# Discard reviews for which both the user score and the review score are 0.
no_signal = (reviews['user_score'] == 0) & (reviews['review_score'] == 0)
no_signal_index = reviews[no_signal].index
reviews = reviews.drop(no_signal_index)

In [None]:
# Synthesize one score per aspect for every review: a uniform draw between
# `stars` and `stars * 9/4 - 29/4`, weighted by the combined scores.
#
# `random.uniform(a, b)` evaluates `a + (b - a) * random.random()` once, so
# calling it with whole Series applied a single shared draw to every row.
# Here a separate draw is made for each row instead.
# NOTE(review): `random` is not seeded in the visible cells, so the generated
# values differ between runs.
aspects = ['food', 'price', 'service', 'ambience', 'misc']
aspect_low = reviews['stars']
aspect_high = reviews['stars'] * 9/4 - 29/4
aspect_weight = reviews['review_score'] + reviews['user_score']
for aspect in aspects:
    draws = pd.Series([random.random() for _ in range(len(reviews))], index=reviews.index)
    reviews[aspect] = (aspect_low + (aspect_high - aspect_low) * draws) * aspect_weight

In [None]:
# Remove the raw vote/user columns and the intermediate scores once the
# aspect columns have been derived from them.
drop_cols = ['date', 'useful_x', 'funny_x', 'cool_x', 'useful_y', 'cool_y', 'funny_y', 'user_id', 'fans', 'review_count', 'user_score', 'review_score']
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
reviews = reviews.drop(columns=drop_cols)

In [None]:
# For every state, join the business files found under
# `business_consumer/<state>` with the scored reviews and write one
# `<state>_business.json` file.
states = ['AZ', 'NC', 'NV', 'IL', 'OH', 'PA', 'WI']
leftcols = ['business_id', 'text', 'food', 'price', 'service', 'ambience', 'misc']
for state in states:
    statePath = dataPath + 'business_consumer/' + state
    state_frames = []
    for subdir, _, files in tqdm(os.walk(statePath)):
        for file in files:
            if 'business-ids.json' not in file:
                # os.walk descends into sub-directories, so the file lives in
                # `subdir`, which is not necessarily `statePath` itself.
                business = pd.read_json(os.path.join(subdir, file))
                state_frames.append(business.merge(reviews[leftcols], on='business_id'))
    # Write once per state; writing inside the file loop made each file
    # overwrite the output of the previous one.
    if state_frames:
        business = pd.concat(state_frames, ignore_index=True)
        business.to_json(dataPath + '%s_business.json' % state, orient= 'records')

In [None]:
# Replace each review text by a {word: count} dict of its lower-cased,
# alphabetic, non-stopword tokens, then rewrite the state file in place.
s = set(stopwords.words('english'))
for state in states:
    stateBusiness = dataPath + '%s_business.json' % state
    business = pd.read_json(stateBusiness)
    word_counts = []
    for txt in tqdm(business['text']):
        tokens = word_tokenize(txt.lower())
        kept = [w for w in tokens if w not in s and w.isalpha()]
        word_counts.append(dict(Counter(kept)))
    business['text'] = word_counts
    business.to_json(stateBusiness, orient= 'records')

In [3]:
# Read the pre-processed Tempe business/review table from the data directory.
dataPath = cf.ROOT_PATH + cf.DATA_PATH
file = dataPath + 'Tempe_business_processed.json'
business = pd.read_json(path_or_buf=file)

In [4]:
# Columns used as the grouping key when the business table is aggregated.
commonCols = [
    'address', 'business_id', 'city', 'latitude', 'longitude',
    'name', 'postal_code', 'review_count', 'stars_x', 'state',
]
# Columns sliced out of `business` for the per-review processing.
dictCols = [
    'business_id', 'text', 'hours', 'categories', 'stars_y',
]
# Columns removed from `business` before it is aggregated.
dropCols = [
    'attributes', 'is_open', 'text', 'hours', 'categories', 'stars_y',
]

In [5]:
# Keep only the columns listed in dictCols.
sliced = business.loc[:, dictCols]

In [6]:
# Business table without the columns listed in dropCols.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
dropped = business.drop(columns=dropCols)

In [7]:
# Split the word-count column (with its star rating) off into `text`, and
# drop it from `sliced`.
text = sliced[['business_id', 'text', 'stars_y']]
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
sliced = sliced.drop(columns='text')

In [8]:
# Collapse rows sharing the same commonCols values by summing the remaining
# columns, then turn the group keys back into ordinary columns.
dropped = (
    dropped
    .groupby(commonCols)
    .sum()
    .reset_index()
)

In [9]:
# Reviews rated 4 stars or more are "good"; 2 stars or fewer are "bad".
is_good = text['stars_y'] >= 4.0
is_bad = text['stars_y'] <= 2.0
goodText = text[is_good]
badText = text[is_bad]

In [10]:
# Find businesses that only have mid-range reviews (strictly between 2 and 4
# stars): their ids appear in `between` but in neither goodText nor badText.
allId = text['business_id']
badId = badText['business_id']
goodId = goodText['business_id']
between = text[(text['stars_y'] < 4.0) & (text['stars_y'] > 2.0)]
beId = between['business_id']
# pd.concat replaces Series.append, which pandas 2.0 removed.
dropIds = beId[~beId.isin(pd.concat([goodId, badId]))]

In [11]:
# Remove the businesses listed in dropIds from all three working tables.
def _without_drop_ids(frame):
    """Return the rows of ``frame`` whose business_id is not in dropIds."""
    keep = ~frame['business_id'].isin(dropIds)
    return frame[keep]

dropped = _without_drop_ids(dropped)
text = _without_drop_ids(text)
sliced = _without_drop_ids(sliced)

In [12]:
# calculate stars
stars = text.groupby(['business_id', 'stars_y'])['stars_y'].size().unstack(fill_value=0)
stars = stars.reset_index()
stars[[1,2,3,4,5]] = stars[[1,2,3,4,5]].divide(stars.sum(1), 0) * 100

In [13]:
def counter(df):
    """Add up the word-count mappings found in ``df['text']``.

    Every entry of ``df['text']`` is converted with ``dict(...)`` and wrapped
    in a ``Counter``; the counters are combined with ``+`` and the total is
    returned as a plain ``dict``.
    """
    total = Counter()
    for entry in df['text']:
        total = total + Counter(dict(entry))
    return dict(total)

In [14]:
def orderDict(x, top_n=20):
    """Return the ``top_n`` highest-valued entries of ``x`` as a dict.

    Parameters
    ----------
    x : mapping
        Mapping whose values can be compared with each other (word -> count
        in this notebook).
    top_n : int, optional
        Maximum number of entries to keep; expected to be non-negative.
        Defaults to 20.

    Returns
    -------
    dict
        Entries ordered by descending value. ``sorted`` is stable, so entries
        with equal values keep the iteration order they had in ``x``.
    """
    ranked = sorted(x.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:top_n])

In [15]:
def _top_words_per_business(frame):
    """Merge the word counts of each business and trim them with orderDict."""
    merged = frame.groupby('business_id').apply(counter).reset_index()
    # After reset_index the merged counts sit in the column labelled 0.
    merged[0] = merged[0].apply(orderDict)
    return merged

goodText = _top_words_per_business(goodText)
badText = _top_words_per_business(badText)

In [16]:
# Keep one row per business (the last one of each group), then discard the
# per-review star rating.
sliced = sliced.groupby('business_id').last().reset_index()
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
sliced = sliced.drop(columns='stars_y')

In [17]:
# put everything together
temp = pd.concat([sliced, goodText])
temp = temp.groupby('business_id').last().reset_index()
temp = temp.rename(index=str, columns={0: "good words"})
temp = pd.concat([temp, badText])
temp = temp.groupby('business_id').last().reset_index()
temp = temp.rename(index=str, columns={0: "bad words"})
temp = pd.concat([temp, stars]).groupby('business_id').last().reset_index()
temp = pd.concat([dropped, temp]).groupby('business_id').last().reset_index()

In [18]:
# Persist the assembled table as a JSON array of row records.
outputPath = dataPath + 'test.json'
temp.to_json(outputPath, orient='records')

In [None]:
# Preview the first rows only, rather than dumping the whole table into the
# cell output.
temp.head()

In [None]:
# NOTE(review): `test` is only assigned in a later cell and `testBad` is not
# assigned anywhere in the visible notebook, so this cell presumably fails
# with a NameError on a fresh kernel — confirm whether it is leftover scratch
# work.
a = pd.concat([test, testBad])

In [None]:
# Trim the per-business word counts in column 0 with orderDict.
trimmed_good = goodText[0].map(orderDict)
goodText[0] = trimmed_good

In [None]:
# Word counts of the first goodText row, ordered from most to least frequent.
first_business_counts = goodText[0][0]
by_count_desc = sorted(first_business_counts.items(), key=lambda pair: pair[1], reverse=True)
a = odict(by_count_desc)

In [None]:
# Copy the first 20 entries of `a`, in iteration order, into a plain dict.
test = {}
i = 0
for key in a:
    if i >= 20:
        break
    test[key] = a[key]
    i += 1

In [None]:
# Inspect the word counts stored in column 0 for the row labelled 10.
goodText.loc[10, 0]