In [97]:
import os
import numpy as np
import pandas as pd
import json
import re
import vsmlib
import dask.bag as db
import kmedoids
from IPython.display import display
from nltk.tag.stanford import StanfordPOSTagger as posTagger

In [3]:
os.environ['STANFORD_MODELS'] = 'stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger'
os.environ['JAVAHOME'] = 'C:\Program Files (x86)\Common Files\Oracle\Java\javapath'

In [4]:
st = posTagger('english-bidirectional-distsim.tagger', path_to_jar='stanford-postagger-2018-02-27/stanford-postagger.jar')

In [5]:
model = vsmlib.model.load_from_dir('word_linear_cbow_50d')
model.normalize()

In [6]:
reviews = db.read_text('yelp-dataset/yelp_academic_dataset_review.json').map(json.loads)
businessData = db.read_text('yelp-dataset/yelp_academic_dataset_business.json').map(json.loads)

In [22]:
reviews.count().compute()

5996996

In [23]:
businessData.count().compute()

188593

In [7]:
data = reviews.take(100)

In [8]:
dataFrame = pd.DataFrame(list(data))
dataFrame.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25,0,x7mDIiDB3jEiPGPHOmDzyw,2,The pizza was okay. Not the best I've had. I p...,0,msQe1u7Z_XuqjGoqhB0J5g
1,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13,0,dDl8zu1vWPdKGihJrwQbpw,5,I love this place! My fiance And I go here atl...,0,msQe1u7Z_XuqjGoqhB0J5g
2,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23,1,LZp4UX5zK3e-c5ZGSeo3kA,1,Terrible. Dry corn bread. Rib tips were all fa...,3,msQe1u7Z_XuqjGoqhB0J5g
3,elqbBhBfElMNSrjFqW3now,0,2011-02-25,0,Er4NBWCmCD4nM8_p1GRdow,2,Back in 2005-2007 this place was my FAVORITE t...,2,msQe1u7Z_XuqjGoqhB0J5g
4,Ums3gaP2qM3W1XcA5r6SsQ,0,2014-09-05,0,jsDu6QEJHbwP2Blom1PLCA,5,Delicious healthy food. The steak is amazing. ...,0,msQe1u7Z_XuqjGoqhB0J5g


In [9]:
groupedData = dataFrame.groupby('business_id')

In [10]:
reviewDict = dict()
i = 0
for key, group in groupedData:    
    tagged = st.tag(group['text'])    
    print('iteration: {}, business_id: {}'.format(i, key))
    nouns = [noun for noun, pos in tagged if 'NN' in pos]    
    matrix = []
    for noun in nouns:                
        noun = re.search('\w+', noun)
        if noun: 
            noun = noun.group()        
        if model.has_word(noun):
            matrix.append(model.get_row(noun))            
    reviewDict[key] = matrix    
    i += 1

iteration: 0, business_id: -YR7K3rw6VAQ1-MjslvsoQ
iteration: 1, business_id: -bd4BQcl1ekgo7avaFngIw
iteration: 2, business_id: -ed0Yc9on37RoIoG2ZgxBA
iteration: 3, business_id: 00liP5s4IKsq97EH4Cc0Tw
iteration: 4, business_id: 0bjFYstj8USMzEV4ZQldjA
iteration: 5, business_id: 1VKjDpPJdVoiRc8I9f7U9g
iteration: 6, business_id: 3I23nQZ6k6U2G1NKpTpfhA
iteration: 7, business_id: 3Mx4renubPRnjHUw1n2UkA
iteration: 8, business_id: 6D4L8YpkhAh_YwaLmhoMNg
iteration: 9, business_id: 6qDRqHWqf0EeHupSUEBfKg
iteration: 10, business_id: 6tSvz_21BMo3a4GaItwa0g
iteration: 11, business_id: 7AlULGZI1pHt0imODsqdkg
iteration: 12, business_id: 7m1Oa1VYV98UUuo_6i0EZg
iteration: 13, business_id: 7vxm0G4g857HzRqqpeu6bw
iteration: 14, business_id: 7xA6iSP0Ndn08tpBFQtUKA
iteration: 15, business_id: 8-su-O_330PebTOp60RILQ
iteration: 16, business_id: 9E1q2uEMd881wnruicNTUA
iteration: 17, business_id: A2pHOVp0zh7grb1bbZxoBQ
iteration: 18, business_id: AtbXj3Rg1GF6Dj5zu6KCDg
iteration: 19, business_id: AxeQEz3-s9_1T

In [11]:
def lxjOp(A, B):    
    maxProducts = []
    for a in A:
        innerProducts = []
        for b in B:            
            innerProducts.append(np.inner(a, b) / np.linalg.norm(a) * np.linalg.norm(b))
        maxProducts.append(np.max(innerProducts))   
    
    return np.mean(np.array(maxProducts))

In [12]:
def lxjDist(A, B):
    A = np.array(A)
    B = np.array(B)
    return (lxjOp(A, B) + lxjOp(B, A)) / 2

In [13]:
lxjDist(reviewDict.get('elqbBhBfElMNSrjFqW3now'), reviewDict.get('zdE82PiD6wquvjYLyhOJNA'))

0.727904736995697

In [14]:
dim = len(reviewDict.values())
distanceMatrix = np.zeros((dim, dim))

In [15]:
i=0
for value1 in reviewDict.values():    
    j = 0
    for value2 in reviewDict.values():
        if i < j:
            distanceMatrix[i, j] = lxjDist(value1, value2)
            distanceMatrix[j, i] = distanceMatrix[i, j]
        elif i == j:
            distanceMatrix[i, j] = 0
        j = j+1
    i = i+1

In [16]:
businessDf = pd.DataFrame()
i = 0
for businessId in reviewDict.keys():    
    print('iteration: {}'.format(i))
    def filterFunc(x):
        return x['business_id'] == businessId
    businessDf = businessDf.append(list(businessData.filter(filterFunc)))
    i += 1

iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
iteration: 30
iteration: 31
iteration: 32
iteration: 33
iteration: 34
iteration: 35
iteration: 36
iteration: 37
iteration: 38
iteration: 39
iteration: 40
iteration: 41
iteration: 42
iteration: 43
iteration: 44
iteration: 45
iteration: 46
iteration: 47
iteration: 48
iteration: 49
iteration: 50
iteration: 51
iteration: 52
iteration: 53
iteration: 54
iteration: 55
iteration: 56
iteration: 57
iteration: 58
iteration: 59
iteration: 60
iteration: 61
iteration: 62
iteration: 63
iteration: 64
iteration: 65
iteration: 66
iteration: 67
iteration: 68
iteration: 69
iteration: 70
iteration: 71
it

In [17]:
businessDf

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,13637 N Tatum Blvd,"{'Alcohol': 'beer_and_wine', 'Ambience': '{'ro...",-YR7K3rw6VAQ1-MjslvsoQ,"Nightlife, Bars, Sports Bars, Restaurants, Sus...",Phoenix,"{'Monday': '10:30-20:30', 'Tuesday': '10:30-20...",1,33.610821,-111.977497,Eastwind Sushi & Grill,,85032,157,3.0,AZ
0,6990 E Shea Blvd,"{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",-bd4BQcl1ekgo7avaFngIw,"Breakfast & Brunch, Diners, Restaurants",Scottsdale,"{'Monday': '6:30-15:0', 'Tuesday': '6:30-15:0'...",1,33.582508,-111.931220,Eggstasy,,85254,610,3.5,AZ
0,523 Fremont St,"{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",-ed0Yc9on37RoIoG2ZgxBA,"Beer Gardens, Nightlife, Thai, Restaurants",Las Vegas,"{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",1,36.168802,-115.139880,Le Thai,Downtown,89101,1590,4.0,NV
0,9051 E Indian Bend Rd,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",00liP5s4IKsq97EH4Cc0Tw,"Food, Coffee & Tea",Scottsdale,"{'Monday': '5:0-21:0', 'Tuesday': '5:0-21:0', ...",1,33.538119,-111.886272,Starbucks,,85250,54,2.0,AZ
0,120 Meyran Ave,"{'Alcohol': 'none', 'Ambience': '{'romantic': ...",0bjFYstj8USMzEV4ZQldjA,"Restaurants, Food Trucks, Desserts, Food, Brea...",Pittsburgh,"{'Monday': '7:0-18:0', 'Tuesday': '7:0-18:0', ...",1,40.440708,-79.958625,Redhawk Coffee,Oakland,15213,107,5.0,PA
0,8909 E Indian Bend Rd,"{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",1VKjDpPJdVoiRc8I9f7U9g,"Sports Bars, Chicken Wings, Restaurants, Ameri...",Scottsdale,,1,33.538139,-111.889439,Hooters,,85250,98,3.0,AZ
0,"1116 S Dobson Rd, Ste 105","{'AcceptsInsurance': 'False', 'BikeParking': '...",3I23nQZ6k6U2G1NKpTpfhA,"Beauty & Spas, Massage",Mesa,"{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",1,33.394432,-111.875578,Comfort Foot,,85202,37,4.0,AZ
0,"Caesars Palace Las Vegas Hotel Casino, 3570 S ...","{'AgesAllowed': '21plus', 'Alcohol': 'full_bar...",3Mx4renubPRnjHUw1n2UkA,"Bars, Dance Clubs, Adult Entertainment, Nightlife",Las Vegas,"{'Tuesday': '22:0-4:0', 'Thursday': '22:0-4:0'...",1,36.117303,-115.174176,OMNIA Nightclub,The Strip,89109,1030,3.0,NV
0,7245 E Gold Dust,"{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",6D4L8YpkhAh_YwaLmhoMNg,"Restaurants, Italian",Scottsdale,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",0,33.577930,-111.925334,Romano's Macaroni Grill,,85258,122,2.5,AZ
0,"6257 Carolina Commons Dr, Ste 400","{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",6qDRqHWqf0EeHupSUEBfKg,"Restaurants, Chicken Wings, Burgers, Barbeque",Fort Mill,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",1,34.936240,-80.837925,521 BBQ & Grill,,29707,151,4.0,SC


In [25]:
M, C = kmedoids.kMedoids(distanceMatrix, 4)

In [101]:
businessDf['clusterLabel'] = 'none'
for label in C:
    for point_idx in C[label]:
        businessDf.iloc[point_idx, businessDf.columns.get_loc('clusterLabel')] = label
        #print('label {0}:　point_idx: {1}'.format(label, point_idx))

In [102]:
for name, group in grouped:
    display(group)

Unnamed: 0,name,categories,clusterLabel
0,521 BBQ & Grill,"Restaurants, Chicken Wings, Burgers, Barbeque",0
0,German Auto Service,"Automotive, Auto Repair, Car Dealers, Auto Par...",0
0,Filiberto's Mexican Food,"Restaurants, Mexican",0
0,WSS,"Fashion, Shopping, Shoe Stores",0
0,Costco Wholesale,"Wholesale Stores, Tires, Food, Automotive, Gro...",0
0,Harkins Theatres Arizona Mills 25 w/ IMAX,"Arts & Entertainment, Cinema",0
0,Destiny's Bride,"Shopping, Bridal",0
0,Camping World - Concord,"RV Rental, Auto Repair, RV Dealers, Hotels & T...",0
0,AT&T,"Television Service Providers, Mobile Phones, S...",0


Unnamed: 0,name,categories,clusterLabel
0,"Ellis Island Hotel, Casino & Brewery","Barbeque, Bars, Pubs, American (Traditional), ...",1
0,That Italian Place Cafe & Eatery,"Italian, Restaurants, Cafes",1


Unnamed: 0,name,categories,clusterLabel
0,Eastwind Sushi & Grill,"Nightlife, Bars, Sports Bars, Restaurants, Sus...",2
0,Eggstasy,"Breakfast & Brunch, Diners, Restaurants",2
0,Le Thai,"Beer Gardens, Nightlife, Thai, Restaurants",2
0,Starbucks,"Food, Coffee & Tea",2
0,Redhawk Coffee,"Restaurants, Food Trucks, Desserts, Food, Brea...",2
0,Hooters,"Sports Bars, Chicken Wings, Restaurants, Ameri...",2
0,Comfort Foot,"Beauty & Spas, Massage",2
0,OMNIA Nightclub,"Bars, Dance Clubs, Adult Entertainment, Nightlife",2
0,Romano's Macaroni Grill,"Restaurants, Italian",2
0,Jjanga Japanese Restaurant,"Nightlife, Japanese, Karaoke, Sushi Bars, Rest...",2


Unnamed: 0,name,categories,clusterLabel
0,Lowe's Home Improvement Warehouse,"Hardware Stores, Home Services, Home & Garden,...",3
0,Robinson Automotive,"Automotive, Auto Repair",3
0,Fry's,"Restaurants, Food, Bakeries, Grocery, Delis, B...",3
0,JCPenney,"Fashion, Baby Gear & Furniture, Department Sto...",3
0,Lo-Lo's Chicken & Waffles,"Restaurants, Waffles, Breakfast & Brunch, Sout...",3
0,National Car Rental,"Car Rental, Hotels & Travel",3
