## Yelp inference (test-set) generation
- Author: Nana/Ching Wen Yang 
- Date Created: Aug 26, 2022


In [1]:
YELPPATH = '/share/data/yelp'
RP = 'yelp_academic_dataset_review.json'
BUSINESS = 'yelp_academic_dataset_business.json'
rppath = f'{YELPPATH}/{RP}'
busipath = f'{YELPPATH}/{BUSINESS}'

In [2]:
import pandas as pd
import json

In [3]:
Bid2Info = {}
with open (busipath, 'r') as f:
    for line in f:
        line = json.loads(line)
        busi_id = line['business_id']
        category = line['categories']
        
        name = line['name']
        star = line['stars']
        Bid2Info[busi_id] = {'name':name, 
                             'category':category, 
                             'star':star, 
                             'review_ids':[], 
                             'business_id':busi_id}

In [4]:
tags = set()
from collections import defaultdict
counter = defaultdict(int)
NoCat = 0 
for ts in Bid2Info.values():
    try:
        ts = ts['category'].split(',')
    except:
        NoCat += 1
        continue
    for t in ts:
        t = t.strip()
        tags.add(t)
        counter[t]+=1
print(f'No category: {NoCat}')
# print(f'#tags: {len(tags)}') 

No category: 103


In [30]:
path = '../data/yelp_tags.csv'
tags = pd.DataFrame({'tag': list(tags)})
tags.to_csv(path, index = False)

In [5]:
scounter = sorted(counter.items(), key = lambda x:-x[1])
# ('Eyewear & Opticians', 1016)
scounter.index(('Eyewear & Opticians', 1016)) # 124
# scounter[:124]

124

In [6]:
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../data')
from yelp_tags import YelpFoodTags

RID_Dict = {}
FoodReviewIDs = set()
isCoveredReview = 0 
YelpFoodTags = set(YelpFoodTags)
KnownRestaurants = set()
with open (rppath, 'r') as f:
    for id, line in enumerate(f):
        line = json.loads(line)
        store = Bid2Info[line['business_id']]
        # 'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services'
        flag = False
        if store['business_id'] in KnownRestaurants:
            isCoveredReview += 1 
            flag = True
        elif store['category']:
            for tag in store['category'].split(','):
                tag = tag.strip()
                if tag in YelpFoodTags:
                    isCoveredReview += 1
                    flag = True
                    KnownRestaurants.add(store['business_id'])
                break 
        if flag:
            FoodReviewIDs.add(line['review_id'])
            
        RID_Dict[line['review_id']] = line

In [7]:
MYSEED = 1024
import random 
random.seed(MYSEED)
print(f'Successfully seeding {MYSEED}')
len(FoodReviewIDs)

Successfully seeding 1024


3258904

### Generate random/general inference data 
- 100 筆，選擇任意餐廳的任意評論。
- please run the 3 cells contiguously. 

In [42]:
n = 100

In [43]:
chosenIDs = random.sample(FoodReviewIDs, n)
chosenReviews = [RID_Dict[x] for x in chosenIDs]

In [44]:
from collections import Counter
c = Counter([x['business_id'] for x in chosenReviews])
len(c) # # of unique restaurant in this inference data

99

In [45]:
import json 
# the path is commented to avoid revising the file by accident
# rand_test = '../data/yelp-food-test_rand100.json'
with open(rand_test, 'w') as f:
    json.dump(chosenReviews, f, ensure_ascii = False, indent = 4)

### Generate restaurant-specific data
- 100 筆，只選擇兩間最多評論的餐廳的任意評論各五十筆。
- 兩間最多評論的餐廳（over 7K reviews）:
    - (_ab50qdWOk0DdB6XOrBitw) Acme Oyster House
    - (ac1AeYqs8Z4_e2X5M3if2A) Oceana Grill 
- Please run the 3 cells contiguously.  

In [46]:
## add review number attribute
from collections import defaultdict
BID_Freq = defaultdict(set)
with open (rppath, 'r') as f:
    for id, review in enumerate(f): 
        review = json.loads(review)
        bid = review['business_id']
        rid = review['review_id']
        BID_Freq[bid].add(rid)

In [47]:
BFreqS = sorted(BID_Freq.items(), key = lambda pair:-len(pair[1]))

numRes = 2
chosenReviews = []

for s in BFreqS[:numRes]:
    print(s[0], Bid2Info[s[0]]['name'])
    chosenIDs = random.sample(s[1], int(n/numRes))
    chosenReviews.extend([RID_Dict[x] for x in chosenIDs])

_ab50qdWOk0DdB6XOrBitw Acme Oyster House
ac1AeYqs8Z4_e2X5M3if2A Oceana Grill


In [48]:
import json 
# the path  is commented to avoid revising the file by accident
# two_restaurant_test = '../data/yelp-food-test_2rest100.json'
with open(two_restaurant_test, 'w') as f:
    json.dump(chosenReviews, f, ensure_ascii = False, indent =4)

### Restaurant Covered Ratio
- Out of the 15K (unique business_id) stores(business) that yelp records, 

  about 7K (46%) are restaurants/food-related stores. 

In [48]:
restaurant = 0
for ts in Bid2Info.values():
    try:
        ts = ts['category'].split(',')
    except:
        NoCat += 1
        continue
    for t in ts:
        t = t.strip()
        if t in YelpFoodTags:
            restaurant += 1
            break 
restaurant/len(Bid2Info) 

0.4609633778085217

In [51]:
restaurant

69304

### Review Covered Ratio 
- Out of the 699K data, roughly 325K are commenting on restuarants/stores relevant to food. 
- The coverage is 46.6%. 

In [60]:
# review covered ratio: 
isCoveredReview/len(REVIEWS) 

0.4662050733303959

In [7]:
# !cp /share/home/yuxiang/yelp_data/mapping/tags.txt /share/home/nana2929/repo_en/data

In [62]:
isCoveredReview

3258904

In [39]:
len(REVIEWS) 

6990280