## YELP data to S22 format
- Author: Nana
- Created: 2022.08.22
---
- Goal: preprocess the Yelp review data to fit the format provided by SemEval 2022.
- Source data: `/share/data/yelp/yelp_academic_dataset_review.json`
- Converted data: `/share/home/nana2929/yelp/`


In [36]:
from utils import structs
import json 
import numpy as np

In [37]:
# Open question: do we write everything to ONE file,
# or split the output into several (e.g. 3) files?

In [38]:
# Directory holding the raw Yelp dump, and the name of the review file in it.
YELPPATH = '/share/data/yelp'
RP = 'yelp_academic_dataset_review.json'
# Full path of the review file that gets parsed.
rppath = '/'.join((YELPPATH, RP))
# Directory that receives the converted files.
yelpdir = '/share/home/nana2929/yelp'

In [39]:
# Load every review into memory. The file is parsed one line at a time, each
# line being a standalone JSON object.
REVIEWS = []
# Decode explicitly as UTF-8 so the result does not depend on the machine's
# locale settings.
with open(rppath, 'r', encoding='utf-8') as f:
    for raw in f:
        if not raw.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        record = json.loads(raw)
        if not REVIEWS: print(record)  # peek at the first record's fields
        REVIEWS.append(record)

{'review_id': 'KU_O5udG6zpxOg-VcAEodg', 'user_id': 'mh_-eMZ6K5RLWhZyISBhwA', 'business_id': 'XQfwVwDr-v0ZS3_CbbE5Xw', 'stars': 3.0, 'useful': 0, 'funny': 0, 'cool': 0, 'text': "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", 'date': '2018-07-07 22:09:11'}


In [40]:
def transform(x):
    '''
    Map a numeric star rating to a polarity label.
    input: a number (the review's star rating)
    return: 'Neutral' when x equals 3, 'Negative' when x is below 3,
            'Positive' otherwise
    '''
    if x == 3:
        return 'Neutral'
    return 'Negative' if x < 3 else 'Positive'

def Yelp2JSON(reviews):
    '''
    Convert raw Yelp review records into SemEval22-format review dictionaries.

    input: an iterable of YELP review dictionaries; each one must provide the
           keys 'review_id', 'text' and 'stars'
    return: a list of structs.ReviewDict objects with 'review_id', 'text' and
            'overall_polarity' filled in. Any other field is left as
            structs.ReviewDict() created it (presumably its defaults --
            confirm in utils.structs).
    '''
    result = []
    for review in reviews:
        entry = structs.ReviewDict()
        entry['review_id'] = review['review_id']
        # Flatten the review onto one line. Lines are joined with a single
        # space (empty lines dropped, per-line edge whitespace trimmed) so
        # that words on either side of a line break are not fused together,
        # which plain newline deletion would do for "good.\nService".
        pieces = (part.strip() for part in review['text'].split('\n'))
        entry['text'] = ' '.join(part for part in pieces if part)
        # Stars arrive as floats such as 3.0; truncate to int before mapping.
        entry['overall_polarity'] = transform(int(review['stars']))
        result.append(entry)
    return result

### Because of the large size of YELP, split the files to avoid OOM/crashes

In [41]:
# Number of chunks the review list is divided into; each chunk is converted
# and written to its own output file.
NSPLIT = 6

In [42]:
# Partition the loaded reviews into NSPLIT consecutive chunks.
N_REVIEWS = np.array_split(ary=REVIEWS, indices_or_sections=NSPLIT)

In [43]:
import time
# Convert each chunk and write it to its own numbered JSON file (numbering
# starts at 1).
for i, split in enumerate(N_REVIEWS, start=1):
    converted_split = Yelp2JSON(split)
    jsonfilename = f'{yelpdir}/s22_formatted_yelp_reviews_{i}.json'
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 rather than left to the locale default
    # (which could raise UnicodeEncodeError or produce non-UTF-8 JSON).
    with open(jsonfilename, 'w', encoding='utf-8') as fout:
        json.dump(converted_split, fout, ensure_ascii = False, indent = 4)
    print(f'{len(split)} data is successfully converted to {jsonfilename}.')

1165047 data is successfully converted to /share/home/nana2929/yelp/s22_formatted_yelp_reviews_1.json.
1165047 data is successfully converted to /share/home/nana2929/yelp/s22_formatted_yelp_reviews_2.json.
1165047 data is successfully converted to /share/home/nana2929/yelp/s22_formatted_yelp_reviews_3.json.
1165047 data is successfully converted to /share/home/nana2929/yelp/s22_formatted_yelp_reviews_4.json.
1165046 data is successfully converted to /share/home/nana2929/yelp/s22_formatted_yelp_reviews_5.json.
1165046 data is successfully converted to /share/home/nana2929/yelp/s22_formatted_yelp_reviews_6.json.


In [44]:
# display file size

In [45]:
%%bash
# List the files with human-readable sizes. The relative path depends on the
# notebook's working directory; presumably it resolves to the directory the
# converted files were written to -- confirm before reusing elsewhere.
cd ../../yelp
ls -lh *

-rw-r--r-- 1 nana2929 nana2929 809M Aug 22 20:44 s22_formatted_yelp_reviews_1.json
-rw-r--r-- 1 nana2929 nana2929 815M Aug 22 20:45 s22_formatted_yelp_reviews_2.json
-rw-r--r-- 1 nana2929 nana2929 822M Aug 22 20:45 s22_formatted_yelp_reviews_3.json
-rw-r--r-- 1 nana2929 nana2929 808M Aug 22 20:46 s22_formatted_yelp_reviews_4.json
-rw-r--r-- 1 nana2929 nana2929 814M Aug 22 20:46 s22_formatted_yelp_reviews_5.json
-rw-r--r-- 1 nana2929 nana2929 817M Aug 22 20:47 s22_formatted_yelp_reviews_6.json


In [46]:
# Sanity check: reload the first converted file and show its first record.
first_split_path = f'{yelpdir}/s22_formatted_yelp_reviews_1.json'
with open(first_split_path, 'r') as fin:
    x = json.loads(fin.read())
x[0]

{'review_id': 'KU_O5udG6zpxOg-VcAEodg',
 'sent_id': '',
 'text': "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. The food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.",
 'overall_polarity': 'Neutral',
 'opinions': []}