In [1]:
import pandas as pd
import numpy as np

In [2]:
# Locations of the raw Yelp academic dataset dumps.
reviews_path = "yelp_data/yelp_academic_dataset_review.json"
business_path = "yelp_data/yelp_academic_dataset_business.json"

# Both files are parsed with lines=True (one JSON record per line).
# The business file is loaded in one go; the review file is opened with a chunksize,
# so it is handed back as a reader that yields 1,000,000-row DataFrames on demand.
business_df = pd.read_json(business_path, lines=True)
reviews_json = pd.read_json(reviews_path, chunksize=1000000, lines=True)

In [8]:
def filter_reviews(df, max_len=150):
    """
    Filters out very long reviews.

    Collects every entry of ``df.text`` whose length is strictly below
    ``max_len``, prints how many entries were kept, and returns them.

    Parameters
    ----------
    df : pandas.DataFrame
        Any object exposing a ``text`` attribute with ``.values`` works; the
        entries must support ``len()``.
    max_len : int, optional
        Exclusive upper bound on the length of a kept review. Defaults to 150,
        the limit this function originally hard-coded.

    Returns
    -------
    list
        The kept review texts, in their original order.
    """
    res = [text for text in df.text.values if len(text) < max_len]
    # Two-argument print keeps the exact output format ("Number of reviews:  N").
    print("Number of reviews: ", len(res))
    return res

In [3]:
"""
Gets the indexes of all the relevant businesses that are restaurants/bars/coffee shops
"""

# Terms looked for inside each business's `categories` value with the `in` operator.
# NOTE(review): if `categories` holds plain strings this is a substring match
# (e.g. "Bars" would also hit "Wine Bars") - confirm that is the intent.
key_words = ["Restaurants", "Food", "Coffee", "Tea", "Bars"]

# Positional row numbers (suitable for .iloc) of businesses matching any key word.
# Businesses whose categories value is None are skipped.
selected_business = [
    ind
    for ind, val in enumerate(business_df.categories.values)
    if val is not None and any(x in val for x in key_words)
]

In [4]:
# Rows of business_df at the selected positions, and the ids of those businesses
# as an array for membership tests against the reviews.
business_subset = business_df.iloc[selected_business]
selected_ids = business_subset["business_id"].values

In [5]:
# Gets the first 1,000,000 reviews: reviews_json is a chunked reader (created with
# chunksize=1000000), so each next() call returns the following chunk as a DataFrame.
# NOTE: next() advances the reader, so re-running this cell loads a later chunk.
rev_df = next(reviews_json)

In [6]:
"""
Converts the reviews dataframe to contain only the selected businesses
"""
# One selection does both steps: rows whose business_id is among the selected ids,
# restricted to the text and business_id columns.
reviews = rev_df.loc[
    rev_df["business_id"].isin(selected_ids),
    ["text", "business_id"],
]

In [9]:
# Keeps only the short review texts. NOTE: this rebinds `reviews` from a DataFrame to
# the plain list returned by filter_reviews, so this cell cannot be re-run on its own
# (a list has no `.text`); re-run the previous cell first.
reviews = filter_reviews(reviews)

Number of reviews:  86085


In [11]:
"""
Saves a txt file with plain text of relevant reviews for use in finetuning GPT2 model
"""
# Each element of `reviews` is formatted with '%s' and written followed by a newline.
# The encoding is pinned to UTF-8 instead of being left to savetxt's default, so
# review text with non-ASCII characters (accents, emoji) is written the same way on
# every platform.
np.savetxt('reviews.txt', reviews, fmt='%s', encoding='utf-8')

In [12]:
"""
Creates a dataset for training a classifier
"""
# Pull the following chunk from the reader (the next 1,000,000 reviews after the first batch).
real_reviews = next(reviews_json)

# Restrict the chunk to the selected businesses, then keep only the short review texts.
real_reviews = filter_reviews(
    real_reviews.loc[
        real_reviews["business_id"].isin(selected_ids),
        ["text", "business_id"],
    ]
)

# Single-column frame of review text, with every row labelled Real = 1.
real_reviews_df = pd.DataFrame(real_reviews, columns=["Review"]).assign(Real=1)

In [34]:
# Sanity check: display the column labels of the classifier dataset.
real_reviews_df.columns

Index(['Review', 'Real'], dtype='object')

In [35]:
# Write the labelled reviews to CSV; index=False leaves the row index out of the file.
real_reviews_df.to_csv("real_reviews.csv", index=False)