# About
This notebook subsets the Yelp datasets that Team 7 will analyze. The subsets cover restaurants in the five cities that have the highest number of restaurants in the Yelp business dataset, limited to restaurants that existed prior to the pandemic.

In [1]:
import json
import pandas as pd
import preprocess



# Load business_ids of the selected restaurants

In [2]:
# Table of business_ids for the selected restaurants; used as the join key
# for every subset below.
ids = pd.read_csv(filepath_or_buffer='ids_restaurants_top5cities_pre_pandemic.csv')

In [3]:
ids.shape  # (rows, columns) of the selected-restaurant id table

(14970, 1)

# Subset Yelp business dataset

In [4]:
# Load the Yelp business dataset. The file is parsed as one JSON object per
# line. The context manager closes the file even if a line fails to parse,
# and the explicit encoding keeps decoding independent of the platform default.
with open("yelp_academic_dataset_business.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
business_df = pd.DataFrame(data)

In [6]:
# Keep only the businesses whose business_id appears in the selected ids
# (pandas merges default to an inner join).
team7_restaurant_raw = pd.merge(left=business_df, right=ids, on='business_id')

In [7]:
team7_restaurant_raw.shape  # sanity check: size of the business subset after the merge

(14970, 14)

In [9]:
# Fix inconsistencies in city names using the project-local preprocess module.
# NOTE(review): the exact normalization rules live in preprocess.clean_city_names
# and are not visible here — confirm whether it returns a copy or edits in place.
team7_restaurant = preprocess.clean_city_names(team7_restaurant_raw)

In [13]:
# Write the cleaned restaurant subset to a JSON file.
team7_restaurant.to_json(path_or_buf='yelp_team7_dataset_restaurant.json')

# Subset Yelp review dataset

In [10]:
# Load the Yelp review dataset. The file is parsed as one JSON object per
# line. The context manager closes the file even if a line fails to parse,
# and the explicit encoding keeps decoding independent of the platform default.
with open("yelp_academic_dataset_review.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
review_df = pd.DataFrame(data)

In [11]:
review_df.shape  # size of the full review dataset before subsetting

(8635403, 9)

In [12]:
# Keep only the reviews whose business_id appears in the selected ids
# (pandas merges default to an inner join).
team7_review = pd.merge(left=review_df, right=ids, on='business_id')

In [13]:
team7_review.shape  # sanity check: size of the review subset after the merge

(2574637, 9)

In [14]:
# Write the review subset to a JSON file.
team7_review.to_json(path_or_buf='yelp_team7_dataset_review.json')

# Subset Yelp covid dataset

In [4]:
# Load the Yelp covid features dataset. The file is parsed as one JSON object
# per line. The context manager closes the file even if a line fails to parse,
# and the explicit encoding keeps decoding independent of the platform default.
with open("yelp_academic_dataset_covid_features.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
covid_df = pd.DataFrame(data)

In [5]:
covid_df.shape  # size of the full covid features dataset before subsetting

(209795, 9)

In [6]:
# Keep only the covid-feature rows whose business_id appears in the selected
# ids (pandas merges default to an inner join).
team7_covid = pd.merge(left=covid_df, right=ids, on='business_id')

In [7]:
# NOTE(review): the recorded output is (0, 9), i.e. the merge matched no rows,
# and no covid subset is exported. Confirm whether the business_id values in
# the covid file are comparable to those in ids.
team7_covid.shape

(0, 9)

# Subset Yelp tip dataset

In [15]:
# Load the Yelp tip dataset. The file is parsed as one JSON object per line.
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps decoding independent of the platform default.
with open("yelp_academic_dataset_tip.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
tip_df = pd.DataFrame(data)

In [16]:
tip_df.shape  # size of the full tip dataset before subsetting

(1162119, 5)

In [17]:
# Keep only the tips whose business_id appears in the selected ids
# (pandas merges default to an inner join).
team7_tip = pd.merge(left=tip_df, right=ids, on='business_id')

In [18]:
team7_tip.shape  # sanity check: size of the tip subset after the merge

(381056, 5)

In [19]:
# Write the tip subset to a JSON file.
team7_tip.to_json(path_or_buf='yelp_team7_dataset_tip.json')

# Subset Yelp checkin dataset

In [21]:
# Load the Yelp checkin dataset. The file is parsed as one JSON object per
# line. The context manager closes the file even if a line fails to parse,
# and the explicit encoding keeps decoding independent of the platform default.
with open("yelp_academic_dataset_checkin.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
checkin_df = pd.DataFrame(data)

In [22]:
checkin_df.shape  # size of the full checkin dataset before subsetting

(138876, 2)

In [23]:
# Keep only the checkin rows whose business_id appears in the selected ids
# (pandas merges default to an inner join).
team7_checkin = pd.merge(left=checkin_df, right=ids, on='business_id')

In [24]:
team7_checkin.shape  # sanity check: size of the checkin subset after the merge

(14438, 2)

In [25]:
# Write the checkin subset to a JSON file.
team7_checkin.to_json(path_or_buf='yelp_team7_dataset_checkin.json')