# About
This notebook creates a data frame containing the `business_id`s of the restaurants that Team 7 will analyze. These are the restaurants located in the five cities with the most restaurants in the Yelp business dataset, restricted to restaurants that already existed before the pandemic.

In [1]:
import json
import pandas as pd
import preprocess



# 1. Get the restaurants' business_ids

## Load Yelp business dataset

In [2]:
# Read the newline-delimited JSON file: each line is parsed as one JSON record.
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default (JSON is UTF-8).
with open("yelp_academic_dataset_business.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
business_df = pd.DataFrame(data)

## Subset to restaurants in top 5 cities

In [4]:
# Subset the business dataset to obtain only restaurants located in the U.S.
# NOTE(review): the CSV passed as second argument is presumably a category
# lookup used by the helper to identify restaurants -- confirm in preprocess.
restaurants_us = preprocess.get_restaurants_usa(business_df, 'categories_agg_211011.csv')

In [5]:
# (rows, columns) of the U.S. restaurant subset
restaurants_us.shape

(32318, 14)

In [6]:
# Get the 5 cities that have the most restaurants; the trailing expression
# displays the returned city names.
top5 = preprocess.get_top_cities(restaurants_us)
top5

['PORTLAND', 'ATLANTA', 'AUSTIN', 'ORLANDO', 'BOSTON']

In [11]:
# Build one boolean mask per top-5 city using a substring match on the city name.
# The keywords are upper case, while the `city` column is only upper-cased in the
# following cell, so match against an upper-cased copy here: the masks are then
# correct regardless of the column's current casing.
# Rows with a missing city are treated as non-matches (na = False).
city_upper = restaurants_us['city'].str.upper()
include_city1 = city_upper.str.contains('PORTLAND', na = False)
include_city2 = city_upper.str.contains('AUSTIN', na = False)
include_city3 = city_upper.str.contains('ATLANTA', na = False)
include_city4 = city_upper.str.contains('ORLANDO', na = False)
include_city5 = city_upper.str.contains('BOSTON', na = False)

In [12]:
# Normalise city names to upper case; this overwrites the `city` column of
# restaurants_us.
restaurants_us['city'] = restaurants_us['city'].str.upper()

In [13]:
# keep only the rows flagged by at least one of the five city masks
in_top5_city = (
    include_city1
    | include_city2
    | include_city3
    | include_city4
    | include_city5
)
restaurants_top5_raw = restaurants_us[in_top5_city]

In [14]:
# (rows, columns) after the city-name filter, before the state clean-up
restaurants_top5_raw.shape

(15478, 14)

In [15]:
# Remove restaurants that were picked up only because of city-name similarity:
# they are located in cities whose names resemble the top 5 cities but that
# are in different states.
restaurants_top5 = preprocess.clean_states(restaurants_top5_raw)

In [16]:
# (rows, columns) after the state clean-up
restaurants_top5.shape

(15476, 14)

In [17]:
# Get these restaurants' business_ids.
# The double brackets select a list of columns, so the result stays a
# single-column DataFrame that can be merged on 'business_id' later.
ids_restaurants_top5 = restaurants_top5[['business_id']]

# 2. Get reviews' business_ids

## Load Yelp review dataset

In [19]:
# Read the newline-delimited JSON file: each line is parsed as one JSON record.
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default (JSON is UTF-8).
with open("yelp_academic_dataset_review.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
review_df = pd.DataFrame(data)

## Subset review dataset

In [20]:
# Get the business_ids of reviews posted before the pandemic.
# NOTE(review): the cut-off date is decided inside the helper -- confirm in preprocess.
ids_reviews_pre_covid = preprocess.get_ids_reviews_pre_covid(review_df)

In [21]:
# (rows, columns) of the pre-pandemic review business_ids
ids_reviews_pre_covid.shape

(158582, 1)

# 3. Inner join restaurants' and reviews' business_ids

In [22]:
# Inner join on business_id: keep only the top-5-city restaurants whose id
# also appears among the pre-pandemic review ids, then display the result.
ids_restaurants_top5cities_pre_pandemic = pd.merge(
    ids_restaurants_top5,
    ids_reviews_pre_covid,
    how = 'inner',
    on = 'business_id',
)
ids_restaurants_top5cities_pre_pandemic

Unnamed: 0,business_id
0,tCbdrRPZA0oiIYSmHG3J0w
1,ufCxltuh56FF4-ZFZ6cVhg
2,jGennaZUr2MsJyRhijNBfA
3,iPD8BBvea6YldQZPHzVrSQ
4,jx91IMdGOmLOo8h_F9z39g
...,...
14965,m5eUPVD0Hu39Ff-Uqe-FLA
14966,87f7kR7nTz8WHnmtLM_S6w
14967,jYgqSazE0gUyI7qq086Dzw
14968,r5Uag1JqYjr2nbxQCVqm8A


In [23]:
# (rows, columns) of the joined business_ids
ids_restaurants_top5cities_pre_pandemic.shape

(14970, 1)

In [26]:
# Export the business_ids of these restaurants to CSV, without writing the
# DataFrame index as a column.
ids_restaurants_top5cities_pre_pandemic.to_csv(r'ids_restaurants_top5cities_pre_pandemic.csv',index = False)