In [9]:
import pandas as pd
import requests
import numpy as np
import os

In [10]:
address = pd.read_csv('../data/name_address.csv')
concat_address = address.loc[:,'facility_address'] +', ' + address.loc[:,'facility_city'] +', '+ 'CA' +' '+address.loc[:,'facility_zip'].astype(str) 
address = pd.concat([address, concat_address.rename('concat_address')], axis = 1)

In [11]:
address.shape

(54990, 6)

In [15]:
# API AND HEADER CODE
api_key = 'c_dme_QHiReM7etGbm1DP5PUtvUbP_qtuYONDrk0p-L9-dsnTJkqzdZJ3hiFCGX_Trxd86wwEak33A2ErKId-43mAkum9lecAzEILSdc0bBtt0eFc8f-XcSs287XX3Yx'
headers = {'Authorization': 'Bearer %s' % api_key}


In [16]:
# Function to extract Yelp ID from Yelp Business Match and insert into Yelp Business Details API

def get_business_details(name, address_1, city, state, country):
    ''' 
    Obtains ratings, review count, latitude/longitude, price, and restaurant categories 
    from Yelp Business Details Endpoint using the Yelp Business ID obtained from 
    Yelp Business Matches Endpoint
    
    name: name of business - string input
    address_1: Street Address e.g. 2436 Columbine Circle - string input
    city: city in abbreviated form e.g. CA - string input
    country: country - string input
    
    Yelp Only Allows for 5000 requests per day. This code calls upon the YELP API twice.
    '''
    
    # Get Yelp ID from Business Matches Endpoint
    url_bm = 'https://api.yelp.com/v3/businesses/matches'
    
    # Parameters for Business Match in Dictionary Format
    parameters_bm = {\
                 'name': name,\
                 'address1' : address_1,\
                 'city': city,\
                 'state': state,\
                 'country': country
             }
    
    # Call on Yelp Business Match API
    r_bm = requests.get(url = url_bm, params = parameters_bm, headers = headers)
    json_data_bm = r_bm.json()
    
    # Extract Yelp ID. 
    
    if 'businesses' in json_data_bm.keys():
    
        if json_data_bm.get('businesses')==[]:
            yelp_id = None
        else:
            yelp_id = json_data_bm['businesses'][0]['id']
    else:
        yelp_id = None
        
        #try except 

    
    # Call on Yelp Business Details API to get necessary info
    url_bd = 'https://api.yelp.com/v3/businesses/{}'.format(yelp_id)
    
    
    # Yelp ID was not obtained, output would display None. If it has a yelp ID, fetch data from Yelp API
    if yelp_id == None:
        output = {
            'name': None,
            'address': None,
            'review_count': None,
            'rating' : None,
            'price': None,
            'categories': None,
            'latitude': None,
            'longitude': None,
            'yelp_id': None,
            'hours': None
    
}
    else:
        
        # Request info from Yelp Business Details Endpoint
        r_bd = requests.get(url = url_bd, headers = headers)
        json_data_bd = r_bd.json()
        
        answer = json_data_bd
        
        if 'name' in answer.keys():
            output =  {'name': answer.get('name')}
        else:
            output = {'name': None}
            
        if 'address' in answer.keys(): 
            output['address'] =  ', '.join([str(elem)for elem in answer.get('location').get('display_address')])
        else:
            output['address'] = None
        
        if 'categories' in answer.keys(): 
            categories = [sub['title'] for sub in answer.get('categories')]
            output['categories'] =  ', '.join([str(elem) for elem in [sub['title'] for sub in answer.get('categories')]])
        else:
            output['categories'] = None
        
        if 'review_count' in answer.keys():
            output['review_count'] = answer.get('review_count')
        else:
            output['review_count'] = None
        
        if 'rating' in answer.keys():
            output['rating'] = answer.get('rating')
        else:
            output['rating'] = None
        
        if 'price' in answer.keys():
            output['price'] = answer.get('price')
        else:
            output['price'] = None
            
        if 'coordinates' in answer.keys():
            output['latitude_yelp'] = answer.get('coordinates').get('latitude')
            output['longitude_yelp'] = answer.get('coordinates').get('longitude')
        else:
            output['latitude'] = None
            output['longitude'] = None
            
    output['yelp_id'] = yelp_id
    output['program_name'] = name
    output['facility_address'] = address_1
    output['facility_city'] = city
    
    return output

In [None]:
# Testing if code works
get_business_details(address.iloc[1001,1], address.iloc[1001,2], address.iloc[1001,3], 'CA', 'US')

In [17]:
# Create For Loop To Iterate Over 

# Create a list out of the columns of interest
# Since there is a daily limit of 5000 calls per day will. Need to slice the list in appropriate increments

res_name = (address['program_name']).tolist()[2700:5401]
res_address = (address['facility_address']).tolist()[2700:5401]
res_city = (address['facility_city'].tolist())[2700:5401]

# Output File Name
output_filename = '../data/yelp_data_2700to5401.csv'

results = []

for name, address_1, city in zip(res_name, res_address, res_city):
    bd_results = get_business_details(name, address_1, city, 'CA', 'US')
    results.append(bd_results)
    
    if len(results) % 100 == 0:
        pd.DataFrame(results).to_csv("{}_bak".format(output_filename))

# Convert list result to dataframe
pd.DataFrame(results).to_csv(output_filename, encoding = 'utf8')



In [168]:
###### Compile all data into one file
file_list = []
path = '../data/'
compiled_filename = '../data/compiled_yelp_data.csv.gz'
for file in os.listdir(path):
    if file.startswith('yelp_data'):
        path_to_file = str(path+file)
        df = pd.read_csv(path_to_file, encoding = 'latin-1')
        file_list.append(df)
    
compiled_yelp_data = pd.concat(file_list, ignore_index = True)
df = compiled_yelp_data

In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54990 entries, 0 to 54989
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        54990 non-null  int64  
 1   name              43829 non-null  object 
 2   address           2 non-null      object 
 3   categories        43776 non-null  object 
 4   review_count      43831 non-null  float64
 5   rating            43831 non-null  float64
 6   price             37293 non-null  object 
 7   latitude_yelp     43799 non-null  float64
 8   longitude_yelp    43799 non-null  float64
 9   yelp_id           44037 non-null  object 
 10  program_name      52489 non-null  object 
 11  facility_address  54989 non-null  object 
 12  facility_city     54989 non-null  object 
 13  latitude          2500 non-null   object 
 14  longitude         0 non-null      float64
 15  hours             0 non-null      float64
dtypes: float64(6), int64(1), object(9)
memor

In [170]:
# Clean Up Data
df = df.drop(columns = ['Unnamed: 0','address','latitude','longitude','hours','yelp_id'])


In [171]:
df.head(2)

Unnamed: 0,name,categories,review_count,rating,price,latitude_yelp,longitude_yelp,program_name,facility_address,facility_city
0,Gen Korean BBQ House,"Korean, Barbeque, Asian Fusion",5426.0,4.0,$$,33.805338,-118.328352,GEN KOREAN BBQ,24301 CRENSHAW BLVD,TORRANCE
1,Denny's,"Diners, Breakfast & Brunch, American (Traditio...",92.0,2.5,$,34.020786,-117.958669,DENNY'S,15553 E VALLEY BLVD,LA PUENTE


In [172]:
split_cat =  df.categories.str.split(', ',5,expand = True)
split_cat.head()

Unnamed: 0,0,1,2,3
0,Korean,Barbeque,Asian Fusion,
1,Diners,Breakfast & Brunch,American (Traditional),
2,Bubble Tea,Taiwanese,,
3,Mexican,Seafood,Cocktail Bars,
4,Bakeries,American (Traditional),Breakfast & Brunch,


In [173]:
df['cat1'] = split_cat[0]
df['cat2'] = split_cat[1]
df['cat3'] = split_cat[2]
df['cat4'] = split_cat[3]
df.head()

Unnamed: 0,name,categories,review_count,rating,price,latitude_yelp,longitude_yelp,program_name,facility_address,facility_city,cat1,cat2,cat3,cat4
0,Gen Korean BBQ House,"Korean, Barbeque, Asian Fusion",5426.0,4.0,$$,33.805338,-118.328352,GEN KOREAN BBQ,24301 CRENSHAW BLVD,TORRANCE,Korean,Barbeque,Asian Fusion,
1,Denny's,"Diners, Breakfast & Brunch, American (Traditio...",92.0,2.5,$,34.020786,-117.958669,DENNY'S,15553 E VALLEY BLVD,LA PUENTE,Diners,Breakfast & Brunch,American (Traditional),
2,Ten Ren's Tea Time,"Bubble Tea, Taiwanese",273.0,3.5,$,33.995954,-117.889903,TEA LAB,18912 E GALE AVE STE #A,ROWLAND HEIGHTS,Bubble Tea,Taiwanese,,
3,El Botanitas,"Mexican, Seafood, Cocktail Bars",40.0,2.5,$$,33.931955,-118.203798,EL BOTANITAS,3614 MLK JR BLVD,LYNWOOD,Mexican,Seafood,Cocktail Bars,
4,Polly's Pies Restaurant,"Bakeries, American (Traditional), Breakfast & ...",347.0,3.5,$$,33.87495,-118.0723,POLLY'S PIES,17198 S NORWALK BLVD,CERRITOS,Bakeries,American (Traditional),Breakfast & Brunch,


In [174]:
# Create Boolean For Categories
cat1_list = df['cat1'].unique().tolist()
cat2_list = df['cat2'].unique().tolist()
cat3_list = df['cat3'].unique().tolist()
cat4_list = df['cat4'].unique().tolist()
cat_list  = cat1_list + cat2_list + cat3_list + cat4_list
cat_list = list(dict.fromkeys(cat_list))

In [175]:
df.shape

(54990, 14)

In [176]:
# Creates Boolean columns out of category columns
def category2bool(dataframe, categorylist):
    for i in categorylist:
        catbool = (dataframe['cat1'] == i) | (dataframe['cat2'] == i) | (dataframe['cat3'] == i) | (dataframe['cat4'] == i)
        dataframe[str(i)] = catbool*1

In [177]:
# Run function 
category2bool(df,cat_list)

In [178]:
# Saves Yelp DataFrame into zip file

df.to_csv(compiled_filename,compression='gzip')

In [182]:
data = pd.read_csv('../data/compiled_yelp_data.csv.gz', compression = 'gzip', dtype = {'name': 'string','categories': 'string', 'review_count':np.float64, 'latitude_yelp':np.float64,'longitude_yelp':np.float64,'facility_address' : 'string', 'facility_city' : 'string','cat1' : 'string','cat2':'string','cat3':'string','cat4':'string','program_name':'string'})
data.head()

Unnamed: 0.1,Unnamed: 0,name,categories,review_count,rating,price,latitude_yelp,longitude_yelp,program_name,facility_address,...,Hostels,Videographers,Keys & Locksmiths,Musicians,Personal Shopping,Cosmetic Dentists,Tax Law,Archery,Game Meat,Used Car Dealers
0,0,Gen Korean BBQ House,"Korean, Barbeque, Asian Fusion",5426.0,4.0,$$,33.805338,-118.328352,GEN KOREAN BBQ,24301 CRENSHAW BLVD,...,0,0,0,0,0,0,0,0,0,0
1,1,Denny's,"Diners, Breakfast & Brunch, American (Traditio...",92.0,2.5,$,34.020786,-117.958669,DENNY'S,15553 E VALLEY BLVD,...,0,0,0,0,0,0,0,0,0,0
2,2,Ten Ren's Tea Time,"Bubble Tea, Taiwanese",273.0,3.5,$,33.995954,-117.889903,TEA LAB,18912 E GALE AVE STE #A,...,0,0,0,0,0,0,0,0,0,0
3,3,El Botanitas,"Mexican, Seafood, Cocktail Bars",40.0,2.5,$$,33.931955,-118.203798,EL BOTANITAS,3614 MLK JR BLVD,...,0,0,0,0,0,0,0,0,0,0
4,4,Polly's Pies Restaurant,"Bakeries, American (Traditional), Breakfast & ...",347.0,3.5,$$,33.87495,-118.0723,POLLY'S PIES,17198 S NORWALK BLVD,...,0,0,0,0,0,0,0,0,0,0


In [166]:
df[df['rating'] == '$']

Unnamed: 0,name,categories,review_count,rating,price,latitude_yelp,longitude_yelp,program_name,facility_address,facility_city,...,Hostels,Videographers,Keys & Locksmiths,Musicians,Personal Shopping,Cosmetic Dentists,Tax Law,Archery,Game Meat,Used Car Dealers
2701,Sicha Siam,152,3.5,$,Thai,34.124475,34.124475,,SICHA SIAM RESTAURANT,4403 EAGLE ROCK BLVD,...,0,0,0,0,0,0,0,0,0,0
2702,Superior Grocers,20,1.5,$,"Grocery, Bakeries, Meat Shops",33.968930,33.968930,,SUPERIOR GROCERS #105 - MEAT,7300 ATLANTIC BLVD,...,0,0,0,0,0,0,0,0,0,0
2705,Cafe of Paris,73,4.0,$,"Breakfast & Brunch, Sandwiches, Cafes",34.064121,34.064121,,CAFE OF PARIS,6399 WILSHIRE BLVD #101,...,0,0,0,0,0,0,0,0,0,0
2706,The Slice,137,3.5,$,"Pizza, Italian",34.011758,34.011758,,THE SLICE,1622 OCEAN PARK BLVD,...,0,0,0,0,0,0,0,0,0,0
2707,Kavita Grocery,35,3.5,$,"Grocery, Convenience Stores, International Gro...",34.021860,34.021860,,KAVITA GROCERIES,10201 VENICE BLVD,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5194,Village Liquor,6,2.5,$,"Beer, Wine & Spirits",33.872925,33.872925,,VILLAGE LIQUOR,12224 E ARTESIA BLVD,...,0,0,0,0,0,0,0,0,0,0
5196,San Andres Market and Restaurant,1,3.0,$,"Mexican, Grocery",33.991688,33.991688,,SAN ANDRES MARKET AND DELI,5520 S AVALON BLVD,...,0,0,0,0,0,0,0,0,0,0
5197,Fruta y Cultura,16,4.5,$,"Juice Bars & Smoothies, Coffee & Tea, Salad",34.261817,34.261817,,FRUTA DE CAFE,13512 VAN NUYS BLVD,...,0,0,0,0,0,0,0,0,0,0
5198,Boulevard Cafe,85,4.0,$,"Breakfast & Brunch, Cafes, American (Traditional)",34.580912,34.580912,,BOULEVARD CAFE,2211 E PALMDALE BLVD,...,0,0,0,0,0,0,0,0,0,0
