# Starting Point - DRAFT
This the starting point for single Jupyter Notebook
 
 
## Dependency
##### 1 barnum Library
The barnum python library is used to generate a random list of zip codes.  It is install with the following:
  'pip install barnum'
  
##### 2 secrets.py
The collection of restaurant data uses the Yelp API that requires a key. This key is stored within the secrets.py that contains a single variable, "yelpKey".

In [66]:
#-- Import Libraries
import pandas as pd
import os
import math
import barnum
import requests


# Yelp API key in secrets.py; .gitignore prevents the secrets.py from being pushed to GitHub
from secrets import yelpKey


#-- Configuration Settings

#- Common Settings
# Name of the column that contains the zip code information
zipcodeColumnName = "Zipcode"

# Folder that is to contain output of different processing
outputDirectory = "AnalysisData"


#- Random Zipcodes
# Number of zip codes to gather data for; use 100 for analysis and 3 for testing
numRandomZipcodes = 150

# Name of the file that contains the DataFrame of the random zipcodes
randomZipcodesFileName = "randomZipcode.csv"


#- Collect Yelp Datasets
# Yelp search radius; used with API call
yelpSearchRadius = 3000

# TRUE- use the file of the random zipcodes for yelp dataset FALSE- use the DataFrame in memory
useFileForYelpSearch = False

# Name of the file that contains DataFrame of all data returned from the Yelp API calls
yelpDataFileName = "yelpData.csv"

# Name of the file that contains DataFrame of the zipcode and counts return from Yelp API calls
yelpDataSummaryFileName = "YelpSummaryData.csv"


#- Summarize Yelp Data
# TRUE- use the file of the DataFrame that contains the yelp businesses FASLE- use the DataFrame in memory
useFileForYelpSummarize = True

# Name of the file that contains the Dataframe that summarizes the Yelp data
summarizedYelpFileName = "summarizedYelpData.csv"


# 1 Create Random Zip Codes   
This step creates a list of the random zip codes within the study area of Southern California. The barnum library is used to generate the random list. The random zip codes are verified to ensure that they are within the study area and then stored to disk.

In [55]:
#-- Create Random Zipcodes

#- Create List
hasAllZipcodes = False
randomZipcodes = []

while hasAllZipcodes == False:
    theZipCode = barnum.create_city_state_zip()[0]
    numZipCode = int(theZipCode)
    
    if (numZipCode >= 90001) and (numZipCode <= 93005):
        randomZipcodes.append(theZipCode)
        
    if (numRandomZipcodes == len(randomZipcodes)):
        hasAllZipcodes = True 

        
#- Create DataFrame
randomZipcodes_df = pd.DataFrame(randomZipcodes)

randomZipcodes_df.columns = [zipcodeColumnName]


#- Save to Disk
randomZipcodesPath = os.path.join(".", outputDirectory, randomZipcodesFileName)

randomZipcodes_df.to_csv(randomZipcodesPath)


#- Preview Random Zipcodes
randomZipcodes_df.head()

Unnamed: 0,Zipcode
0,91334
1,90044
2,91614
3,92029
4,92530


# 2 Collect Yelp Data
For each of the random zipcodes the Yelp API is used to collect a single datasets using the Business Search API (https://www.yelp.com/developers/documentation/v3/business_search). The Yelp API is queried two times:

* Yelp filter of "restaurant" and "gluten_free".
* Yelp filter of only "restaurant"

There is validation to ensure that the business returned from the Yelp API is contained with the zip code provided. Additionally, it was found that some businesses did not have a "price" attribute within the JSON and there is validation to prevent this from stopping the processing.  
  
The yelp data is stored within a DataFrame, yelpData_df, and is also exported to a CSV file in: <outputDirectory>/<yelpDataFileName>
  
Another dataset that contains the zipcode with the count of the records found with the two searches is also created.  This dataset is stored within a DataFrame, yelpDataSummary_df, and exported to a CSV file in: <outputDirectory>/<yelpDataSummaryFileName>  
    
    

In [37]:
def getDataForZipcode(isGlutenFreeSearch, searchZipCode):
    ''' Searches the Yelp API to get the business that satisfy the filter
    
    Accepts : isGlutenFreeSearcj (bool) TRUE- search for gluten free term FALSE- search just for restaurant
                searchZipCode (str) zip code to search for records within
    
    Returns : (dictionary) contains information of business for the zip code
                ID: (str) Unique Yelp ID for the business
                Name: (str) Name of the business
                Zipcode: (str) Location of the business
                Latitude: (num) coordinate of the business location
                Longitude: (num) coordinate of the business location
                Price: (str) Price level of the business. Value is one of $, $$, $$$, $$$$ and NA
                Rating: (num) Rating for this business (value ranges from 1, 1.5, ... 4.5, 5)
                IsGlutenFree: (num) 1 - used with the gluten free filter 0 not used with gluten free search
                
                Or None when errors encountered with requests
    '''
    
    #- Prepare Search
    # Source Url
    baseYelpUrl = "https://api.yelp.com/v3/businesses/search"

    # API Key passed through header
    headers = {
            'Authorization': 'Bearer %s' % yelpKey,
    }
    
    # Search Term
    searchTerm = 'restaurant'
    
    if (isGlutenFreeSearch == True):
        searchTerm = 'gluten_free,restaurant'
    
    
    # Dictionary stores data
    yelpData = {
        'ID': [],
        'Name': [],
        'Zipcode': [],
        'Latitude': [],
        'Longitude': [],
        'Price' : [],
        'Rating' : [],
        'IsGlutenFree': [],
    }
    
    
    #- Search
    #  API limits 50 records being returned at once; must loop and request offset of results to get all records
    recordLimit = 50
    currentOffset = 0
    hasMoreData = True
    retryCount = 0
    totalCount = 0
    
    while hasMoreData == True:
        
        #- Prepare Parameters
        parameters = {
            'location': searchZipCode,
            'term': searchTerm,
            'limit': recordLimit,
            'offset': currentOffset,
            }
        
        #- Request
        print(f"  Requesting data. Offset: {currentOffset}  Known Total: {totalCount}")
        
        response = requests.request('GET', baseYelpUrl, headers=headers, params=parameters)
        
        
        #- Check Response
        if (response.status_code == requests.codes.ok):
            
            # Reset Retry
            retryCount = 0
            
            
            # Get Json from Response
            responseJson = response.json()
            
            
            # Search Businesses
            for business in responseJson['businesses']:
                
                # Determine Use Business
                useBusiness = checkBusinessForUsage(business, searchZipCode)
                
                if (useBusiness == True):
                    
                    # Populate Dictionary with Business Information
                    yelpData['ID'].append(business['id'])
                    yelpData['Name'].append(business['name'])
                    yelpData['Zipcode'].append(business['location']['zip_code'])
                    
                    yelpData['Latitude'].append(business['coordinates']['latitude'])
                    yelpData['Longitude'].append(business['coordinates']['longitude'])
                    
                    yelpData['Price'].append(getPriceForBusiness(business))
                    yelpData['Rating'].append(business['rating'])
                    
                    # Update search type
                    if (isGlutenFreeSearch == True):
                        yelpData['IsGlutenFree'].append(1)
                    else:
                        yelpData['IsGlutenFree'].append(0)
         
            #- Prepare for Next search
            # API only supports 50 records at a time; must query with offset
            currentOffset = (currentOffset + recordLimit)
            
            totalCount = responseJson['total']
            
            if (currentOffset > responseJson['total']):
                print(f"Collected all data. Current Offset: {currentOffset}  Total: {responseJson['total']}")
                hasMoreData = False
        
        else:
            #- Error with request
            retryCount += 1
            print(f"Response Error for data: {response.status_code}  Retry Count: {retryCount}")     
            
            #- Attempt to retry request
            if (retryCount == 4):
                print("Error getting data for zipcode")
                return None
    
                  
    #- Metadata on Data
    print(f"Total businesses found: {len(yelpData['ID'])}  Search Term: {searchTerm}")
               
          
    #- Return data from function
    return yelpData

In [8]:
def checkBusinessForUsage(businessInfo, searchZipcode):
    ''' Determines if the business can be used in the Analysis
    
    Accepts : businessInfo (dictionary) contains the metadata for individual business 
                searchZipCode (str) zip code searching for data within
    
    Return : bool TRUE- business meets critera, able to use FALSE- unable to use business
    '''
    
    #- Check Within Search Zipcode
    businessZipCode = businessInfo['location']['zip_code']
    
    if (businessZipCode != searchZipcode):
        return False
    
    
    return True

In [9]:
def getPriceForBusiness(businessInfo):
    ''' Gets the price for a business; not all businesses contain this property within the JSON;
    when not found just uses NA.
    
    Accepts : businessInfo (dictionary) metadata on an individual business
    
    Returns : (num) value from price tag
    '''
    try:
        
        return businessInfo['price']
    
    except:
        return 'NA'

In [56]:
#-- Collect Yelp Datasets

#- Get Random Zipcodes
if (useFileForYelpSearch == True):
    randomZipcodesPath = os.path.join(".", outputDirectory, randomZipcodesFileName)
    
    randomZipcodes_df = pd.read_csv(randomZipcodesPath)

else:
    if (randomZipcodes_df is None):
        raise Exception("Unable to collect Yelp dataset; missing reference to randomZipcodes_df")

        
#- Prepare Variables
zipcodeSummary = {
    zipcodeColumnName: [],
    'HasApiFailure': [],
    'Count_GlutenFree': [],
    'Count_Restaurant': []
}

yelpData_df = None
hasFirstYelpData = True
counter = 0


#- Collect Data
for index, row in randomZipcodes_df.iterrows():
    
    #- Get Zipcode
    searchZipcode = str(row[0])
    
    
    #- Message
    counter += 1
    print(f"-> Search -> {searchZipcode}  -> {counter} of {randomZipcodes_df.shape[0]}")
    
    
    #- Get Data from Yelp: Gluten Free Search
    yelpDataForZipcode = getDataForZipcode(True, searchZipcode)
    
    
    #- Get Data for Yelp: Restaurant
    yelpDataRestaurantForZipcode = getDataForZipcode(False, searchZipcode)
    
    
    #- Create DataFrames; check success getting data from endpoint
    if not (yelpDataForZipcode is None) and not (yelpDataRestaurantForZipcode is None):
        
        #- Gluten Free
        # Create DataFrame
        yelpDataForZipcode_df = pd.DataFrame(yelpDataForZipcode)
    
        # Determine number of records
        countYelpDataForZipcode = yelpDataForZipcode_df.shape[0]
    
        # Merge to Master DataFrame
        if (hasFirstYelpData == True):
            hasFirstYelpData = False
            yelpData_df = yelpDataForZipcode_df
    
        else:
            yelpData_df = pd.concat([yelpData_df, yelpDataForZipcode_df])
    
    
        #- All Restaurants
        # Create DataFrame
        yelpDataForZipcode_df = pd.DataFrame(yelpDataRestaurantForZipcode)
        
        # Determine number of records
        countYelpDataSummaryForZipcode = yelpDataForZipcode_df.shape[0]
        
        # Merge to Master DataFrame
        yelpData_df = pd.concat([yelpData_df, yelpDataForZipcode_df])
        
        
        #- Update Summary
        zipcodeSummary[zipcodeColumnName].append(searchZipcode)
        zipcodeSummary['HasApiFailure'].append(0)
        
        zipcodeSummary['Count_GlutenFree'].append(countYelpDataForZipcode)
        zipcodeSummary['Count_Restaurant'].append(countYelpDataSummaryForZipcode)  
        
    else:
        
        #- Error with at least one search filter; do not use zipcode
        zipcodeSummary[zipcodeColumnName].append(searchZipcode)
        zipcodeSummary['HasApiFailure'].append(1)
        
        zipcodeSummary['Count_GlutenFree'].append(0)
        zipcodeSummary['Count_Restaurant'].append(0) 
    

#- Message
print("<--<")
print("Completed getting data from Yelp API")
    
    
#- Export: Yelp Data
yelpDataFilePath = os.path.join('.', outputDirectory, yelpDataFileName)

yelpData_df.to_csv(yelpDataFilePath)


#- Export: Yelp Summary Data
yelpDataSummaryFilePath = os.path.join('.', outputDirectory, yelpDataSummaryFileName)

yelpDataSummary_df = pd.DataFrame(zipcodeSummary)

yelpDataSummary_df.to_csv(yelpDataSummaryFileName)


#- Completed Message
print("Completed export of data")

-> Search -> 91334  -> 1 of 150
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 27
Total businesses found: 0  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 38
Total businesses found: 0  Search Term: restaurant
-> Search -> 90044  -> 2 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 463
  Requesting data. Offset: 100  Known Total: 463
  Requesting data. Offset: 150  Known Total: 463
  Requesting data. Offset: 200  Known Total: 463
  Requesting data. Offset: 250  Known Total: 463
  Requesting data. Offset: 300  Known Total: 463
  Requesting data. Offset: 350  Known Total: 463
  Requesting data. Offset: 400  Known Total: 463
  Requesting data. Offset: 450  Known Total: 463
Collected all data. Current Offset: 500  Total: 463
Total businesses found: 0  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Kn

  Requesting data. Offset: 250  Known Total: 990
  Requesting data. Offset: 300  Known Total: 990
  Requesting data. Offset: 350  Known Total: 990
  Requesting data. Offset: 400  Known Total: 990
  Requesting data. Offset: 450  Known Total: 990
  Requesting data. Offset: 500  Known Total: 990
  Requesting data. Offset: 550  Known Total: 990
  Requesting data. Offset: 600  Known Total: 990
  Requesting data. Offset: 650  Known Total: 990
  Requesting data. Offset: 700  Known Total: 990
  Requesting data. Offset: 750  Known Total: 990
  Requesting data. Offset: 800  Known Total: 990
  Requesting data. Offset: 850  Known Total: 990
  Requesting data. Offset: 900  Known Total: 990
  Requesting data. Offset: 950  Known Total: 990
Collected all data. Current Offset: 1000  Total: 990
Total businesses found: 98  Search Term: restaurant
-> Search -> 92604  -> 7 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 321
  Requesting data. Offset: 100  Kno

Response Error for data: 400  Retry Count: 3
  Requesting data. Offset: 1000  Known Total: 6600
Response Error for data: 400  Retry Count: 4
Error getting data for zipcode
-> Search -> 92123  -> 11 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 1000
  Requesting data. Offset: 100  Known Total: 1000
  Requesting data. Offset: 150  Known Total: 1000
  Requesting data. Offset: 200  Known Total: 1000
  Requesting data. Offset: 250  Known Total: 1000
  Requesting data. Offset: 300  Known Total: 1000
  Requesting data. Offset: 350  Known Total: 1000
  Requesting data. Offset: 400  Known Total: 1000
  Requesting data. Offset: 450  Known Total: 1000
  Requesting data. Offset: 500  Known Total: 1000
  Requesting data. Offset: 550  Known Total: 1000
  Requesting data. Offset: 600  Known Total: 1000
  Requesting data. Offset: 650  Known Total: 1000
  Requesting data. Offset: 700  Known Total: 1000
  Requesting data. Offset: 750  Known Total: 1000
 

Response Error for data: 400  Retry Count: 3
  Requesting data. Offset: 1000  Known Total: 1300
Response Error for data: 400  Retry Count: 4
Error getting data for zipcode
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 3200
  Requesting data. Offset: 100  Known Total: 3200
  Requesting data. Offset: 150  Known Total: 3200
  Requesting data. Offset: 200  Known Total: 3200
  Requesting data. Offset: 250  Known Total: 3200
  Requesting data. Offset: 300  Known Total: 3200
  Requesting data. Offset: 350  Known Total: 3200
  Requesting data. Offset: 400  Known Total: 3200
  Requesting data. Offset: 450  Known Total: 3200
  Requesting data. Offset: 500  Known Total: 3200
  Requesting data. Offset: 550  Known Total: 3200
  Requesting data. Offset: 600  Known Total: 3200
  Requesting data. Offset: 650  Known Total: 3200
  Requesting data. Offset: 700  Known Total: 3200
  Requesting data. Offset: 750  Known Total: 3200
  Requesting data. Offset: 800  Kn

  Requesting data. Offset: 300  Known Total: 375
  Requesting data. Offset: 350  Known Total: 375
Collected all data. Current Offset: 400  Total: 375
Total businesses found: 76  Search Term: restaurant
-> Search -> 92414  -> 28 of 150
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 0
Total businesses found: 0  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 145
  Requesting data. Offset: 100  Known Total: 145
Collected all data. Current Offset: 150  Total: 145
Total businesses found: 0  Search Term: restaurant
-> Search -> 90274  -> 29 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 787
  Requesting data. Offset: 100  Known Total: 787
  Requesting data. Offset: 150  Known Total: 787
  Requesting data. Offset: 200  Known Total: 787
  Requesting data. Offset: 250  Known Total: 787
  Requesting data. Offset: 300  Known Tot

  Requesting data. Offset: 650  Known Total: 945
  Requesting data. Offset: 700  Known Total: 945
  Requesting data. Offset: 750  Known Total: 945
  Requesting data. Offset: 800  Known Total: 945
  Requesting data. Offset: 850  Known Total: 945
  Requesting data. Offset: 900  Known Total: 945
Collected all data. Current Offset: 950  Total: 945
Total businesses found: 44  Search Term: restaurant
-> Search -> 92276  -> 33 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 319
  Requesting data. Offset: 100  Known Total: 319
  Requesting data. Offset: 150  Known Total: 319
  Requesting data. Offset: 200  Known Total: 319
  Requesting data. Offset: 250  Known Total: 319
  Requesting data. Offset: 300  Known Total: 319
Collected all data. Current Offset: 350  Total: 319
Total businesses found: 1  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 1000
  Requesting data. Off

  Requesting data. Offset: 400  Known Total: 1300
  Requesting data. Offset: 450  Known Total: 1300
  Requesting data. Offset: 500  Known Total: 1300
  Requesting data. Offset: 550  Known Total: 1300
  Requesting data. Offset: 600  Known Total: 1300
  Requesting data. Offset: 650  Known Total: 1300
  Requesting data. Offset: 700  Known Total: 1300
  Requesting data. Offset: 750  Known Total: 1300
  Requesting data. Offset: 800  Known Total: 1300
  Requesting data. Offset: 850  Known Total: 1300
  Requesting data. Offset: 900  Known Total: 1300
  Requesting data. Offset: 950  Known Total: 1300
  Requesting data. Offset: 1000  Known Total: 1300
Response Error for data: 400  Retry Count: 1
  Requesting data. Offset: 1000  Known Total: 1300
Response Error for data: 400  Retry Count: 2
  Requesting data. Offset: 1000  Known Total: 1300
Response Error for data: 400  Retry Count: 3
  Requesting data. Offset: 1000  Known Total: 1300
Response Error for data: 400  Retry Count: 4
Error getting da

  Requesting data. Offset: 200  Known Total: 886
  Requesting data. Offset: 250  Known Total: 886
  Requesting data. Offset: 300  Known Total: 886
  Requesting data. Offset: 350  Known Total: 887
  Requesting data. Offset: 400  Known Total: 887
  Requesting data. Offset: 450  Known Total: 886
  Requesting data. Offset: 500  Known Total: 887
  Requesting data. Offset: 550  Known Total: 887
  Requesting data. Offset: 600  Known Total: 887
  Requesting data. Offset: 650  Known Total: 887
  Requesting data. Offset: 700  Known Total: 886
  Requesting data. Offset: 750  Known Total: 887
  Requesting data. Offset: 800  Known Total: 887
  Requesting data. Offset: 850  Known Total: 887
Collected all data. Current Offset: 900  Total: 887
Total businesses found: 18  Search Term: restaurant
-> Search -> 90304  -> 44 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 220
  Requesting data. Offset: 100  Known Total: 220
  Requesting data. Offset: 150  Kno

  Requesting data. Offset: 550  Known Total: 2200
  Requesting data. Offset: 600  Known Total: 2200
  Requesting data. Offset: 650  Known Total: 2200
  Requesting data. Offset: 700  Known Total: 2200
  Requesting data. Offset: 750  Known Total: 2200
  Requesting data. Offset: 800  Known Total: 2200
  Requesting data. Offset: 850  Known Total: 2200
  Requesting data. Offset: 900  Known Total: 2200
  Requesting data. Offset: 950  Known Total: 2200
  Requesting data. Offset: 1000  Known Total: 2200
Response Error for data: 400  Retry Count: 1
  Requesting data. Offset: 1000  Known Total: 2200
Response Error for data: 400  Retry Count: 2
  Requesting data. Offset: 1000  Known Total: 2200
Response Error for data: 400  Retry Count: 3
  Requesting data. Offset: 1000  Known Total: 2200
Response Error for data: 400  Retry Count: 4
Error getting data for zipcode
-> Search -> 90046  -> 50 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 2200
  Reques

  Requesting data. Offset: 250  Known Total: 262
Collected all data. Current Offset: 300  Total: 262
Total businesses found: 1  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 26
Total businesses found: 0  Search Term: restaurant
-> Search -> 92159  -> 55 of 150
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 12
Total businesses found: 0  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 15
Total businesses found: 0  Search Term: restaurant
-> Search -> 92549  -> 56 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 129
  Requesting data. Offset: 100  Known Total: 129
Collected all data. Current Offset: 150  Total: 129
Total businesses found: 17  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
  Requesting data

  Requesting data. Offset: 400  Known Total: 524
  Requesting data. Offset: 450  Known Total: 524
  Requesting data. Offset: 500  Known Total: 524
Collected all data. Current Offset: 550  Total: 524
Total businesses found: 141  Search Term: restaurant
-> Search -> 91802  -> 65 of 150
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 10
Total businesses found: 0  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 21
Total businesses found: 0  Search Term: restaurant
-> Search -> 92127  -> 66 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 1000
  Requesting data. Offset: 100  Known Total: 1000
  Requesting data. Offset: 150  Known Total: 1000
  Requesting data. Offset: 200  Known Total: 1000
  Requesting data. Offset: 250  Known Total: 1000
  Requesting data. Offset: 300  Known Total: 1000
  Requesting data. Offset: 350  Kn

  Requesting data. Offset: 150  Known Total: 2200
  Requesting data. Offset: 200  Known Total: 2200
  Requesting data. Offset: 250  Known Total: 2200
  Requesting data. Offset: 300  Known Total: 2200
  Requesting data. Offset: 350  Known Total: 2200
  Requesting data. Offset: 400  Known Total: 2200
  Requesting data. Offset: 450  Known Total: 2200
  Requesting data. Offset: 500  Known Total: 2200
  Requesting data. Offset: 550  Known Total: 2200
  Requesting data. Offset: 600  Known Total: 2200
  Requesting data. Offset: 650  Known Total: 2200
  Requesting data. Offset: 700  Known Total: 2200
  Requesting data. Offset: 750  Known Total: 2200
  Requesting data. Offset: 800  Known Total: 2200
  Requesting data. Offset: 850  Known Total: 2200
  Requesting data. Offset: 900  Known Total: 2200
  Requesting data. Offset: 950  Known Total: 2200
  Requesting data. Offset: 1000  Known Total: 2200
Response Error for data: 400  Retry Count: 1
  Requesting data. Offset: 1000  Known Total: 2200
Res

  Requesting data. Offset: 150  Known Total: 444
  Requesting data. Offset: 200  Known Total: 444
  Requesting data. Offset: 250  Known Total: 444
  Requesting data. Offset: 300  Known Total: 444
  Requesting data. Offset: 350  Known Total: 444
  Requesting data. Offset: 400  Known Total: 444
Collected all data. Current Offset: 450  Total: 444
Total businesses found: 0  Search Term: restaurant
-> Search -> 92176  -> 79 of 150
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 12
Total businesses found: 0  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 5
Total businesses found: 0  Search Term: restaurant
-> Search -> 91737  -> 80 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 194
  Requesting data. Offset: 100  Known Total: 194
  Requesting data. Offset: 150  Known Total: 194
Collected all data. Current Offset: 200  To

  Requesting data. Offset: 50  Known Total: 54
Collected all data. Current Offset: 100  Total: 54
Total businesses found: 0  Search Term: restaurant
-> Search -> 92835  -> 87 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 470
  Requesting data. Offset: 100  Known Total: 470
  Requesting data. Offset: 150  Known Total: 470
  Requesting data. Offset: 200  Known Total: 470
  Requesting data. Offset: 250  Known Total: 470
  Requesting data. Offset: 300  Known Total: 470
  Requesting data. Offset: 350  Known Total: 470
  Requesting data. Offset: 400  Known Total: 470
  Requesting data. Offset: 450  Known Total: 470
Collected all data. Current Offset: 500  Total: 470
Total businesses found: 8  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 1900
  Requesting data. Offset: 100  Known Total: 1900
  Requesting data. Offset: 150  Known Total: 1900
  Requesting data. Offse

  Requesting data. Offset: 400  Known Total: 1700
  Requesting data. Offset: 450  Known Total: 1700
  Requesting data. Offset: 500  Known Total: 1700
  Requesting data. Offset: 550  Known Total: 1700
Response Error for data: 500  Retry Count: 1
  Requesting data. Offset: 550  Known Total: 1700
  Requesting data. Offset: 600  Known Total: 1700
  Requesting data. Offset: 650  Known Total: 1700
  Requesting data. Offset: 700  Known Total: 1700
  Requesting data. Offset: 750  Known Total: 1700
  Requesting data. Offset: 800  Known Total: 1700
  Requesting data. Offset: 850  Known Total: 1700
  Requesting data. Offset: 900  Known Total: 1700
  Requesting data. Offset: 950  Known Total: 1700
  Requesting data. Offset: 1000  Known Total: 1700
Response Error for data: 400  Retry Count: 1
  Requesting data. Offset: 1000  Known Total: 1700
Response Error for data: 400  Retry Count: 2
  Requesting data. Offset: 1000  Known Total: 1700
Response Error for data: 400  Retry Count: 3
  Requesting data

  Requesting data. Offset: 200  Known Total: 1200
Response Error for data: 500  Retry Count: 1
  Requesting data. Offset: 200  Known Total: 1200
  Requesting data. Offset: 250  Known Total: 1200
  Requesting data. Offset: 300  Known Total: 1200
  Requesting data. Offset: 350  Known Total: 1200
  Requesting data. Offset: 400  Known Total: 1200
  Requesting data. Offset: 450  Known Total: 1200
  Requesting data. Offset: 500  Known Total: 1200
  Requesting data. Offset: 550  Known Total: 1200
  Requesting data. Offset: 600  Known Total: 1200
  Requesting data. Offset: 650  Known Total: 1200
  Requesting data. Offset: 700  Known Total: 1200
  Requesting data. Offset: 750  Known Total: 1200
  Requesting data. Offset: 800  Known Total: 1200
  Requesting data. Offset: 850  Known Total: 1200
  Requesting data. Offset: 900  Known Total: 1200
  Requesting data. Offset: 950  Known Total: 1200
  Requesting data. Offset: 1000  Known Total: 1200
Response Error for data: 400  Retry Count: 1
  Request

  Requesting data. Offset: 300  Known Total: 994
  Requesting data. Offset: 350  Known Total: 994
  Requesting data. Offset: 400  Known Total: 994
  Requesting data. Offset: 450  Known Total: 994
  Requesting data. Offset: 500  Known Total: 994
  Requesting data. Offset: 550  Known Total: 994
  Requesting data. Offset: 600  Known Total: 994
  Requesting data. Offset: 650  Known Total: 994
  Requesting data. Offset: 700  Known Total: 994
  Requesting data. Offset: 750  Known Total: 994
  Requesting data. Offset: 800  Known Total: 994
  Requesting data. Offset: 850  Known Total: 994
  Requesting data. Offset: 900  Known Total: 994
  Requesting data. Offset: 950  Known Total: 994
Collected all data. Current Offset: 1000  Total: 994
Total businesses found: 109  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 3000
  Requesting data. Offset: 100  Known Total: 3000
  Requesting data. Offset: 150  Known Total: 3000
  

  Requesting data. Offset: 450  Known Total: 734
  Requesting data. Offset: 500  Known Total: 734
  Requesting data. Offset: 550  Known Total: 734
  Requesting data. Offset: 600  Known Total: 734
  Requesting data. Offset: 650  Known Total: 734
  Requesting data. Offset: 700  Known Total: 734
Collected all data. Current Offset: 750  Total: 734
Total businesses found: 133  Search Term: restaurant
-> Search -> 90267  -> 113 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 90
Collected all data. Current Offset: 100  Total: 90
Total businesses found: 0  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 86
Collected all data. Current Offset: 100  Total: 86
Total businesses found: 0  Search Term: restaurant
-> Search -> 90070  -> 114 of 150
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 6
Total businesses found: 0  Search Term:

Response Error for data: 400  Retry Count: 2
  Requesting data. Offset: 1000  Known Total: 2700
Response Error for data: 400  Retry Count: 3
  Requesting data. Offset: 1000  Known Total: 2700
Response Error for data: 400  Retry Count: 4
Error getting data for zipcode
-> Search -> 91616  -> 119 of 150
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 11
Total businesses found: 0  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 14
Total businesses found: 0  Search Term: restaurant
-> Search -> 92199  -> 120 of 150
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 6
Total businesses found: 0  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
Collected all data. Current Offset: 50  Total: 9
Total businesses found: 0  Search Term: restaurant
-> Search -> 90807  -> 121 of 150
  Requesting data. Offse

  Requesting data. Offset: 600  Known Total: 693
  Requesting data. Offset: 650  Known Total: 693
Collected all data. Current Offset: 700  Total: 693
Total businesses found: 13  Search Term: restaurant
-> Search -> 91767  -> 127 of 150
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 295
  Requesting data. Offset: 100  Known Total: 295
  Requesting data. Offset: 150  Known Total: 295
  Requesting data. Offset: 200  Known Total: 295
  Requesting data. Offset: 250  Known Total: 295
Collected all data. Current Offset: 300  Total: 295
Total businesses found: 8  Search Term: gluten_free,restaurant
  Requesting data. Offset: 0  Known Total: 0
  Requesting data. Offset: 50  Known Total: 1300
  Requesting data. Offset: 100  Known Total: 1300
  Requesting data. Offset: 150  Known Total: 1300
  Requesting data. Offset: 200  Known Total: 1300
  Requesting data. Offset: 250  Known Total: 1300
  Requesting data. Offset: 300  Known Total: 1300
  Requesting dat

KeyboardInterrupt: 

In [86]:
#-- Preview Yelp Data
print(yelpData_df.shape)

yelpData_df.head()

(1892, 10)


Unnamed: 0.1,Unnamed: 0,ID,Name,Zipcode,Latitude,Longitude,Price,Rating,IsGlutenFree,PriceNum
0,0,9_9ky5mDi3Wtg2tUKsE2Sw,Five Guys,91329,34.223368,-118.501753,,4.0,1.0,
1,0,9_9ky5mDi3Wtg2tUKsE2Sw,Five Guys,91329,34.223368,-118.501753,,4.0,0.0,
2,0,uBhHXv1jFeWkWtnCDClTsw,Citrus Kitchen,91737,34.139053,-117.57748,$$,4.5,1.0,2.0
3,1,LFFCHro2VA4Q68C2ru2XAw,MOD Pizza,91737,34.137142,-117.575335,$,4.5,1.0,1.0
4,2,Ro92uIofBPna3G2PK5e-TQ,Trader Joe's,91737,34.13771,-117.57444,$$,4.5,1.0,2.0


In [87]:
#-- Preview Yelp Summary Data
print(yelpDataSummary_df.shape)

yelpDataSummary_df.head()

(110, 5)


Unnamed: 0.1,Unnamed: 0,Zipcode,HasApiFailure,Count_GlutenFree,Count_Restaurant
0,0,90307,0,0,0
1,1,92685,0,0,0
2,2,91329,0,1,1
3,3,90250,1,0,0
4,4,91737,0,4,19


# 3 Summarize Yelp Data
For each zipcode found within the sample dataset, summarize the number of business found, the menu and standard deviation for the price and rating.  This summarization is done for both those business that satisfied the "Gluten Free/Restaurant" filter and the "Restaurant" filter.  
  
A new DataFrame is created, summarizedYelpData_df, and it is also saved to disk. 

In [79]:
def summarizeYelpDataForZipcode(subset_df, searchPrefix, summarizeResult):
    ''' Summarizes the Yelp data based on the DataFrame provided
    
    Accepts : subset_df (DataFrame) records for the zipcode and filter
                searchPrefix (str) prefix used with column names; "GF_" or "ALL_"
                summarizedResults (dictionary) contains summarized data
                
    Returns : summarizedResults (dictionary) append information 
                'Zipcode' (str) name of the zipcode
                'GF_Total' (num) total number of businesses
                'GF_Price_1' (num) total with price of '$'
                'GF_Price_2' (num) total with price of '$$'
                'GF_Price_3' (num) total with price of '$$$'
                'GF_Price_4' (num) total with price of '$$$$'
                'GF_Price_0' (num) total with price of 'NA'
                'GF_Rating_10' (num) total with range of 1.0
                'GF_Rating_15' (num) total with range of 1.5
                'GF_Rating_20' (num) total with range of 2.0
                'GF_Rating_25' (num) total with range of 2.5
                'GF_Rating_30' (num) total with range of 3.0
                'GF_Rating_35' (num) total with range of 3.5
                'GF_Rating_40' (num) total with range of 4.0
                'GF_Rating_45' (num) total with range of 4.5
                'GF_Rating_50' (num) total with range of 5.0
                'GF_Price_Mean' (num) average for price
                'GF_Price_Std' (num) standard deviation for price
                'GF_Rating_Mean' (num) average for rating
                'GF_Rating_Std' (num) standard deviation for rating
                'ALL_Total' (num) total number of businesses
                'ALL_Price_1' (num) total with price of '$'
                'ALL_Price_2' (num) total with price of '$$'
                'ALL_Price_3' (num) total with price of '$$$'
                'ALL_Price_4' (num) total with price of '$$$$'
                'ALL_Price_0' (num) total with price of 'NA'
                'ALL_Rating_10' (num) total with range of 1.0
                'ALL_Rating_15' (num) total with range of 1.5
                'ALL_Rating_20' (num) total with range of 2.0
                'ALL_Rating_25' (num) total with range of 2.5
                'ALL_Rating_30' (num) total with range of 3.0
                'ALL_Rating_35' (num) total with range of 3.5
                'ALL_Rating_40' (num) total with range of 4.0
                'ALL_Rating_45' (num) total with range of 4.5
                'ALL_Rating_50' (num) total with range of 5.0
                'ALL_Price_Mean' (num) average for price
                'ALL_Price_Std' (num) standard deviation for price
                'ALL_Rating_Mean' (num) average for rating
                'ALL_Rating_Std' (num) standard deviation for rating
    '''
    
    #- Total
    summarizeResult[f'{searchPrefix}Total'].append(subset_df.shape[0])
    

    #- Price
    summarizeResult[f'{searchPrefix}Price_1'].append(subset_df[subset_df['Price'] == '$'].shape[0])
    summarizeResult[f'{searchPrefix}Price_2'].append(subset_df[subset_df['Price'] == '$$'].shape[0])
    summarizeResult[f'{searchPrefix}Price_3'].append(subset_df[subset_df['Price'] == '$$$'].shape[0])
    summarizeResult[f'{searchPrefix}Price_4'].append(subset_df[subset_df['Price'] == '$$$$'].shape[0])
    summarizeResult[f'{searchPrefix}Price_0'].append(subset_df[subset_df['Price'] == 'NA'].shape[0])


    #- Rating
    summarizeResult[f'{searchPrefix}Rating_10'].append(subset_df[subset_df['Rating'] == 1.0].shape[0])
    summarizeResult[f'{searchPrefix}Rating_15'].append(subset_df[subset_df['Rating'] == 1.5].shape[0])
    summarizeResult[f'{searchPrefix}Rating_20'].append(subset_df[subset_df['Rating'] == 2.0].shape[0])
    summarizeResult[f'{searchPrefix}Rating_25'].append(subset_df[subset_df['Rating'] == 2.5].shape[0])
    summarizeResult[f'{searchPrefix}Rating_30'].append(subset_df[subset_df['Rating'] == 3.0].shape[0])
    summarizeResult[f'{searchPrefix}Rating_35'].append(subset_df[subset_df['Rating'] == 3.5].shape[0])
    summarizeResult[f'{searchPrefix}Rating_40'].append(subset_df[subset_df['Rating'] == 4.0].shape[0])
    summarizeResult[f'{searchPrefix}Rating_45'].append(subset_df[subset_df['Rating'] == 4.5].shape[0])
    summarizeResult[f'{searchPrefix}Rating_50'].append(subset_df[subset_df['Rating'] == 5.0].shape[0])


    #- Averages
    summarizeResult[f'{searchPrefix}Price_Mean'].append(subset_df['PriceNum'].mean())
    summarizeResult[f'{searchPrefix}Rating_Mean'].append(subset_df['Rating'].mean())


    #- Standard Deviation
    summarizeResult[f'{searchPrefix}Price_Std'].append(subset_df['PriceNum'].std())
    summarizeResult[f'{searchPrefix}Rating_Std'].append(subset_df['Rating'].std())
    
    
    
    return summarizeResult
    

In [63]:
def calculatePrice(row):
    ''' Converts the Yelp text of dollar signs into numeric value
    
    Accepts : row individual row from DataFrame; has "Price" column 
    
    Returns : (num) numeric value that is converted from text value
    '''
    
    value = 0.0
    
    if (row['Price'] == '$'):
        value = 1
        
    elif (row['Price'] == '$$'):
        value = 2
        
    elif (row['Price'] == '$$$'):
        value = 3
        
    elif (row['Price'] == '$$$$'):
        value = 4
        
    else:
        value = math.nan
        
    
    return value

In [92]:
#-- Summarize Data Based on Zipcode

#- Get Yelp Data
if (useFileForYelpSummarize == True):
    
    print("Using files on disk to summarize")
    
    # Yelp Business DataFrame
    yelpDataFilePath = os.path.join('.', outputDirectory, yelpDataFileName)
    
    yelpData_df = pd.read_csv(yelpDataFilePath)
    
    
    # Yelp Zipcode Summary DataFrame
    yelpDataSummaryFilePath = os.path.join('.', outputDirectory, yelpDataSummaryFileName)
    
    yelpDataSummary_df = pd.read_csv(yelpDataSummaryFilePath)
    
else:
    if (yelpData_df is None) or (yelpDataSummary_df is None):
        raise Exception("Unable to collect Yelp dataset; missing reference to yelpDataSummary_df or yelpData_df")

               
#- Prepare DataFrame: Numeric Price
# Converts text of different number of $ to numeric value to allow calculations
yelpData_df['PriceNum'] = yelpData_df.apply(lambda row: calculatePrice(row), axis=1)


#- Create Results Container
# Container is to be converted into DataFrame
summarizedResults = {
    zipcodeColumnName: [],
    'GF_Total' : [],
    'GF_Price_1': [],
    'GF_Price_2': [],
    'GF_Price_3': [],
    'GF_Price_4': [],
    'GF_Price_0': [],
    'GF_Rating_10': [],
    'GF_Rating_15': [],
    'GF_Rating_20': [],
    'GF_Rating_25': [],
    'GF_Rating_30': [],
    'GF_Rating_35': [],
    'GF_Rating_40': [],
    'GF_Rating_45': [],
    'GF_Rating_50': [],
    'GF_Price_Mean': [],
    'GF_Rating_Mean': [],
    'GF_Price_Std': [],
    'GF_Rating_Std': [],
    'ALL_Total' : [],
    'ALL_Price_1': [],
    'ALL_Price_2': [],
    'ALL_Price_3': [],
    'ALL_Price_4': [],
    'ALL_Price_0': [],
    'ALL_Rating_10': [],
    'ALL_Rating_15': [],
    'ALL_Rating_20': [],
    'ALL_Rating_25': [],
    'ALL_Rating_30': [],
    'ALL_Rating_35': [],
    'ALL_Rating_40': [],
    'ALL_Rating_45': [],
    'ALL_Rating_50': [],
    'ALL_Price_Mean': [],
    'ALL_Rating_Mean': [],
    'ALL_Price_Std': [],
    'ALL_Rating_Std': [],
    }


#- Group by Zipcode
zipcodeYelpData_GroupBy = yelpData_df.groupby(zipcodeColumnName)


#- Summarize for each Zipcode
for groupName, groupedYelpData_df in zipcodeYelpData_GroupBy:
    
    # Zipcode
    summarizedResults[zipcodeColumnName].append(groupName)
    
    
    # Filter: Gluten Free
    subset_df = groupedYelpData_df.loc[(groupedYelpData_df['IsGlutenFree'] == 1)]
    
    summarizedResults = summarizeYelpDataForZipcode(subset_df, "GF_", summarizedResults)
    
    
    # Filter: Restaurants
    subset_df = groupedYelpData_df.loc[(groupedYelpData_df['IsGlutenFree'] == 0)]
    
    summarizedResults = summarizeYelpDataForZipcode(subset_df, "ALL_", summarizedResults)
    

    
#- Create DataFrame
summarizedYelpData_df = pd.DataFrame(summarizedResults)


#- Save to Disk
summarizedYelpPath = os.path.join(".", outputDirectory, summarizedYelpFileName)

summarizedYelpData_df.to_csv(summarizedYelpPath)


print(f"Completed summarizing the data  {summarizedYelpPath}")

Using files on disk to summarize
Completed summarizing the data  ./AnalysisData/summarizedYelpData.csv


In [91]:
#- Preview Summarized Data
print(summarizedYelpData_df.shape)

pd.set_option('display.max_columns', 500)
summarizedYelpData_df.head()

(35, 39)


Unnamed: 0,Zipcode,GF_Total,GF_Price_1,GF_Price_2,GF_Price_3,GF_Price_4,GF_Price_0,GF_Rating_10,GF_Rating_15,GF_Rating_20,GF_Rating_25,GF_Rating_30,GF_Rating_35,GF_Rating_40,GF_Rating_45,GF_Rating_50,GF_Price_Mean,GF_Rating_Mean,GF_Price_Std,GF_Rating_Std,ALL_Total,ALL_Price_1,ALL_Price_2,ALL_Price_3,ALL_Price_4,ALL_Price_0,ALL_Rating_10,ALL_Rating_15,ALL_Rating_20,ALL_Rating_25,ALL_Rating_30,ALL_Rating_35,ALL_Rating_40,ALL_Rating_45,ALL_Rating_50,ALL_Price_Mean,ALL_Rating_Mean,ALL_Price_Std,ALL_Rating_Std
0,90009,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,4.5,,
1,90038,43,12,22,5,4,0,0,0,0,1,0,7,22,13,0,2.023256,4.034884,0.886093,0.413972,116,51,43,7,6,0,2,0,4,11,7,21,45,23,3,1.700935,3.711207,0.82662,0.78061
2,90052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,2,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1.0,1.75,,1.06066
3,90239,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,5.0,,
4,90260,8,4,4,0,0,0,0,1,0,0,0,1,1,4,1,1.5,4.0,0.534522,1.101946,68,48,18,0,0,0,0,1,8,5,10,13,14,14,3,1.272727,3.522059,0.448775,0.895402
