In [1]:
import gmplot
import pandas as pd
from yelpapi import YelpAPI
from sklearn.preprocessing import MinMaxScaler
%run API_STUFF.txt

Loaded API KEY AND SECRET FROM FILE


In [9]:
#Scrape data from yelp from the zip codes provided and the search term.
def search_yelp(state, term, zips):
    #yelp API authorization and shared secret
    yelp_api = YelpAPI(API_KEY, API_SECRET)
    #run test search to get the center Lat and Long we need to center the map on and 
    #find total results for this zip
    try:
        search_results = yelp_api.search_query(term=term, location = state, limit = 1, radius_filter= 4000)
    except:
        search_results = yelp_api.search_query(term=term, location = state, limit = 1, radius_filter= 4000)
    latitude = []
    longitude = []
    price = []
    raiting = []
    review_count = []
    name = []
    zipcode = []
    ZIP = []
    closed = 0
    nozip = 0
    #search each zip code in the array of zips from this state
    for i, zipcode in enumerate(zips):
        #print statement to follow progress of search
        print('Processing zipcode {} of {}.  Zip : {}'.format(i,len(zips), zipcode), end = '\r')
        #sometimes yelp API throws errors, so try and if it fails try again.
        try:
            search_results = yelp_api.search_query(term=term, location = zipcode, limit = 1, radius_filter= 4000)
        except:
            search_results = yelp_api.search_query(term=term, location = zipcode, limit = 1, radius_filter= 4000)
        #save total number of restuarants in search so we know how many pages to scrape
        total_restaurants = search_results['total']
        #Yelp only lets us scrap 1000 results so set the page counter accordingly, 
        #1000 OR total restaurants /50 results a page
        if(total_restaurants<1000):
            tosearch = total_restaurants//50
        else:
            tosearch = 20
        #run the search on every yelp results query we are allowed to
        for search in range(tosearch):

            #get the results for the search 50 at a time
            try:
                newsearch = yelp_api.search_query(term=term, location = zipcode, limit = 50, offset = 50*search, radius_filter= 4000)
            except:
                newsearch = yelp_api.search_query(term=term, location = zipcode, limit = 50, offset = 50*search, radius_filter= 4000)
            for entry in range(50):
                #try to run the search, if yelp says no page, youve hit the end of the results so dont do anything
                try:
                    #if the business is open (or not closed) save the relavant information
                    if(newsearch['businesses'][entry]['is_closed'] == False):
                        latitude.append(newsearch['businesses'][entry]['coordinates']['latitude'])
                        longitude.append(newsearch['businesses'][entry]['coordinates']['longitude'])
                        name.append(newsearch['businesses'][entry]['id'])
                        #some restaurants have no price rating and it was throwing an error, if no rating rate a 0
                        try:
                            price.append(newsearch['businesses'][entry]['price'])
                        except:
                            price.append('')
                        raiting.append(newsearch['businesses'][entry]['rating'])
                        review_count.append(newsearch['businesses'][entry]['review_count'])
                        #some restaurants have no zip code saved, if this is the case save no zip, eliminate entry later
                        try:
                            ZIP.append(newsearch['businesses'][entry]['location']['zip_code'])
                        except:
                            ZIP.append('')
                    #if entry is closed iterate closed counter just so we can know about them
                    else:
                        closed = closed + 1
                except:
                    None
    #save all the data parsed from scraping into a dataframe
    data = pd.DataFrame()
    data['name'] = name
    data['latitude'] = latitude
    data['longitude'] = longitude
    data['price'] = [len(x) for x in price]
    data['raiting'] = raiting
    data['review_count'] = review_count
    data['Zipcode'] = ZIP
    #print some basic info about the scrape we ran
    print('{} businesses were closed and not included in the data'.format(closed))
    print('{} businesses had no zip and were not included in the data'.format(nozip))
    #return the dataframe as well as the central location that the map should be centered on
    return data, search_results['region']['center']

In [3]:
def load_pop_data(location):
    #https://blog.splitwise.com/2013/09/18/the-2010-us-census-population-by-zip-code-totally-free/
    #load data referencing population to zip code
    population_by_zip = pd.read_csv('Pop_by_zip.csv', names= ['Zipcode', 'Pop'], skiprows=1)
    #make the zip code a numeric value
    population_by_zip.Zipcode = pd.to_numeric(population_by_zip.Zipcode)
    #http://federalgovernmentzipcodes.us/
    #load the data referencing the latitude and longitude of each zip code
    zip_code_lat_long = pd.read_csv('zipcode-database.csv')
    #keep the columns that we want to use
    zip_code_lat_long = zip_code_lat_long[['Zipcode', 'State', 'Lat', 'Long', 'City']]
    #keep the entries from the state we are investigating
    zip_code_lat_long = zip_code_lat_long[(zip_code_lat_long.State == location)]
    #make the zip code a numeric value
    zip_code_lat_long.Zipcode = pd.to_numeric(zip_code_lat_long.Zipcode)
    #merge the data on zip code so we know the population by latitude and longitude
    pop_by_lat_long = zip_code_lat_long.merge(population_by_zip, on='Zipcode', how='inner')
    #fill Nan values with 1 for zip codes that have no people stored
    pop_by_lat_long = pop_by_lat_long.fillna(1)
    #placeholder for list of zips we are working with
    zips = []
    #generate an array of the zipcodes important to this state
    for zipcode in pop_by_lat_long.Zipcode:
        #exclude the zip code that Yelp isn't recognizing from MI
        if len(str(zipcode)) == 5 and (zipcode != 48921):
            zips.append(zipcode)
    #return the dataframe and a list of the zip codes for this state
    return pop_by_lat_long, zips

In [4]:
#generates a list of lat and longs based on the density of businesses of the specified type per capita, the more 
#businesses the more times each lat long pair will be returned
def reformat_pop_entries(df):
    #create a new datafame with the appropriate headers
    new_df = pd.DataFrame(columns = ['Lat', 'Long'])
    #make sure the index count is sequential, used for tracking progress
    df = df.reset_index(drop=True)
    #placeholder array for heat value: a value that is higher when the density of businesses per capita is higher
    heatvalue = []
    #for each row (zipcode in the state that contains businesses of the search type), calculate businesses per capita 
    for ind, location in df.iterrows():
        #print a progress report
        print('On row {} of {}'.format(ind, len(df)), end = '\r')
        
        #if people actually live in this zip code, calcultate the business density
        if(location.Pop != 0):
            heat = location['counts']/location.Pop
        #else, assign this as nan, we can drop this later, nobody lives there
        else:
            heat = float('NaN')
        #save the business density value in the list
        heatvalue.append(heat)
    #put together the new dataframe with our business density data
    new_df['Lat'] = df.Lat
    new_df['Long'] = df.Long
    new_df['City'] = df.City
    new_df['Pop'] = df.Pop
    new_df['heatvalue'] = heatvalue
    new_df = new_df.dropna()
    scaler = MinMaxScaler()
    scaler.fit(new_df['heatvalue'])
    #normalize the data from 1 to 0, the place with the highest density of target business type as 100, places with 
    #no businesses of the searched type as 0
    new_df['heatvalue'] = scaler.transform(new_df['heatvalue'].reshape(-1, 1))*100
    
    #dropcount = []
    #prepare the new dataframe that will contain lat long pairs
    latlongcounts = pd.DataFrame()
    #for each zip code
    for i, location in new_df.iterrows():
        #generate between 100 and 0 rows of lat long pairs depending on the density of the business type per capita
        for number in range(int(round(location.heatvalue))):
            #create a single entry for the heatmap to use as a measure of density
            temp = pd.DataFrame([[location.Lat, location.Long, location.City]], columns=['Lat', 'Long', 'City'])
            #add the entry to the dataframe
            latlongcounts = latlongcounts.append(temp)
    #make sure the index is sequential (it should already be but I like doing this)
    latlongcounts = latlongcounts.reset_index(drop=True)
    return latlongcounts

In [5]:
        search_results = yelp_api.search_query(term=term, location def run(location, term):
    #compile the list of populations by lat and long and retieve the  list of relavant zips
    pop_by_zip, zips = load_pop_data(location)
    #search yelp for all the entries we can find
    restaurants, center = search_yelp(location, term, zips)
    #initiate map data and center it on the middle of the state we are looking at
    gmap = gmplot.GoogleMapPlotter(center['latitude'], center['longitude'], 7.5)
    #because we are searching by zip with a range of 25 miles sometimes we will get duplicate entries 
    #drop entries will identical id values because we picked them up more than once by searching adjacent zips
    restaurants.drop_duplicates(subset = 'name', inplace = True)
    #set the count of each restuarant to 1 next to its entry, will be used by groupby function
    restaurants['counts'] = 1
    #group all the data by zip code so we know how many businesses are in each zip
    restaurantcounts = restaurants.groupby('Zipcode', as_index=False).sum()
    #drop the columns we aren't interested in
    restaurantcounts = restaurantcounts[['Zipcode', 'counts']]
    #prepare the zipcode to be merged by typing it as string
    restaurantcounts['Zipcode'] = [str(x)  for x in restaurantcounts['Zipcode']]
    #prepare the zipcode to be merged by typing it as string
    pop_by_zip['Zipcode'] = [str(x)  for x in pop_by_zip['Zipcode']]
    #merge the zipcode and population data with the restuarant counts by zip
    pop_by_zip_w_b_counts = pop_by_zip.merge(restaurantcounts, on='Zipcode', how='outer')
    #drop rows where there are nan values
    pop_by_zip_w_b_counts_clean = pop_by_zip_w_b_counts.dropna()
    #return a list of lats and longs that is generated by determining how many businesses of this type per capita
    entries_adj_pop = reformat_pop_entries(pop_by_zip_w_b_counts_clean)
    #plot out a heat map using lat and longs, the more businesses per capita the more times each lat long pair
    #is in the entries_adj_pop dataframe
    gmap.heatmap(entries_adj_pop.Lat, entries_adj_pop.Long, radius = 40)
    #prepair the filename based of search terms
    filename = term + '_in_' + location + ".html"
    #draw the file
    gmap.draw(filename)
    print('Map file saved as {}'.format(filename))
    #return the dataframe that contains the counts of restaurants in each zip/city for inspection if needed
    return pop_by_zip_w_b_counts_clean

In [11]:
allstuff = run(location = 'MI',term =  'Pizza')        search_results = yelp_api.search_query(term=term, location 

26 businesses were closed and not included in the data
0 businesses had no zip and were not included in the data
On row 503 of 748



On row 504 of 748On row 505 of 748On row 506 of 748On row 507 of 748On row 508 of 748On row 509 of 748On row 510 of 748On row 511 of 748On row 512 of 748On row 513 of 748On row 514 of 748On row 515 of 748On row 516 of 748On row 517 of 748On row 518 of 748On row 519 of 748On row 520 of 748On row 521 of 748On row 522 of 748On row 523 of 748On row 524 of 748On row 525 of 748On row 526 of 748On row 527 of 748On row 528 of 748On row 529 of 748On row 530 of 748On row 531 of 748On row 532 of 748On row 533 of 748On row 534 of 748On row 535 of 748On row 536 of 748On row 537 of 748On row 538 of 748On row 539 of 748On row 540 of 748On row 541 of 748On row 542 of 748On row 543 of 748On row 544 of 748On row 545 of 748On row 546 of 748On row 547 of 748On row 548 of 748On row 549 of 748On row 550 of 748On row 551 of 748On row 552 of 748On row 553 of 748On row 554 of 748On row 555 of 748On row 556 of 748On row 557 of 748On row 558 of 748On row 559

In [16]:
allstuff = run(location = 'CA',term =  'Pizza')

In [18]:
allstuff = run(location = 'PA',term =  'Chinese')

In [19]:
allstuff = run(location = 'PA',term =  'PIZZA')