In [2]:
import os
import pandas as pd
import barnum
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census
from us import states


# Yelp API key in secrets.py; .gitignore prevents the secrets.py from being pushed to GitHub
#from secrets import yelpKey
#from secrets import api_key


<h3/>ZIP Code Generator </h3>
generates random zip codes within a specified range




In [4]:
numTimes = 10
zip_codes = []
i = int
hasAllZipcodes = False

while hasAllZipcodes == False:
    theZipCode = barnum.create_city_state_zip()[0]
    numZipCode = int(theZipCode)
    if (numZipCode >= 90001) and (numZipCode <= 93005):
        zip_codes.append(theZipCode)
    if (numTimes == len(zip_codes)):
        hasAllZipcodes = True 
 
zip_codes=pd.Series(zip_codes)
zip_codes.to_csv('zip_code_list.csv')
print(zip_codes)

0    92325
1    91755
2    91841
3    92091
4    92121
5    91914
6    92356
7    92862
8    90081
9    92805
dtype: object


<h3/>Yelp Data Collection:</h3>
Queries the Yelp search API to get those businesses that 'gluten_free' category for a given zip code.
https://www.yelp.com/developers/documentation/v3/business_search

In [5]:
def getDataForZipcode(searchZipCode):
    ''' Searches the Yelp API to get the business that satisfy the 'gluten_free' search term
    
    Accepts : searchZipCode (str) zip code to search for records within
    
    Returns : (dictionary) contains information of business for the zip code
                ID: Unique Yelp ID for the business
                Name: Name of the business
                ZipCode: Location of the business
                Latitude: coordinate of the business location
                Longitude: coordinate of the business location
                Price: Price level of the business. Value is one of $, $$, $$$, $$$$ and NA
                Rating: Rating for this business (value ranges from 1, 1.5, ... 4.5, 5)
    '''
    
    #- Prepare Search
    # Source Url
    baseYelpUrl = "https://api.yelp.com/v3/businesses/search"

    # API Key passed through header
    headers = {
            'Authorization': 'Bearer %s' % yelpKey,
    }
    
    # Search Term
    searchTerm = 'gluten_free'
    
    # Dictionary stores data
    yelpData = {
        'ID': [],
        'Name': [],
        'Zipcode': [],
        'Latitude': [],
        'Longitude': [],
        'Price' : [],
        'Rating' : []
    }
    
    
    #- Display Message
    print(" ")
    print(f"Started search of Yelp API zip code: {searchZipCode}...")
    
    
    #- Search
    #  API limits 50 records being returned at once; must loop and request offset of results to get all records
    recordLimit = 50
    currentOffset = 0
    hasMoreData = True
    
    while hasMoreData == True:
        
        #- Prepare Parameters
        parameters = {
            'location': searchZipCode,
            'term': 'gluten_free',
            'limit': recordLimit,
            'radius': 3000,
            'offset': currentOffset,
            }
        
        
        #- Request
        print(f"  Requesting data. Offset: {currentOffset}")
        
        response = requests.request('GET', baseYelpUrl, headers=headers, params=parameters)
        
        
        #- Check Response
        if (response.status_code == requests.codes.ok):
            
            # Get Json from Response
            responseJson = response.json()
            
            
            # Search Businesses
            for business in responseJson['businesses']:
                
                # Determine Use Business
                useBusiness = checkBusinessForUsage(business, searchZipCode, searchTerm)
                
                if (useBusiness == True):
                    
                    # Populate Dictionary with Business Information
                    yelpData['ID'].append(business['id'])
                    yelpData['Name'].append(business['name'])
                    yelpData['Zipcode'].append(business['location']['zip_code'])
                    
                    yelpData['Latitude'].append(business['coordinates']['latitude'])
                    yelpData['Longitude'].append(business['coordinates']['longitude'])
                    
                    yelpData['Price'].append(getPriceForBusiness(business))
                    yelpData['Rating'].append(business['rating'])
          
        
        #- Prepare for Next search
        # API only supports 50 records at a time; must query with offset
        currentOffset = (currentOffset + recordLimit)
        
        if (currentOffset > responseJson['total']):
            print(f"Collected all data. Current Offset: {currentOffset}  Total: {responseJson['total']}")
            hasMoreData = False
    
                  
    #- Metadata on Data
    print(" ")
    print(f"Search Zipcode: {searchZipCode}")
    print(f"Total businesses: {len(yelpData['ID'])}")
               
          
    #- Return data from function
    return yelpData


In [6]:
def checkBusinessForUsage(businessInfo, searchZipcode, searchTerm):
    ''' Determines if the business can be used in the Analysis
    
    Accepts : businessInfo (dictionary) contains the metadata for individual business 
                searchZipCode (str) zip code searching for data within
                searchTerm (str) category used with the search; ensure found
    
    Return : bool TRUE- business meets critera, able to use FALSE- unable to use business
    '''
    
    #- Check Within Search Zipcode
    businessZipCode = businessInfo['location']['zip_code']
    
    if (businessZipCode != searchZipcode):
        return False
    
    
    
    # NOTE: Found that when checking for the category there were very few records that satisfy this
    
    #- Check Search Term
#     foundSearchTerm = False
    
#     for category in businessInfo['categories']:
        
#         if (category['alias'] == searchTerm):
#             foundSearchTerm = True
#             break
    
    
    return True
     

In [7]:
def getPriceForBusiness(businessInfo):
    ''' Gets the price for a business; not all businesses contain this property within the JSON;
    when not found just uses NA.
    
    Accepts : businessInfo (dictionary) metadata on an individual business
    
    Returns : (num) value from price tag
    '''
    try:
        
        return businessInfo['price']
    
    except:
        return 'NA'

In [9]:
#-- Collect Data, Multiple Datasets

#- UserName
#userName = "Connie"

#- Get File of Random Zipcodes
print("--->")
print("Getting random zipcodes...")



#fileName = f'RandomZipCodes_{userName}.csv'

#randomZipcodesPath = os.path.join(".", "Output", fileName)

randomZipcodes_df = pd.read_csv("zip_code_list.csv")


#- Collect Data
for index, row in randomZipcodes_df.iterrows():
    
    #- Get Zipcode
    searchZipCode = str(row[1])

    
    #- Get Data from Yelp
    yelpDataForZipCode = getDataForZipcode(searchZipCode)


    #- Create DataFrame
    yelpData_df = pd.DataFrame(yelpDataForZipCode)

    
    #-- Export Data
    dataExportPath = os.path.join(".", "Output", f"YelpData_{searchZipCode}.csv")

    yelpData_df.to_csv(dataExportPath)

    print(f"Exported data to disk. Path: {dataExportPath}")

    
#- Complete Message
print(" ")
print("Completed getting information for zipcodes")


--->
Getting random zipcodes...
 
Started search of Yelp API zip code: 91755...
  Requesting data. Offset: 0
  Requesting data. Offset: 50
  Requesting data. Offset: 100
  Requesting data. Offset: 150
  Requesting data. Offset: 200
Collected all data. Current Offset: 250  Total: 201
 
Search Zipcode: 91755
Total businesses: 5
Exported data to disk. Path: ./Output/YelpData_91755.csv
 
Started search of Yelp API zip code: 91841...
  Requesting data. Offset: 0
  Requesting data. Offset: 50
  Requesting data. Offset: 100
  Requesting data. Offset: 150
  Requesting data. Offset: 200
  Requesting data. Offset: 250
Collected all data. Current Offset: 300  Total: 270
 
Search Zipcode: 91841
Total businesses: 0
Exported data to disk. Path: ./Output/YelpData_91841.csv
 
Started search of Yelp API zip code: 92091...
  Requesting data. Offset: 0
  Requesting data. Offset: 50
Collected all data. Current Offset: 100  Total: 69
 
Search Zipcode: 92091
Total businesses: 0
Exported data to disk. Path: 

In [10]:
#-- Collect Data

# Zip Code to search
# Able to pass this as function in future
searchZipCode = "92104"


#- Get Data From Yelp
yelpDataForZipCode = getDataForZipcode(searchZipCode)


#- Create DataFrame
yelpData_df = pd.DataFrame(yelpDataForZipCode)



 
Started search of Yelp API zip code: 92104...
  Requesting data. Offset: 0
  Requesting data. Offset: 50
  Requesting data. Offset: 100
  Requesting data. Offset: 150
  Requesting data. Offset: 200
  Requesting data. Offset: 250
Collected all data. Current Offset: 300  Total: 271
 
Search Zipcode: 92104
Total businesses: 119


In [11]:
#-- Preview Data
yelpData_df.head(200)

Unnamed: 0,ID,Name,Zipcode,Latitude,Longitude,Price,Rating
0,0JmO9IUPf4qHa-lqTe8c3Q,Curryosity,92104,32.729480,-117.129270,,4.0
1,FRfUgOKobN0foq12s9OUTA,Mike Hess Brewing Company,92104,32.747702,-117.128543,$,4.5
2,jLND0q_AFpsxY8CQ3njKeQ,Bread and Cheese Catering,92104,32.750252,-117.136963,,5.0
3,nap_QrXJTEzJgj0_VbxBBg,619 Spirits North Park,92104,32.750151,-117.129790,$$,4.5
4,-BZ9L7G1OOUsb_uMYYuYeg,Butter Up Bakery,92104,32.750126,-117.137421,$$,5.0
5,KX1aYCfF_bKSSu5lCrgQwA,Couch Potatoes,92104,32.730572,-117.113571,,5.0
6,YLV7ZZoyOAXW3eqidGXBZA,The Smoking Goat,92104,32.741353,-117.130230,$$,4.0
7,H3XluJQlArmynJc2YBErzg,Thorn St Brewery,92104,32.739350,-117.125484,$,4.5
8,2viC5B-psszDSGKHkp8I2Q,Dunedin New Zealand Eats,92104,32.742340,-117.129860,$$,4.0
9,xyw7ytvqkXw7LUJuBStn4g,Nomad Donuts,92104,32.748720,-117.127480,$$,4.0


In [12]:
#-- Export Data
#dataExportPath = os.path.join(".", "Output", f"YelpData_{searchZipCode}.csv")

#yelpData_df.to_csv(dataExportPath)

#print(f"Exported data to disk. Path: {dataExportPath}")

Exported data to disk. Path: ./Output/YelpData_92104.csv


Additional Yelp data.


In [50]:
#-- Get List of Files on Disk
outputPath = os.path.join(".", "Output")

files = os.listdir(outputPath)


#-- Load into Single DataFrame
yelpData_df = None

for file in files:
    
    if (file.startswith("YelpData_") == True):
        
        #- Get DataFrame
        filePath = os.path.join(".", "Output", file)
    
        zipCodeYelpData_df = pd.read_csv(filePath)
        
        
        if (yelpData_df == None):
            yelpData_df = zipCodeYelpData_df
            
        else:
            yelpData_df = pd.concat(zipCodeYelpData_df, yelpData_df)
        
        
        
       
    

TypeError: Could not compare [None] with block values

In [49]:
def getSummarizedYelpData(fileName):
    '''
    
    Accepts : filePath (str) path to the csv that contains the information for the zipcode
    
    Returns : (dictionary) contains the results from Yelp data
                "df" - DataFrame from the csv file
                "summary" - dictionary that contains the summary information
                    "TotalRecords" - total number of records for zip code
                    "Rating_$" 
                    "Rating_$$"
                    "Rating_$$$"
                    "Rating_$$$"
                    "Rating_NA"
                    
    '''
    
    #-- Read File
    filePath = os.path.join(".", "Output", fileName)
    
    yelpData_df = pd.read_csv(filePath)
    
    
    #-- Summarize Data
    
    
    
    
    
    
    #-- Create Result Dictionary
    # Used to return multiple objects to calling function
    results = {
        "df": yelpData_df
    }
    
    return results


<h3/>Summarizing Yelp Data<h3>

Unnamed: 0.1,Unnamed: 0,ID,Name,Zipcode,Latitude,Longitude,Price,Rating


In [46]:
#-- Get List of Files on Disk
outputPath = os.path.join(".", "Output")

files = os.listdir(outputPath)


#-- Load into Single DataFrame
yelpData_df is None

for file in files:
    
    if (file.startswith("YelpData_") == True):
        
        #- Get DataFrame
        filePath = os.path.join(".", "Output", file)
    
        zipCodeYelpData_df =(pd.read_csv(filePath))
        
        
        if (yelpData_df is None):
            yelpData_df = zipCodeYelpData_df
            
        else:
            yelpData_df = pd.concat(zipCodeYelpData_df, yelpData_df)
        
        
     
       
    

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [33]:
yelpData_df

Unnamed: 0.1,Unnamed: 0,ID,Name,Zipcode,Latitude,Longitude,Price,Rating


<h3/>Census Bureau API</h3>
Retrieves specified demographic data from the 2016 American Community Survey



In [None]:
c = Census(api_key, year=2016)

#API call

census_data = c.acs5.get(("NAME","B01003_001E",
                          "B19001_017E",
                          "B19113_001E",
                          "B25002_002E"),{'for':'zip code tabulation area:*'})

# Convert to DataFrame
census_df = pd.DataFrame(census_data)

# Column Rename
census_df = census_df.rename(columns={"B01003_001E": "Population",
                                     "B25002_002E": "Households",
                                     "B19113_001E": "Median family income",
                                     "B19001_017E":"Households with household income $200,000 or more",
                                     "NAME": "Name", "zip code tabulation area": "Zipcode"})

census_df.sort_values("Median family income", ascending=True).head()


In [None]:
#data munging
cleaned_census_df = census_df[census_df['Median family income'] > 0 ] 
cleaned_census_df.sort_values("Median family income", ascending=True).head()
income_df = cleaned_census_df.copy()

In [None]:
#make a csv file
income_df.to_csv('incomebyzip.csv')

income_df.head()