In [1]:
import pandas as pd 
import json
from arcgis.features import FeatureLayer
import requests
import time

# API Call to Louisville Open Data. 

| Steps      | Description | Notes     |
| :---        |    :----   |          :--- |
| 1. Open Data API call      | API call for restaurant inspection scores | Works   |
| 2. Clean   | Remove unneeded data        | Drop columns and sort the data. Also need to keep only the most recent scores      |

In [None]:
# ArcGIS REST endpoint of the FoodServiceData feature layer (layer 0).
url = 'https://services1.arcgis.com/79kfd2K6fskCAkyg/arcgis/rest/services/FoodServiceData/FeatureServer/0'

batch_size = 1000  # Number of records to request per page
# Stop paging once at least this many records have been collected.
# It equals batch_size, so by default a single page is read; set it to None to page
# through every record in the layer.
max_records = 1000
offset = 0  # Index of the first record of the next page
data_list = []  # One attribute dict per downloaded feature

# Create the feature layer object
feature_layer = FeatureLayer(url)

while max_records is None or len(data_list) < max_records:
    # Query the feature layer with pagination
    query_result = feature_layer.query(
        where='1=1',
        out_fields='*',
        return_geometry=False,
        result_offset=offset,
        result_record_count=batch_size,
    )

    # Retrieve the features from the query result
    features = query_result.features

    # An empty page means there is nothing left to read
    if not features:
        break

    # Keep only the attribute dict of each feature (geometry was not requested)
    data_list.extend(feature.attributes for feature in features)

    # Advance by the number of rows actually returned rather than by batch_size:
    # if a page comes back shorter than requested, a fixed step would skip records.
    offset += len(features)

# Create a DataFrame from the data list
df = pd.DataFrame(data_list)

Verify that the data loaded correctly.

In [None]:
# Preview the first rows of the downloaded inspection data.
df.head()

Drop the extra columns we don't need.

In [None]:
# Columns to discard from the inspection data.
cols_drop = ['EstablishmentID', 'InspectionID', 'PlaceName', 'Address2', 'TypeDescription', 'NameSearch', 'Intersection']
# Reassign instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=cols_drop, errors='ignore')
df.head(2)

In [None]:
# (rows, columns) of the inspection frame at this point.
df.shape

Convert `InspectionDate` to datetime, sort newest first, and drop duplicate restaurants so that only the most recent inspection is kept.

In [None]:
# Parse InspectionDate into real timestamps.
# NOTE(review): ArcGIS feature services normally return date fields as epoch
# milliseconds; a bare pd.to_datetime() would read such integers as nanoseconds and
# produce 1970 dates. Confirm the field type of InspectionDate for this layer.
inspection_raw = df['InspectionDate']
if pd.api.types.is_numeric_dtype(inspection_raw):
    df['InspectionDate'] = pd.to_datetime(inspection_raw, unit='ms')
else:
    # Strings and already-parsed datetimes go through the default parser.
    df['InspectionDate'] = pd.to_datetime(inspection_raw)

# Newest inspection first, then keep the first (= most recent) row per location.
# The key includes Address so that separate locations sharing one name are not
# collapsed into a single row; the later join with Yelp is done on Address.
df = (
    df.sort_values('InspectionDate', ascending=False)
      .drop_duplicates(subset=['EstablishmentName', 'Address'], keep='first')
)
df.head(2)

In [None]:
# (rows, columns) of the inspection frame after de-duplication.
df.shape

In [None]:
# Number of remaining rows per establishment name.
df.value_counts('EstablishmentName')

In [None]:
# Order the inspections from the lowest score to the highest and preview the top rows.
df = df.sort_values('score')
df.head(5)

In [None]:
# df.to_json('../json_files/health.json')

# Yelp API 

Import my Yelp API key.

In [None]:
from akeys import api_key

In [None]:
# Yelp business-search endpoint, authenticated with the imported API key.
endpoint = "businesses/search"
url = f"https://api.yelp.com/v3/{endpoint}"
headers = {
    "Authorization": f"Bearer {api_key}"
}
params = {
    "term": "restaurants",
    "location": "Louisville, KY",
    "limit": 50  # Set the desired limit per request (maximum is 50)
}

restaurant_data = []  # List to store the extracted data

offset = 0  # Initial offset value
results_per_request = params["limit"]  # Results per request (50)
total_results = float("inf")  # Replaced by the reported total after the first response

while offset < total_results:
    params["offset"] = offset  # Set the offset parameter

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)

    # NOTE(review): Yelp appears to limit how deep offset paging may go; when that
    # limit is reached the request fails and this branch ends the loop - confirm
    # the exact limit in the API documentation.
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        break

    data = response.json()

    if offset == 0:
        total_results = data["total"]

    businesses = data["businesses"]

    # An empty page means there is nothing more to read, whatever "total" says.
    if not businesses:
        break

    for business in businesses:
        # `or` guards against keys that are present but null in the response.
        location = business.get("location") or {}
        restaurant_data.append({
            "Name": business.get("name"),
            "Rating": business.get("rating"),
            "Review Count": business.get("review_count"),
            "Price": business.get("price"),
            "Address": ", ".join(location.get("display_address") or []),
        })

    offset += results_per_request
    # Pause between requests, but not after the final page.
    if offset < total_results:
        time.sleep(2)

# Create a DataFrame from the extracted data
df2 = pd.DataFrame(restaurant_data)

# Show the first rows of the DataFrame
df2.head()

In [None]:
# Order the Yelp results by rating, lowest first.
df2 = df2.sort_values(by="Rating", ascending=True)
# Drop rows whose rating is exactly 0. The mask must come from df2 itself: `df` is
# the inspection frame, and its index does not line up with df2's rows.
df2 = df2.loc[df2["Rating"] != 0.0]
df2 = df2.reset_index(drop=True)
# Rescale by 20 so a 5.0 rating becomes 100 (presumably to compare with the
# inspection `score` - confirm). Round before the int cast so floating-point error
# cannot truncate a value down by one.
# NOTE: this reassigns df2 from itself, so re-running the cell rescales again.
df2['Rating'] = (df2['Rating'] * 20).round().astype(int)
df2.to_json('../json_files/yelp.json')
df2.head()

In [None]:
# Keep only the part of the Yelp address before the first comma.
df2['Address'] = df2['Address'].str.split(',').str[0]
# Rating is already multiplied by 20 in the cell above; repeating the multiplication
# here would scale it by 400 overall, so it is intentionally not done again.
df2.head()

In [None]:
# Put both address columns into the same lower-case, whitespace-trimmed form
# so they can be compared as join keys.
for frame in (df, df2):
    lowered = frame['Address'].str.lower()
    frame['Address'] = lowered.str.strip()

In [None]:
# Left-join the Yelp rows onto the inspection rows by address, order by Yelp
# rating (lowest first), and keep only the columns of interest.
report_columns = ['EstablishmentName', 'Address', 'score', 'Rating', 'Review Count']
joined = pd.merge(df, df2, how='left', on='Address')
ordered = joined.sort_values(by='Rating')
merged_df = ordered[report_columns]
merged_df.head(20)