In [110]:
# Dependencies
import hvplot.pandas
import pandas as pd
import numpy as np
import requests
import json
import random

# Import API key
from geoapify_key import api_key

In [7]:
# Read states.csv and create it as a Dataframe
states_loc = pd.read_csv('states.csv')
states_loc_df = pd.DataFrame(states_loc)
states_loc_df.head()

Unnamed: 0,state,latitude,longitude,name
0,AK,63.588753,-154.493062,Alaska
1,AL,32.318231,-86.902298,Alabama
2,AR,35.20105,-91.831833,Arkansas
3,AZ,34.048928,-111.093731,Arizona
4,CA,36.778261,-119.417932,California


In [38]:
# Read states_area.csv and merge with states.csv
states_area = pd.read_csv('states_area.csv')
states_area_df = pd.DataFrame(states_area)

# Format Dataframe for same column name and area values to int
states_area_df.rename(columns={"state": "name", 
                               "Total Area (sq km)": "area (sq km)"}, 
                               inplace=True)
states_area_df['area (sq km)'] = [int(area.replace(",", "")) for area in states_area_df['area (sq km)']]
states_df = pd.merge(states_loc_df,states_area_df, on='name')
states_df.head()

Unnamed: 0,state,latitude,longitude,name,area (sq km),lon1,lat1,lon2,lat2
0,AK,63.588753,-154.493062,Alaska,1723337,-130.0,51.333333,172.0,71.833333
1,AL,32.318231,-86.902298,Alabama,135767,-84.883333,30.183333,-88.466667,35.0
2,AR,35.20105,-91.831833,Arkansas,137732,-89.65,33.0,-94.616667,36.5
3,AZ,34.048928,-111.093731,Arizona,295234,-109.05,31.333333,-114.816667,37.0
4,CA,36.778261,-119.417932,California,423967,-114.133333,32.533333,-124.433333,42.0


In [95]:
# Set base URLs for Places API and Geocode API
places_url = 'https://api.geoapify.com/v2/places?'
geocode_url = 'https://api.geoapify.com/v1/geocode/search?'

# Select the categories to search and set columns for them
categories_interested = ["commercial", "service", "tourism", "religion", "beach", "production", "leisure"]
for category in categories_interested:
    states_df[category] = None

# Counter for number of states with no data in a given category
states_missing = 0

# Initialize API search
print("Starting Services Search")
print('-'*80)

for index, row in states_df.iterrows():
    
    print(f"Searching {states_df.loc[index, 'name'].upper()}")
    # Get coordinate values for filter and bias parameters
    longitude = row['longitude']
    latitude = row['latitude']
    lon1 = row['lon1']
    lat1 = row['lat1']
    lon2 = row['lon2']
    lat2 = row['lat2']
    for category in categories_interested:
        
        # Set parameters for Places API response
        params = {
            "categories": category,
            "apiKey": api_key,
            "limit": 10,
            "filter": f"rect:{lon1},{lat1},{lon2},{lat2}",
            "bias": f"proximity:{longitude},{latitude}"
        }
        places_json = requests.get(places_url, params=params).json()

        # Check the returned list and break if the length is 0
        if int(len(places_json["features"])) == 0:
            print(f"No results found for {states_df.loc[index, 'name']} in {category}")
            print('-'*80)
            states_missing += 1
            break
        else:
        
            # Counter for number of results with mismatch in state
            result_wrong = 0

            # Check that the place is in the respective state
            for result in places_json['features']:
                try:
                    if result['properties']['state'] != states_df.loc[index, 'name']:
                        print(f"{result['properties']['state']} is not in the state of {states_df.loc[index, 'name']}!")
                        result_wrong += 1
                        break
                
                except:
                    result_wrong += 1
                    break
                else:
                    
                    # Use the Geocode API with place_id to obtain popularity index
                    for geo_result in places_json['features']:
                        place_id = geo_result['properties']['place_id']
                        try:
                            geo_params = {
                                    "apiKey": api_key,
                                    "text": geo_result['properties']['formatted'],
                                    "limit": 2,
                                    "filter": f"place:{place_id}",
                                }
                        
                        except KeyError:
                            geo_params = {
                                    "apiKey": api_key,
                                    "state": result['properties']['state_code'],
                                    "limit": 2,
                                    "filter": f"place:{place_id}",
                                }
                        geocode_json = requests.get(geocode_url, params=geo_params).json()
                        
                        # Setup average popularity list
                        geo_popularity = [0]
                        for geo_feature in geocode_json['features']:
                            if ((geo_feature['properties']['rank']['confidence'] == 1) | (geo_feature['properties']['rank']['match_type'] == "full_match") | (geo_feature['properties']['rank']['match_type'] == "inner_part")):
                                try:
                                    geo_popularity.append(float(geo_feature['properties']['rank']['popularity']))
                                    
                                    if float(geo_feature['properties']['rank']['popularity']) > max(geo_popularity):
                                        max_popularity = float(geo_feature['properties']['rank']['popularity'])
                                    min_popularity = float(geo_feature['properties']['rank']['popularity'])
                                
                                except KeyError:
                                    geo_popularity.append(0)

                        try:
                            states_df.loc[index, f'{category}_popularity'] = sum(geo_popularity) / len(geo_popularity)
                        
                        except ZeroDivisionError:
                            continue
                    
            try:
                states_df.loc[index, category] = places_json['features'][0]['properties']['name']
            except KeyError:
                try:
                    states_df.loc[index, category] = places_json['features'][0]['properties']['formatted']
                except KeyError:    
                    states_df.loc[index, category] = None
            if result_wrong != 0:
                print(f'Number of wrong results in {category}: {result_wrong}')
            
    else:
        print('-'*80)
        continue
    # break
print("Ending Services Search")
print(f"Number of Places API response with no data: {states_missing}")

empty_cols = [col for col in states_df.columns if ((states_df[col].isnull().all()) | (states_df[col].all() == None) | (states_df[col].count() < 25))]
states_df.drop(empty_cols, axis= 1, inplace=True)

Starting Services Search
--------------------------------------------------------------------------------
Searching ALASKA
No results found for Alaska in commercial
--------------------------------------------------------------------------------
Searching ALABAMA
No date keyword found for Alabama in commercial
No date keyword found for Alabama in service
Number of wrong results in tourism: 1
No date keyword found for Alabama in religion
Number of wrong results in religion: 1
No results found for Alabama in beach
--------------------------------------------------------------------------------
Searching ARKANSAS
No date keyword found for Arkansas in commercial
Number of wrong results in commercial: 1
No date keyword found for Arkansas in service
Number of wrong results in service: 1
No date keyword found for Arkansas in tourism
Number of wrong results in tourism: 1
No date keyword found for Arkansas in religion
Number of wrong results in religion: 1
No date keyword found for Arkansas in 

In [154]:
# RUNNING THIS CELL WILL COPY THE states_df AND PERFORM A RANDOM NUMBER SAMPLING

states_popularity_df = states_df.copy()

# Pick the columns with the popularity for each category searched
states_popularity_df = states_popularity_df[['state', 'name', 'commercial_popularity', 'service_popularity', 'religion_popularity', 'tourism_popularity', 'production_popularity', 'leisure_popularity']]

for index, row in states_popularity_df.iterrows():
    count_nan = 0
    for pop in states_popularity_df.columns[2:]:
        if np.isnan(states_popularity_df.loc[index, pop]):
            states_popularity_df.at[index, pop] = random.uniform(0, states_popularity_df.loc[:, pop].max() / 5)
            count_nan += 1   
    print(f"Number of NaN values for {states_popularity_df.loc[index, 'name']}: {count_nan}")

states_popularity_df.head()

Number of NaN values for Alaska: 6
Number of NaN values for Alabama: 3
Number of NaN values for Arkansas: 0
Number of NaN values for Arizona: 2
Number of NaN values for California: 0
Number of NaN values for Colorado: 0
Number of NaN values for Connecticut: 0
Number of NaN values for Delaware: 0
Number of NaN values for Florida: 0
Number of NaN values for Georgia: 2
Number of NaN values for Hawaii: 0
Number of NaN values for Iowa: 1
Number of NaN values for Idaho: 2
Number of NaN values for Illinois: 0
Number of NaN values for Indiana: 0
Number of NaN values for Kansas: 0
Number of NaN values for Kentucky: 0
Number of NaN values for Louisiana: 2
Number of NaN values for Massachusetts: 0
Number of NaN values for Maryland: 1
Number of NaN values for Maine: 0
Number of NaN values for Michigan: 2
Number of NaN values for Minnesota: 0
Number of NaN values for Missouri: 3
Number of NaN values for Mississippi: 2
Number of NaN values for Montana: 2
Number of NaN values for North Carolina: 0
Nu

Unnamed: 0,state,name,commercial_popularity,service_popularity,religion_popularity,tourism_popularity,production_popularity,leisure_popularity
0,AK,Alaska,0.080008,0.030888,0.115833,0.201822,0.083749,0.019388
1,AL,Alabama,0.0,0.0,0.0,0.213146,0.206775,0.260133
2,AR,Arkansas,0.0,0.0,0.742974,0.77075,0.760664,0.760625
3,AZ,Arizona,0.0,0.0,0.0,0.0,0.413324,0.179727
4,CA,California,0.821187,0.0,0.0,1.076621,0.0,0.0


# Test Request Below

In [158]:
# test_places_url = 'https://api.geoapify.com/v2/places?'

# # Select the categories that you want to search and test it in California
# test_params = {
#             "categories": "beach",
#             "apiKey": api_key,
#             "limit": 10,
#             "filter": f"rect:{states_df['lon1'][4]},{states_df['lat1'][4]},{states_df['lon2'][4]},{states_df['lat2'][4]}",
#         }
# test_json = requests.get(test_places_url, params=test_params).json()
# # print(json.dumps(test_json, indent= 4, sort_keys=True))

# test_data = []
# for feature in test_json['features']:
#     test_placeID = feature['properties']['place_id']
#     test_geocode_url = 'https://api.geoapify.com/v1/geocode/search?'
#     try:
#         test_params_geocode = {
#                 "apiKey": api_key,
#                 "text": feature['properties']['formatted'],
#                 "limit": 1,
#                 "filter": f"place:{test_placeID}",
#             }
#     except KeyError:
#         test_params_geocode = {
#                 "apiKey": api_key,
#                 "text": feature['properties']['categories'],
#                 "limit": 1,
#                 "filter": f"place:{test_placeID}",
#             }
#     test_geocode_json = requests.get(test_geocode_url, params=test_params_geocode).json()
#     for geo_feature in test_geocode_json['features']:
#         if ((geo_feature['properties']['rank']['confidence'] == 1) | (geo_feature['properties']['rank']['match_type'] == "full_match") | (geo_feature['properties']['rank']['match_type'] == "match_by_building")):
#             try:
#                 test_data.append(geo_feature['properties']['rank']['popularity'])
#                 print(geo_feature['properties']['rank']['popularity'])
#             except KeyError:
#                 test_data.append(None)
        
#     print(json.dumps(test_geocode_json, indent= 4, sort_keys=True))

{
    "features": [
        {
            "bbox": [
                -118.9015576,
                36.787263,
                -118.9006472,
                36.7885309
            ],
            "geometry": {
                "coordinates": [
                    -118.90095516756642,
                    36.7878554
                ],
                "type": "Point"
            },
            "properties": {
                "address_line1": "Sandy Cove Beach",
                "address_line2": "Fresno County, CA, United States of America",
                "category": "beach",
                "country": "United States",
                "country_code": "us",
                "county": "Fresno County",
                "datasource": {
                    "attribution": "\u00a9 OpenStreetMap contributors",
                    "license": "Open Database License",
                    "sourcename": "openstreetmap",
                    "url": "https://www.openstreetmap.org/copyright"
                },
