In [None]:
# Author: Edgar Alfonseca
# LinkedIn: https://www.linkedin.com/in/edgar-alfonseca/
# GitHub: https://github.com/edgaralfonseca
#
# This Python script batch geocodes addresses using the NYC OTI Geoclient API v1.0
#
#
# API description: https://api-portal.nyc.gov/api-details#api=geoclient&operation=geoclient
# v1.0 documentation: https://api.nyc.gov/geoclient/v1/doc/
# GitHub repo: https://github.com/CityOfNewYork/geoclient

# Pre-requisites
#
# 1) Create a new account on the NYC API Developers Portal: https://api-portal.nyc.gov/
# 2) Request an API key by subscribing to the Geoclient API in the portal


# Notes
# The OTI geoclient api handles 2,500 requests per minute / 500,000 requests per day
# NYC Department of City Planning's Geosupport (https://www.nyc.gov/site/planning/data-maps/open-data/dwn-gde-home.page) is used to power Geoclient
# There can be a delay of several weeks before Geoclient reflects what is in Geosupport
# Geoclient serves up a subset of attributes whereas Geosupport has all attributes

In [None]:
# Import necessary python modules and prepare data

import os
import time

import numpy as np
import pandas as pd
import requests

In [None]:
# Import sample NYC address data (close to 6k records)

url = "https://raw.githubusercontent.com/edgaralfonseca/python-oti-geoclient-api-v1/main/nyc_sample_almost_6k_addresses.csv"

nyc_address_df = pd.read_csv(url)

# Minor data cleaning on the postcode (zip code): keep only the first five
# characters of its text form (presumably to drop a trailing '.0' or a ZIP+4
# suffix -- verify against the CSV).
# Missing postcodes are kept as NaN; a plain astype(str) would turn them into
# the literal string 'nan', which would then be sent to the API as a zip code.

postcode_as_text = nyc_address_df['postcode'].astype(str).str[:5]
nyc_address_df['postcode'] = postcode_as_text.where(nyc_address_df['postcode'].notna())

In [None]:
# Preview the first 10 rows to confirm the CSV loaded with the expected columns
nyc_address_df.head(10)

Unnamed: 0,row_id,house_number,street_name,borough,postcode
0,1,114,SEIGEL STREET,Brooklyn,11206
1,2,1920,UNION STREET,Brooklyn,11233
2,3,2555,WILLIAMSBRIDGE ROAD,Bronx,10469
3,4,763,JENNINGS STREET,Bronx,10459
4,5,275,PRESIDENT STREET,Brooklyn,11231
5,6,1402,NEW YORK AVENUE,Brooklyn,11210
6,7,658,DRIGGS AVENUE,Brooklyn,11211
7,8,740,EAST 222 STREET,Bronx,10467
8,9,390,1 AVENUE,Manhattan,10010
9,10,91,VISITATION PLACE,Brooklyn,11231


**Example 1: Calling the OTI Geoclient "Address" API endpoint**

Create a custom function that takes a pandas dataframe (what you want to geocode) as an input and creates an output that is a copy of your original dataframe left joined to the API results.

In [None]:
# Create a custom function to make API calls to the 'Address' endpoint

def oti_geoclient_api_address_endpoint(api_endpoint, headers, df_name, df_key_field, housenum_input_col, street_input_col, boro_input_col=None, zip_input_col=None, response_columns=None, requests_per_minute=2500, timeout=30):
    """
    Geocode every row of a DataFrame with the OTI Geoclient 'Address' endpoint.

    One GET request is sent per input row. The 'address' object of each
    response is turned into columns and attached to a copy of the input rows.
    Rows whose request fails (non-200 status, network error, unparsable JSON,
    or no 'address' object) get NaN in every response column.

    Parameters:
    - api_endpoint (str): The API endpoint URL.
    - headers (dict): The headers to send with every API request.
    - df_name (pd.DataFrame): The input pandas DataFrame.
    - df_key_field (str): The name of the primary key column in the DataFrame.
      It must exist in df_name. Responses are aligned to the input rows by
      position, so duplicate or missing keys no longer multiply output rows.
    - housenum_input_col (str): Column that provides the house number.
    - street_input_col (str): Column that provides the street name.
    - boro_input_col (str): Optional. Column that provides the borough
      (required if zip is not given).
    - zip_input_col (str): Optional. Column that provides the zip code
      (required if borough is not given).
    - response_columns (list of str): Optional. API response fields to keep.
      A requested field that the API never returned becomes an all-NaN column
      instead of raising KeyError.
    - requests_per_minute (int | float | None): Throttle applied between
      requests (default 2500). None or a non-positive value disables the pause.
    - timeout (float | None): Seconds to wait for each HTTP request before it
      is treated as failed (default 30). None waits indefinitely.

    Returns:
    - pd.DataFrame: The input columns followed by the response columns, one row
      per input row, with a fresh 0..n-1 index. Column names present on both
      sides get pandas' '_x' / '_y' suffixes. If no request returned any data,
      an unmodified copy of df_name is returned instead.

    Raises:
    - KeyError: If df_key_field or one of the input columns is not in df_name.
    """

    def _is_missing(value):
        # None, NaN/NA and blank strings must never reach the query string:
        # NaN is truthy and would otherwise be serialized as the text 'nan'.
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    def send_request(session, house_number, street, borough=None, zip_code=None):
        candidate_params = {
            'houseNumber': house_number,
            'street': street,
            'borough': borough,
            'zip': zip_code,
        }
        params = {name: value for name, value in candidate_params.items() if not _is_missing(value)}

        try:
            # The session already carries the headers; the timeout keeps one
            # stalled connection from blocking the whole batch.
            response = session.get(api_endpoint, params=params, timeout=timeout)
            if response.status_code != 200:
                return {}
            payload = response.json()
        except (requests.RequestException, ValueError):
            # Best effort: a failed or unparsable request yields an empty row.
            return {}

        address = payload.get('address') if isinstance(payload, dict) else None
        return address if isinstance(address, dict) else {}

    # Resolve every input column before opening a connection so that a
    # misspelled column name fails fast.
    if df_key_field not in df_name.columns:
        raise KeyError(df_key_field)
    row_count = len(df_name)
    house_numbers = df_name[housenum_input_col].tolist()
    streets = df_name[street_input_col].tolist()
    boroughs = df_name[boro_input_col].tolist() if boro_input_col else [None] * row_count
    zip_codes = df_name[zip_input_col].tolist() if zip_input_col else [None] * row_count

    # Pause between requests needed to stay within the rate limit
    throttled = bool(requests_per_minute) and requests_per_minute > 0
    delay_per_request = 60 / requests_per_minute if throttled else 0

    # Send requests sequentially; the context manager closes the session even
    # if an unexpected error interrupts the loop.
    results = []
    with requests.Session() as session:
        session.headers.update(headers)
        for house_number, street, borough, zip_code in zip(house_numbers, streets, boroughs, zip_codes):
            results.append(send_request(session, house_number, street, borough, zip_code))
            if delay_per_request:
                time.sleep(delay_per_request)

    # If all results are empty, return the original DataFrame
    if not any(results):
        return df_name.copy()

    # One row per request; empty dicts become all-NaN rows, so positions match
    response_df = pd.DataFrame(results)

    if response_columns:
        # reindex (rather than []) tolerates fields missing from every response
        response_df = response_df.reindex(columns=list(response_columns))

    # The key column always comes from the input data, never from the API
    response_df = response_df.drop(columns=[df_key_field], errors='ignore')

    # Align by position: row i of the response belongs to row i of the input
    return df_name.reset_index(drop=True).merge(response_df, left_index=True, right_index=True, how='left')

In [None]:
# Draw a reproducible random sample of up to 1,000 addresses to geocode.
# The sample size is capped at the number of available rows so this cell also
# works when a smaller input dataset is substituted.

address_input_df = nyc_address_df.sample(n=min(1000, len(nyc_address_df)), random_state=1).copy()

In [None]:
# Prepare parameters for API

# Read the subscription key. The OTI_GEOCLIENT_API_KEY environment variable
# takes precedence; if it is not set, fall back to the key file below (the
# path looks like a Google Colab working directory -- adjust it when running
# elsewhere). The key itself is never written into the notebook.

api_key_file = '/content/OTI geoclient API primary key.txt'

subscription_key = os.environ.get('OTI_GEOCLIENT_API_KEY', '').strip()
if not subscription_key:
    with open(api_key_file, 'r') as file:
        subscription_key = file.read().strip()

# Set the headers with the subscription key
headers_param = {
    'Cache-Control': 'no-cache',
    'Ocp-Apim-Subscription-Key': subscription_key
}

address_api_url_param = "https://api.nyc.gov/geo/geoclient/v1/address.json"

# API response fields to keep in the geocoded output
search_return_columns_to_keep = ['bbl', 'bblBoroughCode', 'bblTaxBlock',
    'bblTaxLot', 'buildingIdentificationNumber', 'latitude', 'longitude',
    'xCoordinate', 'yCoordinate', 'communityDistrict', 'communityDistrictNumber',
    'geosupportFunctionCode',
    'geosupportReturnCode', 'geosupportReturnCode2', 'returnCode1a', 'returnCode1e'
]

In [None]:
# Call the API using the custom function.
# One request is sent per row of address_input_df, with a short pause between
# requests, so this cell takes a while to run.

oti_api_address_output_df = oti_geoclient_api_address_endpoint(
    api_endpoint= address_api_url_param,
    headers= headers_param,
    df_name= address_input_df,
    df_key_field='row_id',
    housenum_input_col = 'house_number', street_input_col = 'street_name' , boro_input_col= 'borough' , zip_input_col= 'postcode',
    response_columns= search_return_columns_to_keep)

In [None]:
# Review the API output dataframe: the original input columns followed by the
# response columns that were kept

oti_api_address_output_df.head(10)

Unnamed: 0,row_id,house_number,street_name,borough,postcode,bbl,bblBoroughCode,bblTaxBlock,bblTaxLot,buildingIdentificationNumber,...,longitude,xCoordinate,yCoordinate,communityDistrict,communityDistrictNumber,geosupportFunctionCode,geosupportReturnCode,geosupportReturnCode2,returnCode1a,returnCode1e
0,2597,141,5 AVENUE,Brooklyn,11217,3009470011,3,947,11,3019401,...,-73.97923,990011,186368,306,6,1B,0,0,0,0
1,4698,305,EAST HOUSTON STREET,Manhattan,10002,1003500056,1,350,56,1004268,...,-73.983445,988839,202084,103,3,1B,0,0,0,0
2,3978,411,EAST 10 STREET,Manhattan,10009,1003820100,1,382,100,1078024,...,-73.97691,990650,203643,103,3,1B,0,0,0,0
3,2237,1597,NEW YORK AVENUE,Brooklyn,11210,3075610037,3,7561,37,3428759,...,-73.944801,999571,170098,317,17,1B,0,0,0,0
4,2295,511,EAST 20 STREET,Manhattan,10010,1009780001,1,978,1,1083689,...,-73.97734,990530,206629,106,6,1B,0,1,1,0
5,2739,253,NOSTRAND AVENUE,Brooklyn,11205,3017847502,3,1784,7502,3426325,...,-73.951515,997696,190727,303,3,1B,0,0,0,0
6,2409,1224,JEROME STREET,Brooklyn,11239,3044520213,3,4452,213,3421577,...,-73.876619,1018485,177486,305,5,1B,0,0,0,0
7,3946,45-57,DAVIS STREET,Queens,11101,4000850030,4,85,30,4000715,...,-73.944615,999597,210435,402,2,1B,0,0,0,0
8,1265,210,WEST 150 STREET,Manhattan,10039,1020350001,1,2035,1,1084147,...,-73.937404,1001574,239877,110,10,1B,0,0,0,0
9,3170,864,49 STREET,Brooklyn,11220,3056370032,3,5637,32,3137539,...,-74.001794,983752,172760,312,12,1B,0,0,0,0


In [None]:
# Export geocoded output to csv (index=False leaves the DataFrame index out of the file)

oti_api_address_output_df.to_csv('oti_api_address_output_df.csv', index=False)

**Example 2: Using the OTI Geoclient BIN API endpoint**

A BIN (Building Identification Number) is a unique, immutable, citywide standard for building identification developed by the NYC Department of City Planning. It is a 7-byte numeric item. You can read more about BINs here: https://nycplanning.github.io/Geosupport-UPG/chapters/chapterVI/section03/

In [None]:
# Create a custom function to make API calls to the 'BIN' endpoint

def oti_geoclient_api_bin_endpoint(api_endpoint, headers, df_name, df_key_field, api_input_column, response_columns=None, requests_per_minute=2500, timeout=30):
    """
    Look up every row of a DataFrame with the OTI Geoclient 'BIN' endpoint.

    One GET request is sent per input row. The 'bin' object of each response
    is turned into columns and attached to a copy of the input rows. Rows
    whose request fails (non-200 status, network error, unparsable JSON, or no
    'bin' object) get NaN in every response column.

    Parameters:
    - api_endpoint (str): The API endpoint URL.
    - headers (dict): The headers to send with every API request.
    - df_name (pd.DataFrame): The input pandas DataFrame.
    - df_key_field (str): The name of the primary key column in the DataFrame.
      It must exist in df_name. Responses are aligned to the input rows by
      position, so duplicate or missing keys no longer multiply output rows.
    - api_input_column (str): Column that provides the BIN sent to the API.
    - response_columns (list of str): Optional. API response fields to keep.
      A requested field that the API never returned becomes an all-NaN column
      instead of raising KeyError.
    - requests_per_minute (int | float | None): Throttle applied between
      requests (default 2500). None or a non-positive value disables the pause.
    - timeout (float | None): Seconds to wait for each HTTP request before it
      is treated as failed (default 30). None waits indefinitely.

    Returns:
    - pd.DataFrame: The input columns followed by the response columns, one row
      per input row, with a fresh 0..n-1 index. Column names present on both
      sides get pandas' '_x' / '_y' suffixes. If no request returned any data,
      an unmodified copy of df_name is returned instead.

    Raises:
    - KeyError: If df_key_field or api_input_column is not in df_name.
    """

    def _is_missing(value):
        # None, NaN/NA and blank strings must never reach the query string:
        # NaN would otherwise be serialized as the text 'nan'.
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    def send_request(session, bin_input):
        params = {} if _is_missing(bin_input) else {'bin': bin_input}

        try:
            # The session already carries the headers; the timeout keeps one
            # stalled connection from blocking the whole batch.
            response = session.get(api_endpoint, params=params, timeout=timeout)
            if response.status_code != 200:
                return {}
            payload = response.json()
        except (requests.RequestException, ValueError):
            # Best effort: a failed or unparsable request yields an empty row.
            return {}

        bin_record = payload.get('bin') if isinstance(payload, dict) else None
        return bin_record if isinstance(bin_record, dict) else {}

    # Resolve the input columns before opening a connection so that a
    # misspelled column name fails fast.
    if df_key_field not in df_name.columns:
        raise KeyError(df_key_field)
    bins = df_name[api_input_column].tolist()

    # Pause between requests needed to stay within the rate limit
    throttled = bool(requests_per_minute) and requests_per_minute > 0
    delay_per_request = 60 / requests_per_minute if throttled else 0

    # Send requests sequentially; the context manager closes the session even
    # if an unexpected error interrupts the loop.
    results = []
    with requests.Session() as session:
        session.headers.update(headers)
        for bin_input in bins:
            results.append(send_request(session, bin_input))
            if delay_per_request:
                time.sleep(delay_per_request)

    # If all results are empty, return the original DataFrame
    if not any(results):
        return df_name.copy()

    # One row per request; empty dicts become all-NaN rows, so positions match
    response_df = pd.DataFrame(results)

    if response_columns:
        # reindex (rather than []) tolerates fields missing from every response
        response_df = response_df.reindex(columns=list(response_columns))

    # The key column always comes from the input data, never from the API
    response_df = response_df.drop(columns=[df_key_field], errors='ignore')

    # Align by position: row i of the response belongs to row i of the input
    return df_name.reset_index(drop=True).merge(response_df, left_index=True, right_index=True, how='left')

In [None]:
# Prepare a pandas dataframe to pass to OTI api

# Specify the columns you want to keep from the original dataframe

input_columns_to_keep = ['row_id', 'buildingIdentificationNumber']  # Replace with the columns you want to keep

# Keep only the rows for which the address lookup returned a BIN, then draw a
# reproducible sample of up to 500 of them. The sample size is capped at the
# number of available rows so the cell does not fail when fewer than 500
# addresses were geocoded successfully.

bin_input_df = (
    oti_api_address_output_df
    .loc[oti_api_address_output_df['buildingIdentificationNumber'].notna(), input_columns_to_keep]
    .pipe(lambda rows_with_bin: rows_with_bin.sample(n=min(500, len(rows_with_bin)), random_state=1))
    .copy()
)

In [None]:
# Prepare parameters for API

# Note the headers_param was already set in Example #1

# Geoclient v1 'BIN' endpoint URL
bin_api_url_param = "https://api.nyc.gov/geo/geoclient/v1/bin.json"

# API response fields to keep in the BIN lookup output (one per line)
bin_return_columns_to_keep = [
    'bbl',
    'bblBoroughCode',
    'bblTaxBlock',
    'bblTaxLot',
    'internalLabelXCoordinate',
    'internalLabelYCoordinate',
    'geosupportFunctionCode',
    'geosupportReturnCode',
]

In [None]:
# Call the API using the custom function.
# One request is sent per row of bin_input_df, with a short pause between
# requests.

oti_api_bin_output_df = oti_geoclient_api_bin_endpoint(
    api_endpoint= bin_api_url_param,
    headers= headers_param,
    df_name= bin_input_df,
    df_key_field='row_id',
    api_input_column='buildingIdentificationNumber',
    response_columns= bin_return_columns_to_keep)

In [None]:
# Review the API output dataframe: the input columns followed by the BIN
# response columns that were kept

oti_api_bin_output_df.head(10)

Unnamed: 0,row_id,buildingIdentificationNumber,bbl,bblBoroughCode,bblTaxBlock,bblTaxLot,internalLabelXCoordinate,internalLabelYCoordinate,geosupportFunctionCode,geosupportReturnCode
0,1029,3119614,3051870032,3,5187,32,996555,173377,BN,0
1,2305,3325569,3032530028,3,3253,28,1004718,192588,BN,0
2,725,3399068,3017680046,3,1768,46,999932,192170,BN,0
3,5015,3399250,3025560058,3,2556,58,995465,205275,BN,0
4,4569,4618244,4125290239,4,12529,239,1047327,187467,BN,0
5,4403,1079982,1022000009,1,2200,9,1006443,253362,BN,0
6,4897,1089873,1015380021,1,1538,21,998244,224298,BN,0
7,1580,1052316,1016440065,1,1644,65,1000269,230555,BN,0
8,2163,3057984,3020310001,3,2031,1,991669,192969,BN,0
9,2822,2001152,2023590210,2,2359,210,1008757,237672,BN,0


In [None]:
# Export geocoded output to csv (index=False leaves the DataFrame index out of the file)

oti_api_bin_output_df.to_csv('oti_api_bin_output_df.csv', index=False)

**Example 3: Calling the OTI Geoclient BBL API endpoint**

A Borough-Block-and-Lot (BBL) is a single data item that can be used to uniquely identify a city tax lot. It is maintained by the NYC Department of Finance (DOF). A city tax lot is a subdivision of the broader city tax geography, which DOF manages.

You can read more about them here: https://nycplanning.github.io/Geosupport-UPG/chapters/chapterVI/section02/

In [None]:
# Create a custom function to make API calls to the 'BBL' endpoint

def oti_geoclient_api_bbl_endpoint(api_endpoint, headers, df_name, df_key_field, boro_input_col, block_input_col, lot_input_col, response_columns=None, requests_per_minute=2500, timeout=30):
    """
    Look up every row of a DataFrame with the OTI Geoclient 'BBL' endpoint.

    One GET request is sent per input row. The 'bbl' object of each response
    is turned into columns and attached to a copy of the input rows. Rows
    whose request fails (non-200 status, network error, unparsable JSON, or no
    'bbl' object) get NaN in every response column.

    Parameters:
    - api_endpoint (str): The API endpoint URL.
    - headers (dict): The headers to send with every API request.
    - df_name (pd.DataFrame): The input pandas DataFrame.
    - df_key_field (str): The name of the primary key column in the DataFrame.
      It must exist in df_name. Responses are aligned to the input rows by
      position, so duplicate or missing keys no longer multiply output rows.
    - boro_input_col (str): Column that provides the borough input for the API.
    - block_input_col (str): Column that provides the block input for the API.
    - lot_input_col (str): Column that provides the lot input for the API.
    - response_columns (list of str): Optional. API response fields to keep.
      A requested field that the API never returned becomes an all-NaN column
      instead of raising KeyError.
    - requests_per_minute (int | float | None): Throttle applied between
      requests (default 2500). None or a non-positive value disables the pause.
    - timeout (float | None): Seconds to wait for each HTTP request before it
      is treated as failed (default 30). None waits indefinitely.

    Returns:
    - pd.DataFrame: The input columns followed by the response columns, one row
      per input row, with a fresh 0..n-1 index. Column names present on both
      sides get pandas' '_x' / '_y' suffixes. If no request returned any data,
      an unmodified copy of df_name is returned instead.

    Raises:
    - KeyError: If df_key_field or one of the input columns is not in df_name.
    """

    def _is_missing(value):
        # None, NaN/NA and blank strings must never reach the query string:
        # NaN would otherwise be serialized as the text 'nan'.
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    def send_request(session, borough, block, lot):
        candidate_params = {
            'borough': borough,
            'block': block,
            'lot': lot,
        }
        params = {name: value for name, value in candidate_params.items() if not _is_missing(value)}

        try:
            # The session already carries the headers; the timeout keeps one
            # stalled connection from blocking the whole batch.
            response = session.get(api_endpoint, params=params, timeout=timeout)
            if response.status_code != 200:
                return {}
            payload = response.json()
        except (requests.RequestException, ValueError):
            # Best effort: a failed or unparsable request yields an empty row.
            return {}

        bbl_record = payload.get('bbl') if isinstance(payload, dict) else None
        return bbl_record if isinstance(bbl_record, dict) else {}

    # Resolve every input column before opening a connection so that a
    # misspelled column name fails fast.
    if df_key_field not in df_name.columns:
        raise KeyError(df_key_field)
    boroughs = df_name[boro_input_col].tolist()
    blocks = df_name[block_input_col].tolist()
    lots = df_name[lot_input_col].tolist()

    # Pause between requests needed to stay within the rate limit
    throttled = bool(requests_per_minute) and requests_per_minute > 0
    delay_per_request = 60 / requests_per_minute if throttled else 0

    # Send requests sequentially; the context manager closes the session even
    # if an unexpected error interrupts the loop.
    results = []
    with requests.Session() as session:
        session.headers.update(headers)
        for borough, block, lot in zip(boroughs, blocks, lots):
            results.append(send_request(session, borough, block, lot))
            if delay_per_request:
                time.sleep(delay_per_request)

    # If all results are empty, return the original DataFrame
    if not any(results):
        return df_name.copy()

    # One row per request; empty dicts become all-NaN rows, so positions match
    response_df = pd.DataFrame(results)

    if response_columns:
        # reindex (rather than []) tolerates fields missing from every response
        response_df = response_df.reindex(columns=list(response_columns))

    # The key column always comes from the input data, never from the API
    response_df = response_df.drop(columns=[df_key_field], errors='ignore')

    # Align by position: row i of the response belongs to row i of the input
    return df_name.reset_index(drop=True).merge(response_df, left_index=True, right_index=True, how='left')

In [None]:
# Prepare a pandas dataframe to pass to OTI api

# Specify the columns you want to keep from the original dataframe

input_columns_to_keep = ['row_id', 'bblBoroughCode', 'bblTaxBlock', 'bblTaxLot']  # Replace with the columns you want to keep

# Keep only the rows for which the address lookup returned a borough code,
# then draw a reproducible sample of up to 500 of them. The sample size is
# capped at the number of available rows so the cell does not fail when fewer
# than 500 addresses were geocoded successfully.

bbl_input_df = (
    oti_api_address_output_df
    .loc[oti_api_address_output_df['bblBoroughCode'].notna(), input_columns_to_keep]
    .pipe(lambda rows_with_bbl: rows_with_bbl.sample(n=min(500, len(rows_with_bbl)), random_state=1))
    .copy()
)

In [None]:
# Prepare parameters for API

# Note the headers_param was already set in Example #1

# Geoclient v1 'BBL' endpoint URL
bbl_api_url_param = "https://api.nyc.gov/geo/geoclient/v1/bbl.json"

# API response fields to keep in the BBL lookup output (one per line)
bbl_return_columns_to_keep = [
    'bbl',
    'buildingIdentificationNumber',
    'latitudeInternalLabel',
    'longitudeInternalLabel',
    'internalLabelXCoordinate',
    'internalLabelYCoordinate',
    'numberOfEntriesInListOfGeographicIdentifiers',
    'numberOfExistingStructuresOnLot',
    'numberOfStreetFrontagesOfLot',
    'geosupportFunctionCode',
    'geosupportReturnCode',
    'returnCode1a',
]

In [None]:
# Call the API using the custom function.
# One request is sent per row of bbl_input_df, with a short pause between
# requests.

oti_api_bbl_output_df = oti_geoclient_api_bbl_endpoint(
    api_endpoint= bbl_api_url_param,
    headers= headers_param,
    df_name= bbl_input_df,
    df_key_field='row_id',
    boro_input_col='bblBoroughCode',
    block_input_col='bblTaxBlock',
    lot_input_col='bblTaxLot',
    response_columns= bbl_return_columns_to_keep)

In [None]:
# Review the API output dataframe: the input columns followed by the BBL
# response columns that were kept

oti_api_bbl_output_df.head(10)

Unnamed: 0,row_id,bblBoroughCode,bblTaxBlock,bblTaxLot,bbl,buildingIdentificationNumber,latitudeInternalLabel,longitudeInternalLabel,internalLabelXCoordinate,internalLabelYCoordinate,numberOfEntriesInListOfGeographicIdentifiers,numberOfExistingStructuresOnLot,numberOfStreetFrontagesOfLot,geosupportFunctionCode,geosupportReturnCode,returnCode1a
0,1029,3,5187,32,3051870032,3119614,40.642548,-73.955661,996555,173377,3,2,2,BL,0,0
1,2305,3,3253,28,3032530028,3325569,40.695263,-73.926188,1004718,192588,1,1,1,BL,0,0
2,725,3,1768,46,3017680046,3399068,40.694125,-73.943449,999932,192170,1,1,1,BL,0,0
3,5015,3,2556,58,3025560058,3399250,40.730102,-73.959535,995465,205275,6,1,3,BL,0,0
4,4569,4,12529,239,4125290239,4618244,40.681006,-73.77258,1047327,187467,1,1,1,BL,0,0
5,4403,1,2200,9,1022000009,1079981,40.862067,-73.919767,1006443,253362,3,3,1,BL,0,0
6,4897,1,1538,21,1015380021,1078601,40.782311,-73.949469,998244,224298,3,3,3,BL,0,0
7,1580,1,1644,65,1016440065,1052316,40.799482,-73.942142,1000269,230555,1,1,1,BL,0,0
8,2163,3,2031,1,3020310001,3057984,40.696329,-73.973245,991669,192969,2,1,2,BL,0,0
9,2822,2,2359,210,2023590210,2001152,40.818996,-73.911459,1008757,237672,4,1,2,BL,0,0


In [None]:
# Export geocoded output to csv (index=False leaves the DataFrame index out of the file)

oti_api_bbl_output_df.to_csv('oti_api_bbl_output_df.csv', index=False)