In [1]:
import os
import pandas as pd
from placekey.api import PlacekeyAPI
import json
import polars as pl

In [2]:
# Configuration
PLACEKEY_API_KEY = os.environ.get('PLACEKEY_API_KEY')  # Store API key in environment variable
DATA_DIR = 'data'

In [3]:
def get_df_for_api(df, column_map):
    """Prepares a DataFrame for the Placekey API.

    Args:
        df (pd.DataFrame): Input DataFrame.
        column_map (dict): Mapping of original column names to Placekey API field names.

    Returns:
        pd.DataFrame: DataFrame ready for Placekey API input.
    """

    df = df.rename(columns=column_map)
    df = df[list(column_map.values())]
    df['iso_country_code'] = 'US'
    df['query_id'] = df['query_id'].astype(str)
    if 'postal_code' in df.columns:
        df['postal_code'] = df['postal_code'].astype(str)
    if 'region' in df.columns:
        df['region'] = df['region'].astype(str)
    return df

def process_dataset(df, column_map, output_filename):
    """Processes a dataset through the Placekey API.

    Args:
        df (pd.DataFrame): Input DataFrame.
        column_map (dict): Mapping of column names.
        output_filename (str): Name of the CSV file to store Placekeys.

    Returns:
        pd.DataFrame: DataFrame with Placekeys.
    """

    # json.loads(Blight_Violations_for_api.to_json(orient="records"))
    df_for_api = get_df_for_api(df, column_map)
    data_json = json.loads(df_for_api.to_json(orient="records"))
    responses = pk_api.lookup_placekeys(data_json, verbose=True)
    placekeys_df = pd.read_json(json.dumps(responses), dtype={'query_id':str})
    placekeys_df.to_csv(os.path.join(DATA_DIR, output_filename), index=False)
    return placekeys_df

In [7]:
if not PLACEKEY_API_KEY:
        raise ValueError("Please set the PLACEKEY_API_KEY environment variable.")

pk_api = PlacekeyAPI(PLACEKEY_API_KEY)

# Process Blight Violations
blight_df = pd.read_csv(os.path.join(DATA_DIR, 'Blight_Violations.csv'))
blight_df['zip_code'].astype(str)
blight_column_map = {
    "ticket_id": "query_id",
    "violation_address" : "street_address",
    "state": "region",
    "zip_code": "postal_code",
    "X" : "latitude",
    "Y" : "longitude",
    "country" : "iso_country_code",
    "city" : "city"
    }
placekeys_Blight_Violations = process_dataset(blight_df, blight_column_map, 'placekeys_Blight_Violations.csv')
placekeys_Blight_Violations['ticket_id'] = placekeys_Blight_Violations['query_id'].astype(str)
blight_df['ticket_id'] = blight_df['ticket_id'].astype(str)
Blight_Violations_w_placekeys = blight_df.merge(placekeys_Blight_Violations, on='ticket_id', how='left')
Blight_Violations_w_placekeys.to_csv(os.path.join(DATA_DIR, 'Blight_Violations_w_placekeys.csv'), index=False)

# Process Property Sales
sales_df = pd.read_csv(os.path.join(DATA_DIR, 'Property_Sales.csv'))
Property_Sales_column_map = {
    "sale_id": "query_id",
    "address" : "street_address",
    "X" : "latitude",
    "Y" : "longitude"
    }
placekeys_Property_Sales = process_dataset(sales_df, Property_Sales_column_map, 'placekeys_Property_Sales.csv') 
placekeys_Property_Sales['sale_id'] = placekeys_Property_Sales['query_id']
sales_df['sale_id'] = sales_df['sale_id'].astype(str)
Property_Sales_w_placekeys = sales_df.merge(placekeys_Property_Sales, on='sale_id', how='left')
Property_Sales_w_placekeys.to_csv(os.path.join(DATA_DIR, 'Property_Sales_w_placekeys.csv'), index=False)

Unnamed: 0,query_id,street_address,region,postal_code,latitude,longitude,iso_country_code,city
0,18645,599 King,MI,48202,-83.072568,42.383357,US,Det
1,18646,18604 Appoline,MI,48235,-83.072474,42.3834,US,Det
2,18648,4066 Columbus,MI,48204,-83.115057,42.359928,US,Det
3,18649,3005 Pasada,MI,48238,-83.128037,42.393455,US,Det
4,18650,20211 Westmoreland,MI,48219,-83.134466,42.389668,US,Det


In [None]:
Property_Sales_w_placekeys = pl.read_csv('')
placekeys_Property_Sales = pl.read_csv('')
placekeys_only_Property_Sales = placekeys_Property_Sales["placekey"].to_list()

Blight_Violations_w_placekeys = pl.scan_csv('').filter(
            pl.col("placekey").is_in(placekeys_only_Property_Sales).is_in(npi_address_placekeys)
        ).collect()

In [None]:
def pct_not_null(df, column_name):
    total_non_null = df.select(pl.col(column_name).is_not_null().sum()).to_numpy()[0, 0]
    total_rows = df.height
    pct_non_null = (total_non_null / total_rows) * 100
    return round(pct_non_null, 2)

In [None]:
Blight_Violations_joined_Property_Sales_placekey = Property_Sales_w_placekeys.join(Blight_Violations_w_placekeys, "placekey", "left")