In [20]:
# Author: Claire Wagner
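# Purpose: To generate the data files used by the map: ward, neighborhood, and zoning boundaries; city-owned PINs; business licenses; scavenger sale PINs; and Ward 20 tax sale PINs.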

In [21]:
import datetime
import json
import os
import urllib.parse
import urllib.request
from collections import OrderedDict

import pandas as pd

In [22]:
# Socrata app token ("CW - HEI Leaflet Map App Token"), appended to every request by makeAPIRequest.
# Set the SOCRATA_APP_TOKEN environment variable to use a different token without editing the
# notebook; when it is unset or empty, the original token is used so existing runs are unchanged.
app_token = os.environ.get("SOCRATA_APP_TOKEN") or "yo0POz8pPZyyDO9jOvtesb42J"

# Value sent as $limit on the requests that are not restricted to a single row.
limit = 3000000

# One dict per dataset: "domain" is the portal's base URL and "dataset" is the dataset id.
# get_dataset_url and get_metadata build their request URLs from these two values.
city_owned_api_info = {"domain": "https://data.cityofchicago.org", "dataset": "aksk-kvfp"}
properties_api_info = {"domain": "https://datacatalog.cookcountyil.gov", "dataset": "c49d-89sn"}
business_licenses_api_info = {"domain": "https://data.cityofchicago.org", "dataset": "uupf-x98q"}
ward_boundaries_2015_to_2023_api_info = {"domain": "https://data.cityofchicago.org", "dataset": "k9yb-bpqx"}
ward_boundaries_2023_api_info = {"domain": "https://data.cityofchicago.org", "dataset": "p293-wvbd"}
neighborhood_boundaries_api_info = {"domain": "https://data.cityofchicago.org", "dataset": "y6yq-dbs2"}
zoning_districts_api_info = {"domain": "https://data.cityofchicago.org", "dataset": "dj47-wfun"}
ward_offices_api_info = {"domain": "https://data.cityofchicago.org", "dataset": "htai-wnw4"}

In [23]:
# Date (UTC) on which this run accessed the data sources, formatted as day, abbreviated month
# name, and year. get_metadata stores it under "accessedOn".
fetchtime = f"{datetime.datetime.now(datetime.timezone.utc):%d %b %Y}"

In [24]:
def makeAPIRequest(api_endpoint, params, limit, read_function):
    """Build a Socrata query URL and hand it to read_function.

    Args:
        api_endpoint: URL of the resource, without a query string.
        params: already-formatted query parameters (for example "$select=..." or
            "$where=..."); may be empty.
        limit: value sent as the $limit parameter.
        read_function: callable that receives the finished URL.

    Returns:
        Whatever read_function returns for the URL.
    """
    # $limit and the app token are always sent, after any caller-supplied parameters.
    query_parts = list(params)
    query_parts.append("$limit=" + str(limit))
    query_parts.append("$$app_token=" + app_token)
    raw_query = "?" + "&".join(query_parts)
    # Percent-encode the query while leaving the characters listed in `safe` untouched.
    encoded_query = urllib.parse.quote(raw_query, safe="&?$=,!()")
    return read_function(api_endpoint + encoded_query)

def get_url_response(url):
    """Fetch url and return the raw response body.

    Uses urllib.request, which must be imported explicitly: importing only urllib.parse
    does not guarantee that the urllib.request submodule has been loaded.

    Args:
        url: the URL to open.

    Returns:
        The response body as returned by response.read() when the status is 200,
        otherwise None. Exceptions raised by urllib.request.urlopen are not caught.
    """
    with urllib.request.urlopen(url) as response:
        # Any status other than 200 is reported to the caller as "no data".
        if response.status != 200:
            return None
        return response.read()

def get_metadata(api_info):
    """Fetch a dataset's metadata and reduce it to the fields this notebook publishes.

    Args:
        api_info: dict with "domain" (portal base URL) and "dataset" (dataset id).

    Returns:
        dict with the keys "id", "name", "description", "dataUri", "attribution",
        "attributionLink" and "actualDatasourceUri" copied from the fetched metadata
        (None when absent); "createdAt", "dataUpdatedAt" and "metadataUpdatedAt"
        reformatted with "%d %b %Y" (None when absent or null); and "accessedOn" set to
        the module-level fetchtime.
    """
    def get_original_metadata(domain, dataset, actual_datasource_uri = None):
        """Fetch the metadata for dataset, following "reviewableUid" references."""
        original_metadata = json.loads(makeAPIRequest(
            api_endpoint = domain + "/api/views/metadata/v1/" + dataset + ".json",
            params = [],
            limit = limit,
            read_function = get_url_response
        ))
        # "approvals" may be missing or empty; only a non-empty list has a last entry to inspect
        # (indexing [-1] on an empty list would raise IndexError).
        approvals = original_metadata.get("approvals") or []
        # if this isn't the full metadata for this dataset, use the provided "reviewableUid" as the metadata
        # while providing information about the actual original datasource
        if approvals and "reviewableUid" in approvals[-1]:
            print(f"reviewableUid found for {original_metadata['name']}")
            return get_original_metadata(
                domain,
                approvals[-1]["reviewableUid"],
                original_metadata["actualDatasourceUri"] if "actualDatasourceUri" in original_metadata else original_metadata["dataUri"],
            )
        # we've found the correct metadata to use, so include actual_datasource_uri in the metadata
        if actual_datasource_uri is not None:
            original_metadata["actualDatasourceUri"] = actual_datasource_uri
        return original_metadata

    # get metadata
    original_metadata = get_original_metadata(api_info["domain"], api_info["dataset"])
    # edit metadata for return: copy the plain fields, using None for any that are absent
    edited_metadata = { key: original_metadata.get(key) for key in [
        "id", "name", "description", "dataUri", "attribution", "attributionLink", "actualDatasourceUri",
    ] }
    for key in ["createdAt", "dataUpdatedAt", "metadataUpdatedAt"]:
        raw_timestamp = original_metadata.get(key)
        if raw_timestamp is None:
            edited_metadata[key] = None
        else:
            # keep only the date part (the text before "T") and reformat it like fetchtime
            date_part = raw_timestamp.split('T')[0]
            edited_metadata[key] = datetime.datetime.fromisoformat(date_part).strftime("%d %b %Y")
    edited_metadata["accessedOn"] = fetchtime
    return edited_metadata

def standardize_columns(df):
    """Rename df's columns in place to lowercase, whitespace-free names.

    Each name is lowercased, split on whitespace, and rejoined with underscores.
    Nothing is returned; df itself is modified.
    """
    lowered = df.columns.str.lower()
    words = lowered.str.split()
    df.columns = words.str.join('_')
    
def get_dataset_url(api_info, suffix):
    """Return the resource URL for a dataset: <domain>/resource/<dataset>.<suffix>."""
    url_parts = (api_info["domain"], "/resource/", api_info["dataset"], ".", suffix)
    return "".join(url_parts)

def get_geojson_data(api_info):
    """Download a dataset as GeoJSON together with its metadata.

    Args:
        api_info: dict with "domain" and "dataset" for the dataset to fetch.

    Returns:
        dict with "data" (the parsed GeoJSON response) and "metadata" (the result of
        get_metadata for the same dataset), in that order.
    """
    # The data is requested before the metadata.
    raw_geojson = makeAPIRequest(
        api_endpoint=get_dataset_url(api_info, "geojson"),
        params=[],
        limit=limit,
        read_function=get_url_response,
    )
    parsed_geojson = json.loads(raw_geojson)
    dataset_metadata = get_metadata(api_info)
    return {"data": parsed_geojson, "metadata": dataset_metadata}

"""Aggregate x by returning a list of all unique, non-null values in x in the order they were encountered
(or, if there is only one unique value in x, returning that value)"""
def set_aggregation_function(x):
    """Collapse the values in x to their unique, non-null members.

    Args:
        x: iterable of hashable values (used as the aggregation function for a groupby).

    Returns:
        None when x has no non-null values, the value itself when exactly one distinct
        non-null value occurs, otherwise a list of the distinct non-null values in the
        order they were first encountered.
    """
    # dict keys keep insertion order, so this de-duplicates while preserving first-seen order.
    unique_values = list(dict.fromkeys(item for item in x if not pd.isna(item)))
    if not unique_values:
        return None
    if len(unique_values) == 1:
        return unique_values[0]
    return unique_values

In [25]:
# Fetch location information for every PIN in the Property Locations dataset that has both a
# latitude and a longitude (the query itself applies no ward filter).
properties = makeAPIRequest(
    get_dataset_url(properties_api_info, "json"),
    [
        "$select=pin, property_address, property_zip, ward, longitude, latitude, tract_geoid",
        "$where=(latitude IS NOT NULL) AND (longitude IS NOT NULL)",
    ],
    limit,
    pd.read_json,
)
properties_metadata = get_metadata(properties_api_info)
properties_metadata["dataUseNotes"] = "Used to obtain location information for PINs in Cook County."

# Store PINs as strings left-padded with zeros to 14 characters; the other tables are joined to
# this one on "pin".
properties["pin"] = properties["pin"].map(str).str.rjust(14, '0')

# Publish the metadata as a JavaScript constant.
with open("../data/misc/misc.js", "w", encoding="utf-8") as f:
    f.write("const property_locations_metadata = " + json.dumps(properties_metadata) + ";")

In [26]:
def get_ward_boundaries():
    """Fetch both ward-boundary datasets and write them to ../data/geojson/ward_boundaries.js.

    The file defines two JavaScript constants, ward_boundaries_2015_to_2023 and
    ward_boundaries_2023, each holding the result of get_geojson_data for that dataset.

    Both datasets are downloaded and serialized before the output file is opened, so a
    failed request raises without truncating or half-writing a previously generated file.
    """
    print("Getting data for ward boundaries")
    boundaries_2015_to_2023_json = json.dumps(get_geojson_data(ward_boundaries_2015_to_2023_api_info))
    boundaries_2023_json = json.dumps(get_geojson_data(ward_boundaries_2023_api_info))
    with open("../data/geojson/ward_boundaries.js", "w", encoding="utf-8") as f:
        f.write("const ward_boundaries_2015_to_2023 = ")
        f.write(boundaries_2015_to_2023_json + ";")
        f.write("\n\nconst ward_boundaries_2023 = ")
        f.write(boundaries_2023_json + ";")
    print("Finished getting data for ward boundaries")

In [27]:
def get_neighborhood_boundaries():
    """Fetch the neighborhood boundaries and write them to ../data/geojson/neighborhood_boundaries.js.

    The file defines the JavaScript constant neighborhood_boundaries, holding the result
    of get_geojson_data for the dataset.

    The dataset is downloaded and serialized before the output file is opened, so a
    failed request raises without truncating a previously generated file.
    """
    print("Getting data for neighborhood boundaries")
    neighborhood_boundaries_json = json.dumps(get_geojson_data(neighborhood_boundaries_api_info))
    with open("../data/geojson/neighborhood_boundaries.js", "w", encoding="utf-8") as f:
        f.write("const neighborhood_boundaries = ")
        f.write(neighborhood_boundaries_json + ";")
    print("Finished getting data for neighborhood boundaries")

In [28]:
def get_zoning_districts():
    """Fetch the zoning districts and write them to ../data/geojson/zoning_districts.js.

    The file defines the JavaScript constant zoning_districts, holding the result of
    get_geojson_data for the dataset.

    The dataset is downloaded and serialized before the output file is opened, so a
    failed request raises without truncating a previously generated file.
    """
    print("Getting data for zoning districts")
    zoning_districts_json = json.dumps(get_geojson_data(zoning_districts_api_info))
    with open("../data/geojson/zoning_districts.js", "w", encoding="utf-8") as f:
        f.write("const zoning_districts = ")
        f.write(zoning_districts_json + ";")
    print("Finished getting data for zoning districts")

In [29]:
def get_individual_properties():
    """Write location data for two individual sites to ../data/misc/individual_properties.js.

    The file defines three JavaScript constants: sunshine_gospel (the first Property
    Locations row whose address matches the Sunshine Gospel Ministries address),
    ward_20_office (the first ward-office row for ward 20, with latitude and longitude
    lifted out of its "location" value), and ward_offices_metadata.
    """
    print("Getting data for individual properties")

    # Sunshine Gospel Ministries address (source: https://www.sunshinegospel.org/), lowercased
    # because the query compares it with lower(property_address).
    sgm_address = "500 E 61st St".lower()
    sgm_matches = makeAPIRequest(
        api_endpoint=get_dataset_url(properties_api_info, "json"),
        params=[
            "$select=pin, property_address, longitude, latitude",
            f"$where=lower(property_address)='{sgm_address}'",
        ],
        limit=1,
        read_function=pd.read_json,
    )
    sgm = sgm_matches.loc[0]

    office_matches = makeAPIRequest(
        api_endpoint=get_dataset_url(ward_offices_api_info, "json"),
        params=[
            "$select=ward, alderman, address, location",
            "$where=ward=20",
        ],
        limit=1,
        read_function=pd.read_json,
    )
    office_row = office_matches.loc[0]
    # Copy the coordinates out of the "location" value, then drop that value from the row.
    office_location = office_row["location"]
    office_row["latitude"] = office_location.get("latitude")
    office_row["longitude"] = office_location.get("longitude")
    ward_20_office = office_row.drop(labels=["location"])

    ward_offices_metadata = get_metadata(ward_offices_api_info)
    ward_offices_metadata["dataUseNotes"] = "Used to obtain location information for the Ward 20 office."

    with open("../data/misc/individual_properties.js", "w", encoding="utf-8") as f:
        f.write("const sunshine_gospel = " + sgm.to_json(orient="index") + ";")
        f.write("\n\nconst ward_20_office = " + ward_20_office.to_json(orient="index") + ";")
        f.write("\n\nconst ward_offices_metadata = " + json.dumps(ward_offices_metadata) + ";")
    print("Finished getting data for individual properties")

In [30]:
def get_city_owned():
    """Write the city-owned PINs that might be for sale, with location data.

    Rows are taken from the City-Owned Land Inventory where property_status is
    "owned by city" and managing_organization is "none" or null, then inner-joined to
    the module-level properties table on "pin". Outputs:
      * city_owned_pins_possibly_for_sale.xlsx (selected columns, in the working directory)
      * ../data/city-owned/city_owned_data.js, defining city_owned_data (keyed by pin)
        and city_owned_metadata.
    """
    print("Getting data for city-owned properties")
    # Properties currently owned by the City of Chicago that might be up for sale (see
    # http://dev.cityofchicago.org/open%20data/data%20portal/2020/08/11/city-owned-property.html)
    inventory = makeAPIRequest(
        get_dataset_url(city_owned_api_info, "json"),
        [
            "$select=pin, managing_organization, lower(property_status) AS property_status, date_of_acquisition, date_of_disposition, sq_ft, last_update",
            "$where=(lower(property_status)='owned by city') AND (lower(managing_organization)='none' OR managing_organization IS NULL)",
        ],
        limit,
        pd.read_json,
    )
    city_owned_metadata = get_metadata(city_owned_api_info)

    # Remove the dashes so the PINs can be matched against properties["pin"].
    inventory["pin"] = inventory["pin"].str.replace("-","")
    located = pd.merge(inventory, properties, how="inner", on="pin", suffixes = ["_aksk-kvfp", "_c49d-89sn"])
    # Difference in row count between the inventory and the joined table.
    count_of_no_updated_location_info = len(inventory) - len(located)
    print("number of city-owned properties for which no location data from properties could be found:", count_of_no_updated_location_info)

    city_owned_metadata["dataUseNotes"] = "".join([
        'Only PINs where "property_status" is "owned by city" and',
        ' "managing_organization" is "none" or blank (using case-insensitive matching) have been included on the map',
        " (based on the Open Data Portal Team's",
        ' <a href="http://dev.cityofchicago.org/open%20data/data%20portal/2020/08/11/city-owned-property.html">notes</a>',
        ' about the dataset). Up-to-date location information was obtained using the',
        f' <a href={properties_metadata["dataUri"]}>"{properties_metadata["name"]}"</a> dataset',
        f' ({count_of_no_updated_location_info} PINs for which no up-to-date location information could be found have been excluded).',
    ])

    spreadsheet_columns = [
        "pin",
        "managing_organization",
        "property_status",
        "last_update",
        "date_of_acquisition",
        "date_of_disposition",
        "property_address",
        "property_zip",
        "ward",
        "tract_geoid",
    ]
    located.filter(items=spreadsheet_columns).to_excel("city_owned_pins_possibly_for_sale.xlsx", index=False)

    # Key the JSON output by pin while keeping "pin" as a column as well.
    city_owned_by_pin = located.set_index("pin", drop=False)

    with open("../data/city-owned/city_owned_data.js", "w", encoding="utf-8") as f:
        f.write("const city_owned_data = " + city_owned_by_pin.to_json(orient="index") + ";")
        f.write("\n\nconst city_owned_metadata = " + json.dumps(city_owned_metadata) + ";")
    print("Finished getting data for city-owned properties")

In [31]:
def get_business_licenses():
    """Write the Chicago business licenses, one record per business name and address.

    Rows are fetched where city is 'CHICAGO' and both coordinates are present. For each
    license_number only the row with the last license_start_date (after sorting) is kept,
    and the remaining rows are grouped by doing_business_as_name and address, with every
    other column collapsed by set_aggregation_function. The result is written to
    ../data/business-licenses/business_license_data.js as business_license_data and
    business_licenses_metadata.
    """
    print("Getting data for business licenses")
    licenses = makeAPIRequest(
        get_dataset_url(business_licenses_api_info, "json"),
        [
            "$select=license_number, license_id AS license_record_id, doing_business_as_name, license_description, business_activity, address, ward, zip_code, longitude, latitude, license_start_date",
            "$where=city='CHICAGO' AND (latitude IS NOT NULL) AND (longitude IS NOT NULL)",
        ],
        limit,
        pd.read_json,
    )
    business_licenses_metadata = get_metadata(business_licenses_api_info)
    business_licenses_metadata["dataUseNotes"] = 'Only PINs with valid coordinates where "city" is "CHICAGO" have been included on the map.'

    # Keep one row per license number: sort so the most recent start date comes last within
    # each license number, keep that last row, then discard the start date.
    ordered = licenses.sort_values(by=['license_number', 'license_start_date'])
    latest = ordered.drop_duplicates(subset=['license_number'], keep='last')
    latest = latest.drop(columns=['license_start_date'])
    # check that each license number is unique
    assert latest['license_number'].is_unique

    # aggregate licenses into a single entry for each address of each business
    grouped = latest.groupby(by=["doing_business_as_name", "address"])
    aggregated = grouped.agg(set_aggregation_function).reset_index()
    with open("../data/business-licenses/business_license_data.js", "w", encoding="utf-8") as f:
        f.write("const business_license_data = " + aggregated.to_json(orient="records") + ";")
        f.write("\n\nconst business_licenses_metadata = " + json.dumps(business_licenses_metadata) + ";")
    print("Finished getting data for business licenses")

In [32]:
def get_scavenger_sale():
    """Write the 2022 scavenger sale PINs located in Chicago, with class and location data.

    Reads the scavenger sale list and the property class table from local Excel files,
    joins them on the classification code, joins the result to the module-level
    properties table on "pin", keeps the rows whose property_city is 'CHICAGO', and
    writes ../data/scavenger-sale/scavenger_data.js defining scavenger_sale_data (keyed
    by pin), scavenger_sale_metadata and property_classification_codes_metadata.

    Raises:
        AssertionError: if a sale entry has no matching property class, if the join to
            properties changes the number of rows, or if a pin appears more than once
            in the final table.
    """
    print("Getting data for scavenger sale")
    # scavenger sale data from 28 January 2022 (source: https://web.archive.org/web/20220128184252/https://www.cookcountytreasurer.com/pdfs/scavsale/2022cookcountyscavengertaxsalelist.xlsx, accessed 3 June 2022)
    sale_list = pd.read_excel("../data/scavenger-sale/2022cookcountyscavengertaxsalelist_1-28-22.xlsx")
    standardize_columns(sale_list)
    sale_list["pin"] = sale_list["pin"].str.replace("-", "")
    sale_list["classification"] = sale_list["classification"].str.strip()

    # property class data (source: https://www.cookcountyassessor.com/form-document/codes-classification-property, accessed 22 June 2022)
    class_codes = pd.read_excel("../data/scavenger-sale/property_classes.xlsx")
    # attach the class description to every sale entry
    with_classes = pd.merge(sale_list, class_codes, how="left", left_on = "classification", right_on="property_code")
    with_classes = with_classes.drop(columns=['classification'])
    # every sale entry must have found a property class
    assert with_classes['property_code'].notna().all()

    # attach location data; the inner join must not drop or duplicate any entry
    located = pd.merge(with_classes, properties, how="inner", on="pin", suffixes=["_scavenger_sale", "_c49d-89sn"])
    assert len(located) == len(with_classes)

    published_columns = [
        'pin',
        'property_address_scavenger_sale',
        'ward',
        'delinquent_tax_year_range',
        'delinquent_tax',
        'delinquent_interest',
        'total_delinquency',
        '2020_taxes_billed',
        'property_code',
        'property_code_meaning',
        'property_code_class',
        'property_zip',
        'longitude',
        'latitude',
        'tract_geoid',
    ]
    in_chicago = located['property_city'] == 'CHICAGO'
    scavenger_final_join = (
        located.loc[in_chicago]
        .filter(items=published_columns)
        .rename(columns={'property_address_scavenger_sale': 'property_address'})
        .set_index('pin', drop=False)
    )
    # make sure there are no duplicated entries for pins
    assert scavenger_final_join.index.is_unique

    location_source_note = (
        ' Location information was obtained using the'
        + f' <a href={properties_metadata["dataUri"]}>"{properties_metadata["name"]}"</a> dataset.'
    )
    scavenger_sale_metadata = {
        'id': None,
        'name': '2022 Cook County Scavenger Tax Sale List',
        'description': 'Properties scheduled to be offered at the 2022 Scavenger Sale.',
        'dataUri': 'https://web.archive.org/web/20220128181637/https://www.cookcountytreasurer.com/scavengersalemap.aspx',
        'attribution': 'Cook County Treasurer',
        'attributionLink': 'https://www.cookcountytreasurer.com/',
        'actualDatasourceUri': 'https://web.archive.org/web/20220128184252/https://www.cookcountytreasurer.com/pdfs/scavsale/2022cookcountyscavengertaxsalelist.xlsx',
        'createdAt': None,
        'dataUpdatedAt': '28 January 2022',
        'metadataUpdatedAt': None,
        'accessedOn': '3 Jun 2022',
        'dataUseNotes': 'Only PINs where "property_city" is "CHICAGO" have been included on the map.' + location_source_note,
    }

    property_classification_codes_metadata = {
        'id': None,
        'name': 'Definitions for the Codes for Classification of Real Property',
        'description': 'Definitions for the codes for classification of real property.',
        'dataUri': 'https://prodassets.cookcountyassessor.com/s3fs-public/form_documents/classcode.pdf?VersionId=12JVr.oX..WD4hfgjLCp5AIVfar71ndn',
        'attribution': 'Cook County Assessor',
        'attributionLink': 'https://www.cookcountyassessor.com/',
        'actualDatasourceUri': None,
        'createdAt': '03 April 2018',
        'dataUpdatedAt': None,
        'metadataUpdatedAt': None,
        'accessedOn': '22 Jun 2022',
    }

    # Each section assigns JSON to a JavaScript constant for easier access by scripts in the browser.
    with open("../data/scavenger-sale/scavenger_data.js", "w", encoding="utf-8") as f:
        f.write("const scavenger_sale_data = " + scavenger_final_join.to_json(orient="index") + ";")
        f.write("\n\nconst scavenger_sale_metadata = " + json.dumps(scavenger_sale_metadata) + ";")
        f.write("\n\nconst property_classification_codes_metadata = " + json.dumps(property_classification_codes_metadata) + ";")
    print("Finished getting data for scavenger sale")

In [33]:
def get_tax_sale_ward_20():
    """Write the Ward 20 tax sale PINs for tax year 2019, with location data.

    Reads the tax sale list from a local Excel file, keeps the rows whose tax_year is
    2019, joins them to the module-level properties table on "pin", and writes
    ../data/tax-sale/tax_sale_ward20_tax_year_2019.js defining taxsale_ward20_data
    (keyed by pin) and taxsale_ward20_metadata.

    Raises:
        AssertionError: if a pin appears more than once, if the join to properties
            changes the number of rows, or if any joined row is not in ward 20.
    """
    print("Getting data for Ward 20 tax sale")
    # Ward 20 Properties on Tax Sale list for Tax Year 2019, accessed on 17 May 2022 around 11 am ET
    taxsale = pd.read_excel("../data/tax-sale/tax_sale_tax_year_2019_ward20_accessed_17_May_2022.xlsx")
    standardize_columns(taxsale)
    taxsale['pin'] = taxsale['pin'].str.replace('-', '')
    taxsale = taxsale.filter(items=[
        'pin',
        'property_address',
        'current_mailing_address',
        'taxpayer_name',
        'tax_type',
        'tax_year',
        'total_tax_due',
        'total_due_(including_interest)',
        'classification',
        'prior_tax_years_may_also_be_unpaid',
    ])
    taxsale = taxsale[taxsale['tax_year'] == 2019]
    # make sure no entries have duplicate pins
    assert taxsale['pin'].is_unique

    taxsale_join = pd.merge(taxsale, properties, how="inner", on="pin", suffixes=["_tax_sale", "_c49d-89sn"])
    # make sure all pins have associated location data and are in Ward 20
    assert taxsale_join.shape[0] == taxsale.shape[0]
    assert (taxsale_join['ward'] == 20).all()
    # keep the address from the tax sale list rather than the one from properties
    taxsale_join = taxsale_join.drop(columns=[
        'property_address_c49d-89sn',
    ]).rename(columns={
        'property_address_tax_sale' : 'property_address',
        'total_due_(including_interest)' : 'total_due_including_interest',
    }).set_index('pin', drop=False)

    taxsale_ward20_metadata = {
        'id': None,
        'name': 'Ward 20 Delinquent Tax Report',
        'description': 'Properties in Ward 20 with delinquent taxes for Tax Year 2019 (payable in 2020) that were eligible for the Annual Tax Sale that began May 12, 2022.',
        'dataUri': 'https://www.cookcountytreasurer.com/delinquenttaxes.aspx',
        'attribution': 'Cook County Treasurer',
        'attributionLink': 'https://www.cookcountytreasurer.com/',
        'actualDatasourceUri': None,
        'createdAt': None,
        'dataUpdatedAt': None,
        'metadataUpdatedAt': None,
        # The input file name and the note above both give 17 May 2022 as the access date
        # (this previously read '17 Jun 2022').
        'accessedOn': '17 May 2022',
        'dataUseNotes': 'Only PINs where "tax_year" is 2019 have been included on the map.' \
        + ' Location information was obtained using the' \
        + f' <a href={properties_metadata["dataUri"]}>"{properties_metadata["name"]}"</a> dataset.'
    }

    with open("../data/tax-sale/tax_sale_ward20_tax_year_2019.js", "w", encoding="utf-8") as f:
        f.write("const taxsale_ward20_data = ") # assign JSON data to variable for easier access by JavaScript scripts in browser
        f.write(taxsale_join.to_json(orient="index") + ";") # output as JSON
        f.write("\n\nconst taxsale_ward20_metadata = ")
        f.write(json.dumps(taxsale_ward20_metadata) + ";")
    print("Finished getting data for Ward 20 tax sale")

In [15]:
# Regenerate the output files in order; comment out the entries that should not be run.
for regenerate_output in (
    get_ward_boundaries,
    get_neighborhood_boundaries,
    get_zoning_districts,
    get_individual_properties,
    get_city_owned,
    get_business_licenses,
    get_scavenger_sale,
    get_tax_sale_ward_20,
):
    regenerate_output()

Getting data for ward boundaries
reviewableUid found for WARDS_2015
Finished getting data for ward boundaries
Getting data for neighborhood boundaries
reviewableUid found for Neighborhoods_2012b
Finished getting data for neighborhood boundaries
Getting data for zoning districts
reviewableUid found for zoning_2016_01
Finished getting data for zoning districts
Getting data for individual properties
Finished getting data for individual properties
Getting data for city-owned properties
number of city-owned properties for which no location data from properties could be found: 6
Finished getting data for city-owned properties
Getting data for business licenses
Finished getting data for business licenses
Getting data for scavenger sale
Finished getting data for scavenger sale
Getting data for Ward 20 tax sale
Finished getting data for Ward 20 tax sale
