In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# date accessed: June 14, 2022 (last updated: June 13, 2022)
city_owned = pd.read_json("https://data.cityofchicago.org/resource/aksk-kvfp.json?$limit=100000")

In [3]:
def standardize_columns(df):
    """standardize the names of all columns in df by making them lowercase and snake_case"""
    df.columns = df.columns.str.lower().str.split().str.join('_')

standardize_columns(city_owned)

In [4]:
city_owned_current = city_owned[city_owned['property_status'].str.casefold() == "owned by city"]

In [5]:
# find indices of all PINs that contain characters other than 
indices_to_fix = city_owned_current[city_owned_current['pin'].str.contains(r'[^\d-]')].index.values
city_owned_current.loc[indices_to_fix]

Unnamed: 0,id,pin,address,managing_organization,property_status,sq_ft,ward,community_area_number,community_area_name,zoning_classification,...,longitude,location,:@computed_region_rpca_8um6,:@computed_region_vrxf_vc4k,:@computed_region_6mkv_f3dw,:@computed_region_bdys_3d7i,:@computed_region_43wa_7qmu,:@computed_region_awaf_s7ux,date_of_acquisition,date_of_disposition


In [7]:
city_owned_current['pin'] = city_owned_current['pin'].str.replace('-','').apply(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_owned_current['pin'] = city_owned_current['pin'].str.replace('-','').apply(int)


In [8]:
properties = pd.read_csv("Assessor__Archived_05-11-2022__-_Property_Locations.csv")

In [9]:
assert city_owned_current['pin'].shape == city_owned_current['pin'].unique().shape
assert properties['pin'].shape == properties['pin'].unique().shape

In [10]:
merge = pd.merge(city_owned_current, properties, how="left", on="pin", suffixes=["_aksk-kvfp","_c49d-89sn"])

In [11]:
wip = merge.filter(items=[
    'pin',
    'id',
    'address',
    'property_address',
    'managing_organization',
    'sq_ft',
    'ward_aksk-kvfp',
    'ward_c49d-89sn',
    'zip_code',
    'property_zip',
    'last_update',
    'latitude_aksk-kvfp',
    'longitude_aksk-kvfp',
    'latitude_c49d-89sn',
    'longitude_c49d-89sn',
    'date_of_acquisition',
    'date_of_disposition',
]).rename(columns = {
    'address' : 'address_aksk-kvfp',
    'property_address' : 'address_c49d-89sn',
    'zip_code' : 'zipcode_aksk-kvfp',
    'property_zip' : 'zipcode_c49d-89sn',
})

In [12]:
wip['zipcode_c49d-89sn'] = wip['zipcode_c49d-89sn'].str.split('-').str.get(0).apply(float)

In [17]:
wip['address_match'] = (wip['address_aksk-kvfp'] == wip['address_c49d-89sn'])
wip['ward_match'] = (wip['ward_aksk-kvfp'] == wip['ward_c49d-89sn'])
wip['zipcode_match'] = (wip['zipcode_aksk-kvfp'] == wip['zipcode_c49d-89sn'])

In [17]:
wip.dtypes

pin                        int64
id                         int64
address_aksk-kvfp         object
address_c49d-89sn         object
managing_organization     object
sq_ft                    float64
ward_aksk-kvfp           float64
ward_c49d-89sn           float64
zipcode_aksk-kvfp        float64
zipcode_c49d-89sn        float64
last_update               object
latitude_aksk-kvfp       float64
longitude_aksk-kvfp      float64
latitude_c49d-89sn       float64
longitude_c49d-89sn      float64
date_of_acquisition       object
date_of_disposition       object
address_match               bool
ward_match                  bool
zipcode_match               bool
dtype: object

In [24]:
consider = wip[(wip['zipcode_c49d-89sn'] == 60637) | (wip['ward_c49d-89sn'] == 20)]

In [19]:
#mismatch = consider[~consider['address_match']].index.values

In [26]:
consider = consider.drop([3]) # index 3 pin is probably mistyped and so should be dropped

In [25]:
#for i in mismatch:
#    print(consider.loc[i, 'pin'], ":", consider.loc[i, 'address_aksk-kvfp'], ' == ', consider.loc[i, 'address_c49d-89sn'])
#    response = input()
#    consider.loc[i, 'address_match_corrected'] = (response.casefold() == 'y') 

20151220090000 : 5643 S CALUMET AVE  ==  5827 S INDIANA AVE
n


In [27]:
consider.columns

Index(['pin', 'id', 'address_aksk-kvfp', 'address_c49d-89sn',
       'managing_organization', 'sq_ft', 'ward_aksk-kvfp', 'ward_c49d-89sn',
       'zipcode_aksk-kvfp', 'zipcode_c49d-89sn', 'last_update',
       'latitude_aksk-kvfp', 'longitude_aksk-kvfp', 'latitude_c49d-89sn',
       'longitude_c49d-89sn', 'date_of_acquisition', 'date_of_disposition',
       'address_match', 'ward_match', 'zipcode_match'],
      dtype='object')

In [29]:
export = consider.filter(items=[
    'pin',
    'id',
    'address_c49d-89sn',
    'managing_organization',
    'sq_ft',
    'ward_c49d-89sn',
    'zipcode_c49d-89sn',
    'last_update',
    'latitude_c49d-89sn',
    'longitude_c49d-89sn',
    'date_of_acquisition',
    'date_of_disposition',
]).rename(columns = {
    'address_c49d-89sn' : 'address',
    'ward_c49d-89sn' : 'ward',
    'zipcode_c49d-89sn' : 'zipcode',
    'latitude_c49d-89sn' : 'latitude',
    'longitude_c49d-89sn' : 'longitude',
})

In [30]:
export.to_json("city_owned_properties_for_map_3.json", orient="records")

In [31]:
assert export['pin'].shape == export['pin'].unique().shape

In [32]:
export.index

Int64Index([   48,    53,    58,    67,   211,   215,   216,   230,   231,
              234,
            ...
            12647, 12649, 12658, 12678, 12683, 12704, 12716, 12733, 12734,
            12737],
           dtype='int64', length=1804)