<a href="https://colab.research.google.com/github/edudzikorku/police-data-viz/blob/main/police_use_of_force_geocoding_workflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction

Two distinct geocoding services, namely `Nominatim` and `MapBox`, were employed. The addresses in the initial dataframe, comprising 10,579 rows and 36 columns, were first geocoded with the `Nominatim` service; this step took **3 hours, 10 minutes, and 27 seconds** &#x1F631;. Addresses that `Nominatim` could not resolve were then geocoded with `MapBox`.

In [None]:
from google.colab import drive

# attach Google Drive to this runtime; the input CSVs are read from MyDrive later on
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

In [None]:
%%capture
!pip install fiona shapely pyproj rtree
!pip install geopandas
!pip install tqdm
!pip install keplergl

In [None]:
# imports

# data management
import os
import re
from getpass import getpass

import requests
import pandas as pd
from tqdm import tqdm
import geopandas as gpd
from geopy.geocoders import Nominatim
from geopy.geocoders import MapBox as mb
from geopy.extra.rate_limiter import RateLimiter as rtl

# data visualization
from keplergl import KeplerGl as kgl
from google.colab import output
# turn on Colab's custom widget manager before any widget is created
# NOTE(review): presumably required for the KeplerGl map widget to render in Colab — confirm
output.enable_custom_widget_manager()

# file download
from google.colab import files

In [None]:

# GeoJSON export from the City of Chicago data portal
url = "https://data.cityofchicago.org/api/geospatial/5jrd-6zik?method=export&format=GeoJSON"

# submit HTTP GET request to the URL
# the timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout = 60)

# fail loudly on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()

# parse response content as JSON
data = response.json()

# create geodataframe from the GeoJSON features, declaring EPSG:4326 as the CRS
census_tracts = gpd.GeoDataFrame.from_features(data['features'], crs = 4326)

In [None]:
# columns to keep from the census tract layer, in their final order
tract_columns = [
    'name10', 'commarea_n', 'namelsad10', 'commarea', 'geoid10',
    'tractce10', 'countyfp10', 'statefp10', 'notes', 'geometry',
]
census_tracts = census_tracts[tract_columns]

In [None]:
# create output folders
!mkdir -p output/html
!mkdir -p output/csv
!mkdir -p output/shp
!mkdir -p output/geojson

In [None]:
# folder on the mounted Drive that holds the input CSVs
# (named data_dir so the built-in dir() is not shadowed for the rest of the session)
data_dir = "/content/drive/MyDrive/final/"

officer_profiles = pd.read_csv(data_dir + "officer_profiles.csv")
trr = pd.read_csv(data_dir + "tactical_response_reports.csv")

In [None]:
# replace 'XX' at the end of each value in 'block' column with '99'
trr['block'] = trr['block'].str.replace("XX$", '99', regex = True)

# fill empty 'street_direction' rows with an empty string
# (the result is assigned back: calling fillna(inplace = True) on a selected column is
# deprecated chained assignment and leaves the frame untouched under copy-on-write)
trr['street_direction'] = trr['street_direction'].fillna('')

# create a new column, combining block number, street direction and name
trr['address'] = trr['block'] + ' ' + trr['street_direction'] + ' ' + trr['street_name'] + ',' + 'Chicago, IL'

In [None]:
# register progress_apply on pandas objects so the geocoding run shows a progress bar
tqdm.pandas()

# Nominatim client with a 10-second timeout per request
geolocator = Nominatim(timeout = 10, user_agent = 'edudzi')

# rate-limited wrapper: consecutive lookups are spaced at least one second apart
geocode = rtl(geolocator.geocode, min_delay_seconds = 1)

# look up every address and keep whatever the geocoder returns for each row
trr['loc'] = trr['address'].progress_apply(geocode)

# extract longitude and latitude values from the geocoding output,
# leaving None wherever a row has no usable result
for column, attribute in (('lon', 'longitude'), ('lat', 'latitude')):
    trr[column] = trr['loc'].apply(
        lambda hit, attr = attribute: getattr(hit, attr) if hit else None
    )

100%|██████████| 10579/10579 [3:10:27<00:00,  1.08s/it]


In [None]:

# extract the rows whose first geocoding pass returned no result ('loc' is missing)
# .copy() makes this an independent frame, so writing columns to it later does not
# raise SettingWithCopyWarning or depend on view-versus-copy semantics
missed_addresses = trr[trr['loc'].isna()].copy()

In [None]:
# use mapbox api to geocode missed addresses

# set up api key
# the key is read from the MAPBOX_API_KEY environment variable, with an interactive
# prompt as fallback, so the secret never lives in the notebook source
# NOTE(review): the token that was previously committed in this cell should be rotated
api_key = os.environ.get("MAPBOX_API_KEY") or getpass("MapBox API key: ")

# track progress of geocoding
tqdm.pandas()

# set up geolocator
geolocator = mb(api_key = api_key, timeout = 10)

# set up geocoder (at least one second between consecutive requests)
geocode = rtl(geolocator.geocode, min_delay_seconds = 1)

# geocode missed addresses
# assign() returns a new frame, so no column is written onto a slice of trr
missed_addresses = missed_addresses.assign(
    loc = missed_addresses['address'].progress_apply(geocode)
)

In [None]:
# get rows whose geocoding result mentions either Chicago or Illinois (case-insensitive)
# str() is applied first so rows without a result (None) simply fail the match;
# astype(bool) keeps the mask boolean even when there are no rows at all
in_chicago = missed_addresses['loc'].apply(
    lambda result: re.search(r'Chicago|Illinois', str(result), flags = re.I) is not None
).astype(bool)

# move these rows into a new dataframe
# the mask is a standalone Series, so no helper column has to be written to
# (and afterwards dropped from) the frame
filtered_addresses = missed_addresses[in_chicago]

In [None]:
# update the original dataframe with the accepted MapBox results
# only the 'loc' column is passed: every other column of filtered_addresses is an
# unchanged copy of trr's own values, and pushing them through update() may
# needlessly alter the dtypes of those columns
trr.update(filtered_addresses[['loc']])

# extract longitude and latitude values from the updated dataframe
# (rows that still have no geocoding result get None)
trr['lon'] = trr['loc'].apply(lambda x: x.longitude if x else None)
trr['lat'] = trr['loc'].apply(lambda y: y.latitude if y else None)

In [None]:

# coordinates entered by hand for addresses that are still missing,
# keyed by row label -> (lat, lon)
manual_coordinates = {
    42: (41.9591811, -87.7455555),
    1122: (41.9111737, -87.6288557),
    2215: (41.9242993, -87.6746744),
    4430: (41.8961202, -87.6226977),
    4706: (41.9328091, -87.6408568),
    7818: (41.8068764, -87.5904713),
    8340: (41.9111736, -87.6308942),
}

# write each pair into the matching row
for row_label, lat_lon in manual_coordinates.items():
    trr.loc[row_label, ['lat', 'lon']] = lat_lon

In [None]:

# trr_id values of reports whose geocoded location was found to be wrong
# (each id listed once; the earlier list repeated 51093 and 93993)
wrong_addresses_list = [
    13536, 13538, 18086, 48786, 93990, 51093, 93989, 52286, 93993,
    52285, 9666, 12147, 89755, 56741, 59413, 70059, 67452, 5436,
    5744, 94006, 92306, 49626, 96194, 32853, 75463, 52284,
]

# longitudes at or below this value are also treated as wrong
# NOTE(review): presumably -88 marks points too far west to be in Chicago — confirm
lon_cutoff = -88

# rows to inspect: flagged by id, or with a longitude at or beyond the cutoff
wrong_addresses = trr[trr['trr_id'].isin(wrong_addresses_list) | (trr['lon'] <= lon_cutoff)]

In [None]:

# hand-entered replacements for wrongly geocoded rows, keyed by row label -> (lat, lon)
corrected_coordinates = {
    226: (41.901102, -87.632786),
    707: (41.9790949, -87.9086181),
    1117: (41.8770972, -87.6196328),
    2021: (41.7958495, -87.6341916),
    2108: (41.771447, -87.666484),
    3791: (41.8705074, -87.880885),
    3792: (41.975873, -87.877662),
    4437: (41.8021588, -87.6180554),
    4466: (41.9790949, -87.9086181),
    4467: (41.9790949, -87.9086181),
    4493: (41.8032843, -87.5883122),
    4581: (41.8130107, -87.6210351),
    4728: (41.8705074, -87.8808855),
    4841: (41.914057, -87.636782),
    4969: (41.9802452, -87.9115595),
    5016: (41.901604, -87.871881),
    5102: (41.900064, -87.618182),
    5742: (41.890743, -87.778450),
    5950: (41.906400, -87.899423),
    6314: (41.977579, -87.871156),
    6733: (41.890955, -87.615203),
    7683: (41.976101, -87.900891),
    8168: (41.901604, -87.871881),
    8401: (41.9802452, -87.9115595),
    8666: (41.890103, -87.632498),
    9181: (41.871481, -87.631633),
    10423: (41.695996, -87.845314),
    10472: (41.9130493, -87.6373731),
    10567: (41.9790949, -87.9086181),
}

# overwrite the stored coordinates of each listed row
for row_label, lat_lon in corrected_coordinates.items():
    trr.loc[row_label, ['lat', 'lon']] = lat_lon

In [None]:
# copy of the reports without the geocoder result column; 'lon' and 'lat' remain
trr_sub = trr.drop('loc', axis = 'columns')

In [None]:
# build the kepler.gl map widget with the geocoded reports as its only dataset
map_datasets = {'data': trr_sub}
mp = kgl(height = 500, data = map_datasets, name = 'Tactical Response Reports, Chicago')

# leaving the widget as the last expression renders it below the cell
mp

In [None]:
# one point per report, built from its lon/lat pair (x = lon, y = lat)
report_points = gpd.points_from_xy(trr_sub['lon'], trr_sub['lat'])

# attach the points to the attribute table, declaring EPSG:4326 as the CRS
trr_gdf = gpd.GeoDataFrame(trr_sub, geometry = report_points, crs = 4326)

In [None]:

# save output
# index = False leaves the row labels out of the CSV
# NOTE(review): this writes trr, which still contains the raw 'loc' geocoder column;
# trr_sub may have been intended — confirm
trr.to_csv("/content/output/csv/trr_geocoded.csv", index = False)
trr_gdf.to_file("/content/output/geojson/trr_geocoded.geojson", driver='GeoJSON')
trr_gdf.to_file("/content/output/shp/trr_geocoded.shp")

# save map
# the folder created for HTML output is output/html; the previous html/output path
# pointed at a directory that is never created, so the map could not be written
# and would not have been part of the zipped output folder
mp.save_to_html(data = {'data': trr_gdf},
                file_name = '/content/output/html/tactical_response_reports.html',
                config = mp.config)

In [None]:
# recursively bundle everything under /content/output into a single zip archive
!zip -r /content/output/trr.zip /content/output

# hand the archive to google.colab's files.download
files.download("/content/output/trr.zip")