# ETL for the Grant 2015 Snow Emergency Tows Data Set
The .CSV file for the Grant 2015 Snow Emergency does not contain the latitudes and longitudes of the towing incidents.  The GeoJSON file does include this information.  We can use a GeoPandas GeoDataFrame to manipulate this GeoJSON to extract the longitude and latitude information.  We perform the following steps:

1.  Fill missing coordinates by geocoding the given address information with Google's geocoding service (geopy's `GoogleV3` geocoder).
2.  Fill in the missing Ward, Community, and Neighborhood information using shapely's `polygon.contains(point)` functionality.  We have GeoJSONs with the boundaries for Minneapolis Wards, Communities, and Neighborhoods.  For each point, we can write a function that returns the corresponding Ward, Community, and Neighborhood.
3.  Convert the GeoDataFrame to an ordinary Pandas DataFrame by replacing the geometry column with a pair of columns: latitude and longitude.
4.  Split the datetime string into a date and time field.
5.  Write the final data frame to a `.csv` file to be combined with the other cleaned files.

In [1]:
# import the dependencies
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import os
import geopy

from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from geopy.geocoders import GoogleV3

from api_keys import api_key

In [2]:
# constants related to input and output

DATA_DIR = "data"  # default directory that load_data() reads from
GRANT_FILE = "Snow_Emergency_Grant_Tows_2015.geojson"  # tow records loaded into grant_tows
OUTPUT_DIR = "output"  # directory for the temporary and final CSV files
WARDS_FILE = "City_Council_Wards.geojson"  # ward boundary polygons
COMMUNITIES_FILE = "Communities.geojson"  # community boundary polygons
NEIGHBORHOODS_FILE = "Minneapolis_Neighborhoods.geojson"  # neighborhood boundary polygons
OUTPUT_FILE = "Grant_Tows.csv"  # name of the final cleaned CSV

In [5]:
# a helper function to read files in.  GeoPandas read_file can load GeoJSONs

def load_data(filename, data_dir=DATA_DIR):
    """Read a geospatial file (e.g. a GeoJSON) from `data_dir` with GeoPandas.

    Parameters
    ----------
    filename : name of the file inside `data_dir`.
    data_dir : directory that holds the file; defaults to DATA_DIR.

    Returns
    -------
    Whatever `gpd.read_file` returns for the joined path.
    """
    return gpd.read_file(os.path.join(data_dir, filename))

In [6]:
# Load the Grant snow emergency tow records and preview the first rows.
grant_tows = load_data(GRANT_FILE)
grant_tows.head()

Unnamed: 0,FID,X,Y,Call_Taken,Location,Day,geometry
0,1,-10380250.0,5623291.0,2015-12-29T00:00:00,2326 Central Ave NE,Day 1,POINT (-93.24740688343616 45.01123618428048)
1,2,-10383340.0,5618287.0,2015-12-29T00:00:00,618 1ST N,Day 1,POINT (-93.27510825313226 44.97945116173709)
2,3,-10383080.0,5615996.0,2015-12-29T00:00:00,1824 3rd ave s,Day 1,POINT (-93.27281155185176 44.96488946723947)
3,4,-10380800.0,5614751.0,2015-12-29T00:00:00,2519 bloomington ave s,Day 1,POINT (-93.25235251858196 44.95697462618004)
4,5,-10380230.0,5626288.0,2015-12-29T00:00:00,3447 Central Ave NE,Day 1,POINT (-93.24719240972475 45.03026988637581)


In [10]:
grant_tows.shape

(1736, 7)

# 1.  Fill in Missing Geometries

In [11]:
# how many observations are missing geometry?
grant_tows.geometry.isnull().sum()

180

In [14]:
# prepare to geocode missing geometries:
# the key comes from the local api_keys module imported at the top of the notebook
geolocator = GoogleV3(api_key=api_key)

In [18]:
# Loop over the GeoDataFrame; if a row's geometry is missing, geocode it from
# its Location text.
import time

unresolved = []  # addresses the geocoder could not match

# Iterate over the actual index labels rather than range(n), so the .loc
# lookups stay correct even if the frame does not have a default 0..n-1 index.
for idx in grant_tows.index:
    if grant_tows.loc[idx, 'geometry']:
        continue  # this row already has coordinates
    address = grant_tows.loc[idx, 'Location'] + ", Minneapolis, MN"
    print(address)
    result = geolocator.geocode(address)
    time.sleep(2) # avoid the API rate limit
    if result is None:
        # geocode() returns None when nothing matches; record the miss
        # instead of failing on result.longitude and aborting the whole loop.
        unresolved.append(address)
        continue
    grant_tows.loc[idx, 'geometry'] = Point(result.longitude, result.latitude)

# Report the rows that still have no geometry so they can be fixed by hand.
if unresolved:
    print("Could not geocode {} address(es): {}".format(len(unresolved), unresolved))

University and 8th ave, Minneapolis, MN
395 2nd st s e, Minneapolis, MN
NE Grand & Lowry, Minneapolis, MN
528 e 15th st, Minneapolis, MN
Grand & Lowry, Minneapolis, MN
1923 1rst st s, Minneapolis, MN
E 16th st/ Chicago, Minneapolis, MN
3330 Fremont, Minneapolis, MN
Grand & Lowry, Minneapolis, MN
2321 n 2nd st, Minneapolis, MN
3550 E 46TH ST, Minneapolis, MN
528 e15th st, Minneapolis, MN
4th and 8th, Minneapolis, MN
604 e15th st, Minneapolis, MN
503 E 15th St, Minneapolis, MN
3540, Minneapolis, MN
408 erie nd ave se, Minneapolis, MN
1416 e 41st st, Minneapolis, MN
2418  N 3rd st, Minneapolis, MN
15th / oak grove, Minneapolis, MN
801 n. washington ave., Minneapolis, MN
17th como ave s e, Minneapolis, MN
13th Ave & Grand, Minneapolis, MN
13th Ave & Grand, Minneapolis, MN
2752 e 38 st, Minneapolis, MN
916 N Washington Ave, Minneapolis, MN
13th Ave & Grand, Minneapolis, MN
2205 blaisdell s, Minneapolis, MN
2245 blaidsdell ave, Minneapolis, MN
rear, Minneapolis, MN
2311 blaisedale s, Minneap

In [19]:
grant_tows.geometry.isnull().sum()

0

In [20]:
# Save the work done so far to avoid needing to geocode again.
# Create the output directory first: to_csv does not create missing folders.
os.makedirs(OUTPUT_DIR, exist_ok=True)
grant_tows.to_csv(os.path.join(OUTPUT_DIR, "grant_temp.csv"))

# 2.  Determine Wards, Communities, Neighborhoods

In [21]:
# Load the ward boundary polygons and preview the first rows.
wards = load_data(WARDS_FILE)
wards.head()

Unnamed: 0,FID,BDNUM,Shape_STAr,Shape_STLe,geometry
0,1,1,204415900.0,71797.801731,"POLYGON ((-93.2268508021494 45.0132152671148, ..."
1,2,9,72901470.0,42091.44637,"POLYGON ((-93.2431848712514 44.955829056264, -..."
2,3,3,106205700.0,57104.348729,"POLYGON ((-93.2631316871618 45.0131674841788, ..."
3,4,4,147170400.0,57462.048968,"POLYGON ((-93.2991995476941 45.0511367258392, ..."
4,5,8,78692300.0,47568.406676,"POLYGON ((-93.2747782291209 44.9483536669077, ..."


In [42]:
# A helper function for determining the ward of a point:

def find_ward(place, wards_df=wards):
    """Return the BDNUM of the first ward whose polygon contains `place`.

    Rows are tested in the order `itertuples()` yields them, using each
    row's `geometry.contains(place)`.  Returns None when no row matches.
    """
    matches = (
        ward.BDNUM
        for ward in wards_df.itertuples()
        if ward.geometry.contains(place)
    )
    # The generator is lazy, so testing stops at the first containing polygon.
    return next(matches, None)


In [43]:
# Look up the ward for every tow location (None where find_ward finds no match).
grant_tows['Ward'] = grant_tows.geometry.map(find_ward)

In [44]:
grant_tows.head()

Unnamed: 0,FID,X,Y,Call_Taken,Location,Day,geometry,Ward
0,1,-10380250.0,5623291.0,2015-12-29T00:00:00,2326 Central Ave NE,Day 1,POINT (-93.24740688343616 45.01123618428048),1
1,2,-10383340.0,5618287.0,2015-12-29T00:00:00,618 1ST N,Day 1,POINT (-93.27510825313226 44.97945116173709),3
2,3,-10383080.0,5615996.0,2015-12-29T00:00:00,1824 3rd ave s,Day 1,POINT (-93.27281155185176 44.96488946723947),6
3,4,-10380800.0,5614751.0,2015-12-29T00:00:00,2519 bloomington ave s,Day 1,POINT (-93.25235251858196 44.95697462618004),9
4,5,-10380230.0,5626288.0,2015-12-29T00:00:00,3447 Central Ave NE,Day 1,POINT (-93.24719240972475 45.03026988637581),1


In [45]:
# Load the community boundary polygons and preview the first rows.
communities = load_data(COMMUNITIES_FILE)
communities.head()

Unnamed: 0,FID,CommName,geometry
0,1,Camden,"POLYGON ((-93.31949186501051 45.0512462469094,..."
1,2,Northeast,"POLYGON ((-93.22685080614011 45.0132165472863,..."
2,3,Near North,"POLYGON ((-93.3178372365291 45.013260021455, -..."
3,4,Central,"POLYGON ((-93.27211391203549 44.9921313605316,..."
4,5,University,"POLYGON ((-93.2076071706838 45.0015313002637, ..."


In [46]:
# A helper function for determining the community of a point:

def find_community(place, community_df=communities):
    """Return the CommName of the first community whose polygon contains `place`.

    Rows are tested in the order `itertuples()` yields them, using each
    row's `geometry.contains(place)`.  Returns None when no row matches.
    """
    matches = (
        community.CommName
        for community in community_df.itertuples()
        if community.geometry.contains(place)
    )
    # The generator is lazy, so testing stops at the first containing polygon.
    return next(matches, None)


In [47]:
# Look up the community for every tow location (None where find_community finds no match).
grant_tows['Community'] = grant_tows.geometry.map(find_community)

In [48]:
grant_tows.head()

Unnamed: 0,FID,X,Y,Call_Taken,Location,Day,geometry,Ward,Community
0,1,-10380250.0,5623291.0,2015-12-29T00:00:00,2326 Central Ave NE,Day 1,POINT (-93.24740688343616 45.01123618428048),1,Northeast
1,2,-10383340.0,5618287.0,2015-12-29T00:00:00,618 1ST N,Day 1,POINT (-93.27510825313226 44.97945116173709),3,Central
2,3,-10383080.0,5615996.0,2015-12-29T00:00:00,1824 3rd ave s,Day 1,POINT (-93.27281155185176 44.96488946723947),6,Central
3,4,-10380800.0,5614751.0,2015-12-29T00:00:00,2519 bloomington ave s,Day 1,POINT (-93.25235251858196 44.95697462618004),9,Phillips
4,5,-10380230.0,5626288.0,2015-12-29T00:00:00,3447 Central Ave NE,Day 1,POINT (-93.24719240972475 45.03026988637581),1,Northeast


In [49]:
# Load the neighborhood boundary polygons and preview the first rows.
neighborhoods = load_data(NEIGHBORHOODS_FILE)
neighborhoods.head()

Unnamed: 0,FID,BDNAME,BDNUM,TEXT_NBR,Shape_STAr,Shape_STLe,NCR_LINK,IMAGE,geometry
0,1,Phillips West,90,90,10669250.0,14403.885934,http://www.nrp.org/r2/Neighborhoods/Orgs/PHW.html,PHW,"POLYGON ((-93.2625807586419 44.9609082137146, ..."
1,2,Downtown West,87,87,20756130.0,19220.602541,http://www.nrp.org/r2/Neighborhoods/Orgs/DTN.html,DTN,"POLYGON ((-93.2601055025157 44.9829952758614, ..."
2,3,Downtown East,88,88,10254990.0,13436.601356,http://www.nrp.org/r2/Neighborhoods/Orgs/DTN.html,DTN,"POLYGON ((-93.2449864570206 44.9789336625517, ..."
3,4,Ventura Village,89,89,12635260.0,16988.532717,http://www.nrp.org/r2/Neighborhoods/Orgs/VEN.html,VEN,"POLYGON ((-93.24957700344829 44.9662967560422,..."
4,5,Sumner - Glenwood,29,29,5741860.0,11065.343364,http://www.nrp.org/r2/Neighborhoods/Orgs/SGL.html,SGL,"POLYGON ((-93.2882976528817 44.9890356035354, ..."


In [50]:
neighborhoods.shape

(87, 9)

In [51]:
# A helper function for determining the neighborhood of a point:

def find_neighborhood(place, neighborhood_df=neighborhoods):
    """Return the BDNAME of the first neighborhood whose polygon contains `place`.

    Rows are tested in the order `itertuples()` yields them, using each
    row's `geometry.contains(place)`.  Returns None when no row matches.
    """
    matches = (
        hood.BDNAME
        for hood in neighborhood_df.itertuples()
        if hood.geometry.contains(place)
    )
    # The generator is lazy, so testing stops at the first containing polygon.
    return next(matches, None)

In [52]:
# Look up the neighborhood for every tow location (None where find_neighborhood finds no match).
grant_tows['Neighborhood'] = grant_tows.geometry.map(find_neighborhood)
grant_tows.head()

Unnamed: 0,FID,X,Y,Call_Taken,Location,Day,geometry,Ward,Community,Neighborhood
0,1,-10380250.0,5623291.0,2015-12-29T00:00:00,2326 Central Ave NE,Day 1,POINT (-93.24740688343616 45.01123618428048),1,Northeast,Holland
1,2,-10383340.0,5618287.0,2015-12-29T00:00:00,618 1ST N,Day 1,POINT (-93.27510825313226 44.97945116173709),3,Central,Downtown West
2,3,-10383080.0,5615996.0,2015-12-29T00:00:00,1824 3rd ave s,Day 1,POINT (-93.27281155185176 44.96488946723947),6,Central,Steven's Square - Loring Heights
3,4,-10380800.0,5614751.0,2015-12-29T00:00:00,2519 bloomington ave s,Day 1,POINT (-93.25235251858196 44.95697462618004),9,Phillips,East Phillips
4,5,-10380230.0,5626288.0,2015-12-29T00:00:00,3447 Central Ave NE,Day 1,POINT (-93.24719240972475 45.03026988637581),1,Northeast,Waite Park


In [53]:
# Save the work done so far to avoid needing to geocode again.
# Create the output directory first: to_csv does not create missing folders.
os.makedirs(OUTPUT_DIR, exist_ok=True)
grant_tows.to_csv(os.path.join(OUTPUT_DIR, "grant_temp.csv"))

# 3.  Drop the geometry column.  Add two columns for Longitude and Latitude.

In [54]:
def point_longitude(point):
    """Return the `x` attribute of `point`, i.e. its longitude."""
    longitude = point.x
    return longitude

def point_latitude(point):
    """Return the `y` attribute of `point`, i.e. its latitude."""
    latitude = point.y
    return latitude

# Copy the point coordinates into plain numeric columns (x -> Longitude, y -> Latitude).
# The .x / .y accessors of the geometry column are used directly; the
# point_longitude / point_latitude helpers defined above are not called here.
grant_tows['Longitude'] = grant_tows['geometry'].x
grant_tows['Latitude'] = grant_tows['geometry'].y
grant_tows.head()

Unnamed: 0,FID,X,Y,Call_Taken,Location,Day,geometry,Ward,Community,Neighborhood,Longitude,Latitude
0,1,-10380250.0,5623291.0,2015-12-29T00:00:00,2326 Central Ave NE,Day 1,POINT (-93.24740688343616 45.01123618428048),1,Northeast,Holland,-93.247407,45.011236
1,2,-10383340.0,5618287.0,2015-12-29T00:00:00,618 1ST N,Day 1,POINT (-93.27510825313226 44.97945116173709),3,Central,Downtown West,-93.275108,44.979451
2,3,-10383080.0,5615996.0,2015-12-29T00:00:00,1824 3rd ave s,Day 1,POINT (-93.27281155185176 44.96488946723947),6,Central,Steven's Square - Loring Heights,-93.272812,44.964889
3,4,-10380800.0,5614751.0,2015-12-29T00:00:00,2519 bloomington ave s,Day 1,POINT (-93.25235251858196 44.95697462618004),9,Phillips,East Phillips,-93.252353,44.956975
4,5,-10380230.0,5626288.0,2015-12-29T00:00:00,3447 Central Ave NE,Day 1,POINT (-93.24719240972475 45.03026988637581),1,Northeast,Waite Park,-93.247192,45.03027


# 4.  Convert Call_Taken to a Date String and Time String.  Note that the Times are all 00:00:00

In [57]:
import dateutil.parser as dparser
from datetime import datetime

# Try the parser on the first Call_Taken value and preview the two output
# formats (date as MM/DD/YYYY, time as HH:MM) before applying them to every row.
test = dparser.parse(grant_tows.loc[0, 'Call_Taken'], fuzzy=True)
print(test.strftime("%m/%d/%Y"))
print(test.strftime("%H:%M"))

12/29/2015
00:00


In [58]:
# Helper functions to map over the `Call_Taken` column to get the date and time strings

def get_date(call_string):
    """Return the date in `call_string` formatted as MM/DD/YYYY.

    The string is parsed with dateutil's parser in fuzzy mode.
    """
    return dparser.parse(call_string, fuzzy=True).strftime("%m/%d/%Y")

def get_time(call_string):
    """Return the time of day in `call_string` formatted as HH:MM (24-hour).

    The string is parsed with dateutil's parser in fuzzy mode.
    """
    return dparser.parse(call_string, fuzzy=True).strftime("%H:%M")

# Split Call_Taken into separate Date and Time string columns.
# Each helper parses the string itself, so every value is parsed twice.
grant_tows['Date'] = grant_tows.Call_Taken.map(get_date)
grant_tows['Time'] = grant_tows.Call_Taken.map(get_time)
grant_tows.head()

Unnamed: 0,FID,X,Y,Call_Taken,Location,Day,geometry,Ward,Community,Neighborhood,Longitude,Latitude,Date,Time
0,1,-10380250.0,5623291.0,2015-12-29T00:00:00,2326 Central Ave NE,Day 1,POINT (-93.24740688343616 45.01123618428048),1,Northeast,Holland,-93.247407,45.011236,12/29/2015,00:00
1,2,-10383340.0,5618287.0,2015-12-29T00:00:00,618 1ST N,Day 1,POINT (-93.27510825313226 44.97945116173709),3,Central,Downtown West,-93.275108,44.979451,12/29/2015,00:00
2,3,-10383080.0,5615996.0,2015-12-29T00:00:00,1824 3rd ave s,Day 1,POINT (-93.27281155185176 44.96488946723947),6,Central,Steven's Square - Loring Heights,-93.272812,44.964889,12/29/2015,00:00
3,4,-10380800.0,5614751.0,2015-12-29T00:00:00,2519 bloomington ave s,Day 1,POINT (-93.25235251858196 44.95697462618004),9,Phillips,East Phillips,-93.252353,44.956975,12/29/2015,00:00
4,5,-10380230.0,5626288.0,2015-12-29T00:00:00,3447 Central Ave NE,Day 1,POINT (-93.24719240972475 45.03026988637581),1,Northeast,Waite Park,-93.247192,45.03027,12/29/2015,00:00


# Convert the Day column to an integer - to be uniform with the other data sets.

In [59]:
grant_tows.Day.unique()

array(['Day 1', 'Day 2', 'Day 3'], dtype=object)

In [64]:
# 'Day 1' -> 1: take the second whitespace-separated token and cast it to int.
# Re-running this cell after the conversion fails, because ints have no .split().
grant_tows['Day'] = grant_tows.Day.map(lambda x: int(x.split()[1]))
grant_tows.head()

Unnamed: 0,FID,X,Y,Call_Taken,Location,Day,geometry,Ward,Community,Neighborhood,Longitude,Latitude,Date,Time
0,1,-10380250.0,5623291.0,2015-12-29T00:00:00,2326 Central Ave NE,1,POINT (-93.24740688343616 45.01123618428048),1,Northeast,Holland,-93.247407,45.011236,12/29/2015,00:00
1,2,-10383340.0,5618287.0,2015-12-29T00:00:00,618 1ST N,1,POINT (-93.27510825313226 44.97945116173709),3,Central,Downtown West,-93.275108,44.979451,12/29/2015,00:00
2,3,-10383080.0,5615996.0,2015-12-29T00:00:00,1824 3rd ave s,1,POINT (-93.27281155185176 44.96488946723947),6,Central,Steven's Square - Loring Heights,-93.272812,44.964889,12/29/2015,00:00
3,4,-10380800.0,5614751.0,2015-12-29T00:00:00,2519 bloomington ave s,1,POINT (-93.25235251858196 44.95697462618004),9,Phillips,East Phillips,-93.252353,44.956975,12/29/2015,00:00
4,5,-10380230.0,5626288.0,2015-12-29T00:00:00,3447 Central Ave NE,1,POINT (-93.24719240972475 45.03026988637581),1,Northeast,Waite Park,-93.247192,45.03027,12/29/2015,00:00


In [65]:
# Keep only the cleaned columns, as a plain pandas DataFrame (the geometry column is left out).
# NOTE(review): 'Day' was converted to an integer above but is not selected here -
# confirm whether it should be part of the final output.
grant_tows_final = pd.DataFrame(grant_tows[['Date', 'Time', 'Location', 'Latitude', 'Longitude', 'Ward', 'Community', 'Neighborhood']])

In [66]:
grant_tows_final.head()

Unnamed: 0,Date,Time,Location,Latitude,Longitude,Ward,Community,Neighborhood
0,12/29/2015,00:00,2326 Central Ave NE,45.011236,-93.247407,1,Northeast,Holland
1,12/29/2015,00:00,618 1ST N,44.979451,-93.275108,3,Central,Downtown West
2,12/29/2015,00:00,1824 3rd ave s,44.964889,-93.272812,6,Central,Steven's Square - Loring Heights
3,12/29/2015,00:00,2519 bloomington ave s,44.956975,-93.252353,9,Phillips,East Phillips
4,12/29/2015,00:00,3447 Central Ave NE,45.03027,-93.247192,1,Northeast,Waite Park


In [67]:
# Tag every row with the name of this snow emergency, so the rows stay
# identifiable once this file is combined with the other cleaned files.
grant_tows_final['Emergency'] = 'Grant'
grant_tows_final.head()

Unnamed: 0,Date,Time,Location,Latitude,Longitude,Ward,Community,Neighborhood,Emergency
0,12/29/2015,00:00,2326 Central Ave NE,45.011236,-93.247407,1,Northeast,Holland,Grant
1,12/29/2015,00:00,618 1ST N,44.979451,-93.275108,3,Central,Downtown West,Grant
2,12/29/2015,00:00,1824 3rd ave s,44.964889,-93.272812,6,Central,Steven's Square - Loring Heights,Grant
3,12/29/2015,00:00,2519 bloomington ave s,44.956975,-93.252353,9,Phillips,East Phillips,Grant
4,12/29/2015,00:00,3447 Central Ave NE,45.03027,-93.247192,1,Northeast,Waite Park,Grant


In [69]:
# Write the final cleaned table without the index column.
# Create the output directory first: to_csv does not create missing folders.
os.makedirs(OUTPUT_DIR, exist_ok=True)
grant_tows_final.to_csv(os.path.join(OUTPUT_DIR, OUTPUT_FILE), index=False)