In [1]:
import requests
from invisibleroads_macros.disk import make_folder
from os.path import expanduser, join

def download(target_path, source_url):
    response = requests.get(source_url)
    open(target_path, 'w').write(response.content)
    return target_path

target_folder = make_folder(expanduser('~/Experiments/spatiotemporal'))
geojson_path = join(target_folder, 'nyc-traffic-injuries.json')
shapefile_path = join(target_folder, 'nyc-traffic-injuries.shp.zip')

In [2]:
"""
geojson_path = download(
    join(target_folder, 'nyc-traffic-injuries.json'), 
    'http://www.nyc.gov/html/dot/downloads/misc/injury_all_monthly.json')
shapefile_path = download(
    join(target_folder, 'nyc-traffic-injuries.shp.zip'),
    'http://www.nyc.gov/html/dot/downloads/misc/injury_all_monthly_shapefile.zip')
""";

In [6]:
"""
import fiona
geojson_collection = fiona.open(geojson_path)
print 'geojson_collection.bounds = %s' % repr(geojson_collection.bounds)
print 'geojson_collection.crs_wkt = %s' % geojson_collection.crs_wkt
print 'geojson_collection.crs = %s' % geojson_collection.crs
geojson_collection[0]
""";

In [23]:
import fiona
shapefile_collection = fiona.open('/', vfs='zip://' + shapefile_path)
print 'shapefile_collection.bounds = %s' % repr(shapefile_collection.bounds)
print 'shapefile_collection.crs_wkt = %s' % shapefile_collection.crs_wkt
print 'shapefile_collection.crs = %s' % shapefile_collection.crs
shapefile_collection[0]

shapefile_collection.bounds = (-74.2539230306024, 40.49947769792743, -73.70059800086655, 40.91246913562538)
shapefile_collection.crs_wkt = GEOGCS["GCS_WGS_1984",DATUM["WGS_1984",SPHEROID["WGS_84",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
shapefile_collection.crs = {'init': u'epsg:4326'}


{'geometry': {'coordinates': (-73.7917447266822, 40.72578884918672),
  'type': 'Point'},
 'id': '0',
 'properties': OrderedDict([(u'Injuries', 2),
              (u'PedInjurie', 1),
              (u'BikeInjuri', 0),
              (u'MVOInjurie', 1),
              (u'MN', u'1'),
              (u'YR', u'2009')]),
 'type': 'Feature'}

In [24]:
from pandas import DataFrame, Period
rows, indices = [], []
for value_by_key in shapefile_collection:
    indices.append(value_by_key['id'])
    longitude, latitude = map(float, value_by_key['geometry']['coordinates'])
    properties = value_by_key['properties']
    year, month = int(properties['YR']), int(properties['MN'])
    total_injury_count = int(properties['Injuries'])
    pedestrian_injury_count = int(properties['PedInjurie'])
    bike_injury_count = int(properties['BikeInjuri'])
    motor_vehicle_occupant_injury_count = int(properties['MVOInjurie'])
    rows.append([
        longitude, latitude, year, month,
        total_injury_count, pedestrian_injury_count, bike_injury_count, motor_vehicle_occupant_injury_count,
    ])
nyc_traffic_injuries_table = DataFrame(rows, index=indices, columns=[
    'Longitude', 'Latitude', 'Year', 'Month',
    'Total Injury Count', 'Pedestrian Injury Count', 'Bike Injury Count', 'Motor Vehicle Occupant Injury Count',
])
print(nyc_traffic_injuries_table.dtypes)

In [26]:
from os.path import expanduser
nyc_traffic_injuries_table.to_msgpack(expanduser('~/Experiments/spatiotemporal/nyc-traffic-injuries.msg-blosc'), compress='blosc')
nyc_traffic_injuries_table.to_msgpack(expanduser('~/Experiments/spatiotemporal/nyc-traffic-injuries.msg-zlib'), compress='zlib')

In [27]:
from os.path import expanduser
from pandas import read_msgpack
nyc_traffic_injuries_table = read_msgpack(expanduser('~/Experiments/spatiotemporal/nyc-traffic-injuries.msg-blosc'))
print 'nyc_traffic_injury_count = %s' % len(nyc_traffic_injury_table)
nyc_traffic_injuries_table[:2]

nyc_traffic_injury_count = 199341


Unnamed: 0,Longitude,Latitude,Year,Month,Total Injury Count,Pedestrian Injury Count,Bike Injury Count,Motor Vehicle Occupant Injury Count
0,-73.791745,40.725789,2009,1,2,1,0,1
1,-73.882429,40.844981,2009,1,1,1,0,0


In [28]:
type(nyc_traffic_injuries_table[:2]['Year'][0])

numpy.int64

In [30]:
nyc_traffic_injuries_table[:2]['Year'][0]

2009

In [33]:
from pandas import Period
Period(year=2009, month=1, freq='M')

Period('2009-01', 'M')

In [43]:
# Replace integer index with time series index
# This method seems exceedingly slow and memory hungry
# It might be faster to just filter by year and month manually
from pandas import Period

def add_time_period(row):
    row['Time Period'] = Period(year=row['Year'], month=row['Month'], freq='M')
    return row

nyc_traffic_injuries_by_month_table = nyc_traffic_injuries_table.apply(add_time_period, axis=1)
nyc_traffic_injuries_by_month_table.index = nyc_traffic_injuries_by_month_table['Time Period']

In [44]:
nyc_traffic_injuries_table.to_pickle(expanduser('~/Experiments/spatiotemporal/nyc-traffic-injuries.pkl'))
nyc_traffic_injuries_by_month_table.to_pickle(expanduser('~/Experiments/spatiotemporal/nyc-traffic-injuries-by-month.pkl'))

In [59]:
timeit len(nyc_traffic_injuries_by_month_table['2015-01':'2015-03'])

100 loops, best of 3: 9.17 ms per loop


In [52]:
print(len(nyc_traffic_injuries_by_month_table['2015-01':'2015-01']))
print(len(nyc_traffic_injuries_by_month_table['2015-02':'2015-02']))
print(len(nyc_traffic_injuries_by_month_table['2015-03':'2015-03']))

1992
1749
2072


In [53]:
1992 + 1749 + 2072

5813

In [57]:
# Filter by dates
start_year = 2015
start_month = 1
end_year = 2015
end_month = 3

In [60]:
timeit len(nyc_traffic_injuries_table[(
    start_year <= nyc_traffic_injuries_table.Year
) & (
    nyc_traffic_injuries_table.Year <= end_year
) & (
    start_month <= nyc_traffic_injuries_table.Month
) & (
    nyc_traffic_injuries_table.Month <= end_month
)])

SyntaxError: invalid syntax (<ipython-input-60-30c9ae8b48d0>, line 1)

In [61]:
timeit len(nyc_traffic_injuries_table[(start_year <= nyc_traffic_injuries_table.Year) & (nyc_traffic_injuries_table.Year <= end_year) & (start_month <= nyc_traffic_injuries_table.Month) & (nyc_traffic_injuries_table.Month <= end_month)])

100 loops, best of 3: 3.25 ms per loop


In [65]:
# Identify most dangerous by location and month
nyc_traffic_injuries_table.sort_values('Total Injury Count', ascending=False)[:10]

Unnamed: 0,Longitude,Latitude,Year,Month,Total Injury Count,Pedestrian Injury Count,Bike Injury Count,Motor Vehicle Occupant Injury Count
164489,-73.938508,40.679034,2013,9,43,0,0,43
182054,-73.755328,40.724723,2015,4,32,2,0,30
81178,-73.940593,40.750524,2011,4,28,0,0,28
114932,-73.940646,40.805737,2012,6,26,1,0,25
134328,-73.945642,40.692033,2013,7,25,0,0,25
175983,-73.9296,40.644433,2015,1,24,0,0,24
15864,-73.935858,40.677715,2009,10,22,0,0,22
179181,-74.094752,40.64077,2015,2,22,0,0,22
186586,-73.921619,40.753028,2015,6,22,0,0,22
192535,-73.881133,40.685063,2015,8,22,0,0,22


In [69]:
# Identify most dangerous by location and year
import numpy as np
g = nyc_traffic_injuries_table.groupby(['Longitude', 'Latitude'])
g.aggregate(np.sum)

ValueError: buffer source array is read-only

In [None]:
# Find all within radius
# Plot histogram

In [None]:
# Predict next month
# Predict next year

In [None]:
# Find all where trend is increasing month to month
# Find all where trend is increasing year to year

In [None]:
# Cluster into areas
# Run trend again

In [None]:
# Map results to png

In [None]:
# Map results to leaflet

In [None]:
# Animate results (what would we animate?)

In [None]:
# Given dates, return ranked table of intersections

In [None]:
# Given dates, address, radius, return ranked table of intersections
# Illustrates kdtree

In [None]:
# Given dates, addresses, return ranked table within bounding box
# Show histogram

In [None]:
# Given dates, addresses, mode of transport, return ranked routes

In [None]:
# Given dates, target date, project trends
# Illustrates basic trend forecasting

In [None]:
# Find most dangerous center
# Illustrates minimizing a loss function