[Mountain View Spatiotemporal Analysis Working Group](http://www.meetup.com/Mountain-View-Spatiotemporal-Analysis-Working-Group/)

Follow the instructions below to rebuild the datasets from scratch.

Or you can skip the instructions and [download the datasets directly](https://drive.google.com/folderview?id=0By20iaqtOZG3V3JobkpJZi0xTTg&usp=sharing).

# Geocoding

In [84]:
import os
from dogpile.cache import make_region

# On-disk cache used to memoize geocoding lookups between notebook runs.
cache_folder = os.path.expanduser('~/Datasets')
cache_path = os.path.join(cache_folder, 'locations.dbm')
# Create the cache folder on a first run so the DBM file can be created
# inside it instead of failing on a missing directory.
if not os.path.isdir(cache_folder):
    os.makedirs(cache_folder)
file_cache = make_region().configure(
    'dogpile.cache.dbm',
    arguments = {
        'filename': cache_path,
    })

In [167]:
import json
import re
from urllib import quote_plus, urlopen
from urlparse import urljoin

# Matches a fraction such as " 1/2 " together with the whitespace around it.
PATTERN_FRACTION = re.compile(r'\s+\d+/\d+\s+')

@file_cache.cache_on_arguments()
def get_location(address):
    """
    Convert address into longitude and latitude using the
    Data Science Toolkit API.  Also, see geopy.
    http://www.datasciencetoolkit.org/developerdocs#street2coordinates

    The address is passed through normalize_address and URL-quoted before
    it is sent.  Results are memoized through file_cache.

    Returns a (longitude, latitude) tuple, or (None, None) when the
    response holds no usable coordinates for the address.

    Raises ValueError when the response body is not valid JSON.
    """
    base_url = 'http://www.datasciencetoolkit.org/street2coordinates/'
    url = urljoin(base_url, quote_plus(normalize_address(address)))
    response = urlopen(url)
    try:
        text = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    try:
        data = json.loads(text)
    except ValueError:
        raise ValueError('Could not parse "%s"' % text)
    try:
        # list(...) keeps this working where dict.values() is a view.
        value = list(data.values())[0]
        longitude = value['longitude']
        latitude = value['latitude']
    except (IndexError, KeyError, TypeError):
        # Empty response, missing coordinate keys, or a non-mapping result
        # (presumably null for an address the service cannot locate).
        return None, None
    return longitude, latitude

def normalize_address(address):
    """
    Return address with every whitespace-delimited fraction
    (see PATTERN_FRACTION) collapsed to a single space.
    """
    return PATTERN_FRACTION.sub(' ', address)

# Smoke check: geocode one known address and display (longitude, latitude).
get_location('345 Chambers St, New York, NY 10282')

(-74.01459, 40.716597)

# [NYC building violations](http://www.nyc.gov/html/hpd/html/pr/HPD-Open-Data-Violation-Files.shtml)

In [124]:
import os
import pandas

# Load the pipe-delimited building violations CSV, keeping only the columns
# used below and discarding every row that has a missing value.
source_folder = os.path.expanduser('~/Datasets/USA/NYC')
source_path = os.path.join(source_folder, 'NYC-BuildingViolations-201402.csv')
nyc_building_violations = pandas.read_csv(
    source_path,
    sep='|',
    usecols=[
        'ViolationID',
        'HouseNumber',
        'StreetName',
        'Zip',
        'Class',
        'NOVDescription',
        'NOVIssuedDate',
    ],
    parse_dates=['NOVIssuedDate'],
).dropna()

In [147]:
# Index by issue date (keeping the column) so rows can be selected with
# partial date strings.
nyc_building_violations = nyc_building_violations.set_index(
    'NOVIssuedDate', drop=False)
# Select February 2014 with .loc: plain frame['2014-02'] row selection is
# deprecated and no longer supported by current pandas releases.  Chaining
# reset_index avoids mutating a slice of the source frame in place.
nyc_building_violations_201402 = (
    nyc_building_violations.loc['2014-02'].reset_index(drop=True))
len(nyc_building_violations_201402)

36249

In [148]:
# Select the rows issued on 2014-02-03 with .loc: plain frame['2014-02-03']
# row selection is deprecated and no longer supported by current pandas
# releases.  Chaining reset_index avoids mutating a slice in place.
nyc_building_violations_20140203 = (
    nyc_building_violations.loc['2014-02-03'].reset_index(drop=True))
len(nyc_building_violations_20140203)

3259

In [172]:
def add_location(row):
    """
    Return a copy of a building-violation row extended with 'Latitude'
    and 'Longitude'.

    The address sent to get_location is built from the row's
    'HouseNumber', 'StreetName' and 'Zip' fields.  Both coordinates are
    None when get_location finds no location.
    """
    address = '%s %s, New York, NY %s' % (
        row['HouseNumber'],
        row['StreetName'],
        int(row['Zip']))  # int() formats a numeric zip without a decimal part
    longitude, latitude = get_location(address)
    # Work on a copy: DataFrame.apply does not support functions that
    # mutate the row object they are handed.
    row = row.copy()
    row['Latitude'] = latitude
    row['Longitude'] = longitude
    return row

# Geocode at most the first 5000 rows, then drop every row left with a
# missing value (which includes rows whose coordinates came back as None).
nyc_building_violations_20140203_ll = (
    nyc_building_violations_20140203[:5000]
    .apply(add_location, axis=1)
    .dropna()
)
len(nyc_building_violations_20140203_ll)

3259

In [178]:
# Write the geocoded day to a blosc-compressed HDF5 file under the key
# 'raw', then read it back and preview the first two rows.
target_folder = os.path.expanduser('~/Datasets/USA/NYC')
target_path = os.path.join(target_folder, 'NYC-BuildingViolations-20140203.h5')
nyc_building_violations_20140203_ll.to_hdf(
    target_path, key='raw', mode='w', complib='blosc', complevel=5)
pandas.read_hdf(target_path, key='raw').head(2)

Unnamed: 0,ViolationID,HouseNumber,StreetName,Zip,Class,NOVDescription,NOVIssuedDate,Latitude,Longitude
0,10119868,175,EAST 52 STREET,11203,C,"§ 27-2026, 2027 HMC: PROPERLY REPAIR THE SOURC...",2014-02-03 00:00:00,40.657023,-73.929105
1,10118191,701,GATES AVENUE,11221,A,§ 27-2013 ADM CODE PAINT WITH LIGHT COLORED PA...,2014-02-03 00:00:00,40.687663,-73.938752


# [NYC 311 service requests](https://data.cityofnewyork.us/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9)

In [126]:
import os
import pandas

# Load the 311 service requests CSV, keeping only the columns used below
# and discarding every row that has a missing value.
source_folder = os.path.expanduser('~/Datasets/USA/NYC')
source_path = os.path.join(source_folder, 'NYC-311ServiceRequests-201402.csv')
nyc_service_requests = pandas.read_csv(
    source_path,
    usecols=[
        'Unique Key',
        'Created Date',
        'Agency Name',
        'Complaint Type',
        'Incident Zip',
        'Incident Address',
    ],
    parse_dates=['Created Date'],
).dropna()

In [142]:
# Index by creation time (keeping the column) so rows can be selected with
# partial date strings.
nyc_service_requests = nyc_service_requests.set_index(
    'Created Date', drop=False)
# Select February 2014 with .loc: plain frame['2014-02'] row selection is
# deprecated and no longer supported by current pandas releases.  Chaining
# reset_index avoids mutating a slice of the source frame in place.
nyc_service_requests_201402 = (
    nyc_service_requests.loc['2014-02'].reset_index(drop=True))
len(nyc_service_requests_201402)

26728

In [143]:
# Select the requests created on 2014-02-03 with .loc: plain
# frame['2014-02-03'] row selection is deprecated and no longer supported by
# current pandas releases.  Chaining reset_index avoids mutating a slice in
# place.
nyc_service_requests_20140203 = (
    nyc_service_requests.loc['2014-02-03'].reset_index(drop=True))
len(nyc_service_requests_20140203)

911

In [171]:
def add_location(row):
    """
    Return a copy of a 311 service-request row extended with 'Latitude'
    and 'Longitude'.

    The address sent to get_location is built from the row's
    'Incident Address' and 'Incident Zip' fields.  Both coordinates are
    None when get_location finds no location.
    """
    address = '%s, New York, NY %s' % (
        row['Incident Address'],
        row['Incident Zip'])
    longitude, latitude = get_location(address)
    # Work on a copy: DataFrame.apply does not support functions that
    # mutate the row object they are handed.
    row = row.copy()
    row['Latitude'] = latitude
    row['Longitude'] = longitude
    return row

# Geocode at most the first 5000 rows, then drop every row left with a
# missing value (which includes rows whose coordinates came back as None).
nyc_service_requests_20140203_ll = (
    nyc_service_requests_20140203[:5000]
    .apply(add_location, axis=1)
    .dropna()
)
len(nyc_service_requests_20140203_ll)

911

In [180]:
# Write the geocoded day to a blosc-compressed HDF5 file under the key
# 'raw', then read it back and preview the first two rows.
target_folder = os.path.expanduser('~/Datasets/USA/NYC')
target_path = os.path.join(target_folder, 'NYC-311ServiceRequests-20140203.h5')
nyc_service_requests_20140203_ll.to_hdf(
    target_path, key='raw', mode='w', complib='blosc', complevel=5)
pandas.read_hdf(target_path, key='raw').head(2)

Unnamed: 0,Unique Key,Created Date,Agency Name,Complaint Type,Incident Zip,Incident Address,Latitude,Longitude
0,27325118,2014-02-03 23:59:26,New York City Police Department,Noise - Commercial,10128,163 EAST 92 STREET,40.783149,-73.95236
1,27325781,2014-02-03 23:58:29,Department of Parks and Recreation,Maintenance or Facility,11228,675 86TH STREET,40.619401,-74.022675


# [Seattle 911 fire department responses](https://data.seattle.gov/Public-Safety/Seattle-Real-Time-Fire-911-Calls/kzjm-xkqj)

In [174]:
import datetime
import os
import pandas

# Load the Seattle fire department responses CSV, keeping only the columns
# used below and discarding every row that has a missing value.
source_folder = os.path.expanduser('~/Datasets/USA/Seattle')
source_path = os.path.join(source_folder, 'Seattle-FireDepartmentResponses-201402.csv')
seattle_fire_responses = pandas.read_csv(
    source_path,
    usecols=[
        'Address',
        'Type',
        'Datetime',
        'Latitude',
        'Longitude',
        'Incident Number',
    ],
    parse_dates=['Datetime'],
).dropna()
len(seattle_fire_responses)

7440

In [183]:
# Index by response time (keeping the column) so rows can be selected with
# partial date strings.
seattle_fire_responses = seattle_fire_responses.set_index(
    'Datetime', drop=False)
# Select 2014-02-03 with .loc: plain frame['2014-02-03'] row selection is
# deprecated and no longer supported by current pandas releases.  Chaining
# reset_index avoids mutating a slice of the source frame in place.
seattle_fire_responses_20140203 = (
    seattle_fire_responses.loc['2014-02-03'].reset_index(drop=True))
len(seattle_fire_responses_20140203)

332

In [184]:
# The source CSV already provides 'Latitude' and 'Longitude' columns, so no
# geocoding pass is run here; the _ll name only mirrors the NYC datasets.
seattle_fire_responses_20140203_ll = seattle_fire_responses_20140203

In [185]:
# Write the selected day to a blosc-compressed HDF5 file under the key
# 'raw', then read it back and preview the first two rows.
target_folder = os.path.expanduser('~/Datasets/USA/Seattle')
target_path = os.path.join(target_folder, 'Seattle-FireDepartmentResponses-20140203.h5')
seattle_fire_responses_20140203_ll.to_hdf(
    target_path, key='raw', mode='w', complib='blosc', complevel=5)
pandas.read_hdf(target_path, key='raw').head(2)

Unnamed: 0,Address,Type,Datetime,Latitude,Longitude,Incident Number
0,4th Av N / Mercer St,Aid Response,2014-02-03 23:56:00,47.624564,-122.348877,F140012355
1,2121 8th Av,Aid Response,2014-02-03 23:47:00,47.616647,-122.33816,F140012352
