[Mountain View Spatiotemporal Analysis Working Group](http://www.meetup.com/Mountain-View-Spatiotemporal-Analysis-Working-Group/)

# Load dataset

In [1]:
import os
import pandas

source_folder = os.path.expanduser('~/Datasets/USA/NYC')
source_path = os.path.join(source_folder, 'NYC-BuildingViolations-20140203.h5')
nyc_building_violations = pandas.read_hdf(source_path, 'raw')
nyc_building_violation_locations = nyc_building_violations[
    ['Longitude', 'Latitude']].values
nyc_building_violations[:2]

Unnamed: 0,ViolationID,HouseNumber,StreetName,Zip,Class,NOVDescription,NOVIssuedDate,Latitude,Longitude
0,10119868,175,EAST 52 STREET,11203,C,"§ 27-2026, 2027 HMC: PROPERLY REPAIR THE SOURC...",2014-02-03 00:00:00,40.657023,-73.929105
1,10118191,701,GATES AVENUE,11221,A,§ 27-2013 ADM CODE PAINT WITH LIGHT COLORED PA...,2014-02-03 00:00:00,40.687663,-73.938752


In [2]:
from geometryIO import get_transformPoint, proj4LL

# http://www.spatialreference.org/ref/epsg/3627/
proj4_nyc = (
    '+proj=lcc +lat_1=41.03333333333333 +lat_2=40.66666666666666 +lat_0=40.16666666666666 '
    '+lon_0=-74 +x_0=300000 +y_0=0 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs ')
transform_point = get_transformPoint(proj4LL, proj4_nyc)
untransform_point = get_transformPoint(proj4_nyc, proj4LL)
print tuple(nyc_building_violation_locations[0])
print untransform_point(*transform_point(*nyc_building_violation_locations[0]))

(-73.929105000000007, 40.657023000000002)
(-73.92910499999999, 40.657022999999846)


In [3]:
import numpy as np

nyc_building_violation_xys = np.array([
    transform_point(*_) for _ in nyc_building_violation_locations])
nyc_building_violation_xys[:2]

array([[ 305995.57869204,   54454.31792567],
       [ 305177.35457836,   57856.19856483]])

In [4]:
len(nyc_building_violation_xys)

3259

# Filter by class

According to the [NYC Department of Housing Preservation and Development](http://www.nyc.gov/html/hpd/downloads/pdf/ABCs-housing-singlepg.pdf), Class C violations are immediately hazardous.

In [5]:
indices = nyc_building_violations['Class'] == 'C'

nyc_building_violation_c = nyc_building_violations[indices]
nyc_building_violation_c.reset_index(drop=True, inplace=True)
nyc_building_violation_c_xys = nyc_building_violation_xys[indices]
len(nyc_building_violation_c)

1044

# Decide inspection order

In [6]:
unique_xys = list(set(tuple(xy) for xy in nyc_building_violation_c_xys))
len(unique_xys)

2

In [7]:
unique_xys

[(305177.35457835509, 57856.198564833117),
 (305995.57869203627, 54454.317925673342)]

In [8]:
from itertools import izip
from networkx import Graph
from networkx.algorithms import minimum_spanning_tree
from libraries.kdtree import KDTree

kdtree = KDTree(unique_xys)
candidate_graph = Graph()
for source_index, source_row in nyc_building_violation_c.iterrows():
    source_xy = transform_point(source_row['Longitude'], source_row['Latitude'])
    target_distances, target_indices = kdtree.query(source_xy, maximum_count=3)
    for distance, target_index in izip(target_distances, target_indices):
        target_row = nyc_building_violation_c.ix[target_index]
        candidate_graph.add_edge(
            source_row['ViolationID'],
            target_row['ViolationID'],
            distance=distance)
optimized_graph = minimum_spanning_tree(candidate_graph)

In [9]:
from networkx.algorithms import dfs_edges
list(dfs_edges(optimized_graph))[:10]

[(10117121, 10119868),
 (10119868, 10119153),
 (10119868, 10119172),
 (10119868, 10117126),
 (10119868, 10117127),
 (10119868, 10119178),
 (10119868, 10117131),
 (10119868, 10117132),
 (10119868, 10115086),
 (10119868, 10118831)]