In [1]:
# Basic
from datetime import datetime, timedelta
import time

# Data Analysis Specific
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopy

# Marchine Learning Specific
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

# IPython magic
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
DATA_PATH = './data/'
DETROIT_LAT = (42.252, 42.452)
DETROIT_LNG = (-83.295, -82.895)
SIZE_RATIO = 1000
DETROIT_WIDTH = 200
DETROIT_HEIGHT = 400
print DETROIT_WIDTH, DETROIT_HEIGHT, float(DETROIT_WIDTH)/DETROIT_HEIGHT

200 400 0.5


In [3]:
raw_violation = pd.read_csv(DATA_PATH + 'detroit-blight-violations.csv', low_memory=False)
raw_311 = pd.read_csv(DATA_PATH + 'detroit-311.csv', low_memory=False)
raw_crime = pd.read_csv(DATA_PATH + 'detroit-crime.csv', low_memory=False)
raw_permit = pd.read_csv(DATA_PATH + 'detroit-demolition-permits.tsv.csv', low_memory=False)

# Cleaning permit

In [35]:
clean_permit = pd.read_csv('data/clean_permit.csv',index_col=0)

In [36]:
from dateutil.parser import parse as date_parser

In [37]:
def my_date_parser(dt):
    try:
        return date_parser(dt)
    except:
        return None
permit_date = raw_permit['PERMIT_APPLIED'].map(lambda x: my_date_parser(x))

In [38]:
clean_permit['date'] = permit_date
clean_permit.to_csv('clean/permit.csv')

In [41]:
clean_permit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7133 entries, 0 to 7132
Data columns (total 3 columns):
lat     6889 non-null float64
lng     6889 non-null float64
date    7130 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2)
memory usage: 222.9 KB


# Cleaning violation

In [75]:
def violation_lat_lng():
#     address = raw_violation['ViolationAddress'].map(lambda x: x.split('\n')[0])
    lat_lng = raw_violation['ViolationAddress'].map(lambda x: x.split('\n')[-1][1:-1])
    lat = lat_lng.map(lambda x: float(x.split(',')[0]))
    lng = lat_lng.map(lambda x: float(x.split(',')[1]))
    addr = pd.DataFrame()
    addr['lat'] = lat
    addr['lng'] = lng
    addr = addr.apply(lambda x: None if 42.331<=x.lat<42.332 and -83.048<=x.lng<-83.047 else x, axis = 1)
    return addr

In [79]:
violation_tmp = violation_lat_lng()

In [84]:
violation_tmp.isnull()[violation_tmp.isnull()].dropna().index

Int64Index([     8,     26,     47,     48,     49,     50,    150,    156,
               157,    158,
            ...
            307513, 307514, 307515, 307609, 307662, 307787, 307788, 307796,
            307797, 307803],
           dtype='int64', length=21119)

In [85]:
miss_violation_index = violation_tmp.isnull()[violation_tmp.isnull()].dropna().index

In [88]:
address = raw_violation['ViolationAddress'].map(lambda x: x.split('\n')[0])
miss_violation_addr = address[miss_violation_index]

In [60]:
from geopy.geocoders import Nominatim
def parse_addr(addr):
    geolocator = Nominatim()
    addr= addr+', Detroit'
    try:
        loc = geolocator.geocode(addr) 
        lat, lon = float(loc.latitude), float(loc.longitude)   
    except:
        print 'Cannot parse %s' % addr
        return (np.nan,np.nan)
    return lat, lon

In [89]:
miss_violation_lat_lng = miss_violation_addr.map(lambda x:parse_addr(x))

Cannot parse 82 MONTANA, Detroit
Cannot parse 654 MT, Detroit
Cannot parse 41 ADAMS, Detroit
Cannot parse 5449 VERNOR, Detroit
Cannot parse 657 MULLETT, Detroit
Cannot parse 312 WEST END, Detroit
Cannot parse 17101 JOHN, Detroit
Cannot parse 15326 G A, Detroit
Cannot parse 260 SCHWEITZER PL, Detroit
Cannot parse 260 SCHWEITZER PL, Detroit
Cannot parse 260 SCHWEITZER PL, Detroit
Cannot parse 260 SCHWEITZER PL, Detroit
Cannot parse 9635 VAN, Detroit
Cannot parse 211 EUCLID, Detroit
Cannot parse 6000 NEVADA, Detroit
Cannot parse 40 MONTANA, Detroit
Cannot parse 130 MONTANA, Detroit
Cannot parse 181 MONTANA, Detroit
Cannot parse 3131 BEATRICE, Detroit
Cannot parse 4305 BUENA VISTA, Detroit
Cannot parse 1500 BUENA VISTA, Detroit
Cannot parse 118 COLUMBIA, Detroit
Cannot parse 2457 BUENA VISTA, Detroit
Cannot parse 28 GOLDEN, Detroit
Cannot parse 12060 SANTA, Detroit
Cannot parse 12753 SANTA, Detroit
Cannot parse 3029 BUENA VISTA, Detroit
Cannot parse 3336 BUENA VISTA, Detroit
Cannot parse 1

In [90]:
miss_violation_lat_lng

8              (20.6555017, -103.3503161)
26         (42.4173941875, -83.122188625)
47             (20.6555017, -103.3503161)
48        (42.2661497879, -83.1591363636)
49        (42.2610254857, -83.1636500857)
50         (42.259967697, -83.1602879394)
150            (20.6555017, -103.3503161)
156            (20.6555017, -103.3503161)
157            (20.6555017, -103.3503161)
158            (20.6555017, -103.3503161)
159            (20.6555017, -103.3503161)
212       (42.2630850808, -83.1574056263)
214       (42.2630850808, -83.1574056263)
221       (42.3216108163, -83.0926776939)
277              (42.357374, -83.0626299)
298              (42.4171429, -83.140933)
299              (42.4171429, -83.140933)
304             (-33.390742, -70.5588784)
501            (20.6555017, -103.3503161)
502            (20.6555017, -103.3503161)
522              (42.4171429, -83.140933)
551       (42.2669172222, -83.1668251111)
552       (42.2688694444, -83.1649252929)
623             (40.3598275, -83.7

In [93]:
parse_addr('220 CONGRESS, Detroit')

Cannot parse 220 CONGRESS, Detroit, Detroit


(nan, nan)

In [59]:
violation_lat_lng().to_csv('clean/violation.csv')

In [45]:
raw_violation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307804 entries, 0 to 307803
Data columns (total 31 columns):
TicketID                 307804 non-null int64
TicketNumber             307804 non-null object
AgencyName               307804 non-null object
ViolName                 307802 non-null object
ViolationStreetNumber    307804 non-null int64
ViolationStreetName      307804 non-null object
MailingStreetNumber      307802 non-null object
MailingStreetName        307804 non-null object
MailingCity              307382 non-null object
MailingState             306916 non-null object
MailingZipCode           307377 non-null object
NonUsAddressCode         425 non-null object
Country                  18484 non-null object
TicketIssuedDT           307804 non-null object
TicketIssuedTime         307804 non-null object
HearingDT                307804 non-null object
CourtTime                307804 non-null object
ViolationCode            307804 non-null object
ViolDescription          307804

In [56]:
raw_violation['HearingDT'].unique()

array(['01/01/38474 12:00:00 AM', '01/01/38425 12:00:00 AM',
       '01/01/38422 12:00:00 AM', ..., '06/08/2005 12:00:00 AM',
       '07/21/2015 12:00:00 AM', '08/20/2015 12:00:00 AM'], dtype=object)

In [55]:
['Department of Public Works',
       'Building and Safety Engineering Department',
       'Detroit Police Department', 'Health Department',
       'Neighborhood City Halls']

['Department of Public Works',
 'Building and Safety Engineering Department',
 'Detroit Police Department',
 'Health Department',
 'Neighborhood City Halls']