In [79]:
# Basic
from datetime import datetime, timedelta
import time

# Data Analysis Specific
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.parser import parse as date_parser
from geopy.geocoders import Nominatim

# Machine Learning Specific
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVR,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split,cross_val_score

# IPython magic
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Data directory.
# NOTE(review): not referenced by any cell visible here (the CSV loaders read
# from 'clean/') -- confirm it is still needed.
DATA_PATH = './data/'

# Bounding box of the study area as (min, max), used by generateDataMap to
# filter points and to compute grid indices.
DETROIT_LAT = (42.252, 42.452)
DETROIT_LNG = (-83.295, -82.895)

# Grid cells per unit of lat/lng: indices are int((value - min) * SIZE_RATIO).
SIZE_RATIO = 1000

# Grid dimensions are derived from the bounding box instead of being typed in
# by hand, so they cannot drift out of sync with the bounds or the ratio.
# round() absorbs the floating-point error in the subtraction (e.g. 0.2000...03).
# Despite the names, generateDataMap uses DETROIT_WIDTH as the number of rows
# (latitude bins) and DETROIT_HEIGHT as the number of columns (longitude bins).
DETROIT_WIDTH = int(round((DETROIT_LAT[1] - DETROIT_LAT[0]) * SIZE_RATIO))
DETROIT_HEIGHT = int(round((DETROIT_LNG[1] - DETROIT_LNG[0]) * SIZE_RATIO))

# A single pre-formatted string prints identically under Python 2 and Python 3.
print('%s %s %s' % (DETROIT_WIDTH, DETROIT_HEIGHT,
                    float(DETROIT_WIDTH) / DETROIT_HEIGHT))

200 400 0.5


In [3]:
# Load the four cleaned event tables; the first CSV column becomes the index.
# A generator expression is used so no loop variable leaks into the namespace.
clean_permit, clean_violation, clean_crime, clean_311 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'clean/permit.csv',
        'clean/violation.csv',
        'clean/crime.csv',
        'clean/311.csv',
    )
)

In [4]:
def generateDataMap(data, feature='Count', lat_range=None, lng_range=None,
                    size_ratio=None, shape=None):
    """Accumulate point data onto a regular 2-D lat/lng grid.

    Each row of ``data`` whose coordinates fall inside the bounding box
    (min inclusive, max exclusive) adds its ``feature`` value to the cell
    ``[int((lat - lat_min) * size_ratio), int((lng - lng_min) * size_ratio)]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'lat' and 'lng' columns, plus a ``feature`` column
        unless ``feature`` is 'Count'. The frame is not modified.
    feature : str, default 'Count'
        Column to sum per cell. The special value 'Count' adds 1 per row,
        ignoring any existing column of that name.
    lat_range, lng_range : (min, max) pair, optional
        Bounding box; default to DETROIT_LAT / DETROIT_LNG.
    size_ratio : number, optional
        Grid cells per coordinate unit; defaults to SIZE_RATIO.
    shape : (rows, cols) pair, optional
        Grid shape; defaults to (DETROIT_WIDTH, DETROIT_HEIGHT).

    Returns
    -------
    numpy.ndarray
        Float array of ``shape``; rows index latitude, columns longitude.
    """
    # Fall back to the notebook-level configuration only when not overridden.
    lat_range = DETROIT_LAT if lat_range is None else lat_range
    lng_range = DETROIT_LNG if lng_range is None else lng_range
    size_ratio = SIZE_RATIO if size_ratio is None else size_ratio
    if shape is None:
        shape = (int(DETROIT_WIDTH), int(DETROIT_HEIGHT))

    # Work on plain arrays: the original looked rows up with label-based
    # indexing using positions 0..n-1, which fails (or hits the wrong rows)
    # whenever the frame's index is not exactly 0..n-1.
    lat = np.asarray(data['lat'], dtype=float)
    lng = np.asarray(data['lng'], dtype=float)
    if feature == 'Count':
        weights = np.ones(len(data))
    else:
        weights = np.asarray(data[feature], dtype=float)

    # NaN coordinates compare False and are therefore skipped, as before.
    inside = ((lat >= lat_range[0]) & (lat < lat_range[1]) &
              (lng >= lng_range[0]) & (lng < lng_range[1]))

    # astype(int) truncates like int(); offsets are non-negative inside the box.
    rows = ((lat[inside] - lat_range[0]) * size_ratio).astype(int)
    cols = ((lng[inside] - lng_range[0]) * size_ratio).astype(int)
    # Floating-point rounding can push a point just below the upper bound
    # onto index == size; keep such points in the last cell instead of
    # raising IndexError.
    rows = np.minimum(rows, shape[0] - 1)
    cols = np.minimum(cols, shape[1] - 1)

    data_map = np.zeros(shape=shape)
    # np.add.at accumulates correctly when several points share a cell
    # (plain fancy-index "+=" would count each cell only once).
    np.add.at(data_map, (rows, cols), weights[inside])
    return data_map

In [5]:
# Violations are rasterised twice: once summing the 'JudgeAmt' column per
# cell, and once (below) with the default feature, i.e. rows per cell.
map_vj = generateDataMap(clean_violation, feature='JudgeAmt')

# Per-cell row counts for each event table (default feature 'Count').
map_v, map_t, map_c, map_p = (
    generateDataMap(events)
    for events in (clean_violation, clean_311, clean_crime, clean_permit)
)

In [9]:
def generateDataMapFrame(data_map, feature):
    """Flatten a 2-D grid into a one-column DataFrame, one row per cell.

    Cells are emitted in row-major ('C') order, so for a grid with ``n_cols``
    columns the frame row ``r * n_cols + c`` holds ``data_map[r, c]``. The
    frame has a default 0..n-1 index, which lets frames built from grids of
    the same shape be concatenated column-wise and stay cell-aligned.

    Parameters
    ----------
    data_map : numpy.ndarray
        Grid to flatten.
    feature : str
        Name of the single output column.

    Returns
    -------
    pandas.DataFrame
        One column named ``feature`` with ``data_map.size`` rows.
    """
    # The original also materialised 'lat'/'lng' index columns (via a
    # quadratic reduce over list concatenation) and dropped them right away;
    # only the flattened values ever reached the caller.
    return pd.DataFrame({feature: data_map.flatten('C')})

In [52]:
# Flatten every grid into a single-column frame; the second element of each
# pair is the column name that grid gets in the training table.
frame_vj, frame_v, frame_p, frame_c, frame_t = (
    generateDataMapFrame(grid, column)
    for grid, column in (
        (map_vj, 'vj'),
        (map_v, 'v'),
        (map_p, 'p'),
        (map_c, 'c'),
        (map_t, 't'),
    )
)

In [53]:
# Join the five single-column frames side by side (axis=1): one row per grid
# cell, one column per feature ('vj', 'v', 'p', 'c', 't').
raw_train = pd.concat(
    (frame_vj, frame_v, frame_p, frame_c, frame_t),
    axis=1,
)

In [46]:
def balanced_dtrain(raw, clf=True, random_state=None):
    """Build a class-balanced training table from the per-cell feature frame.

    Rows whose values sum to zero are discarded. The remaining rows are split
    on the 'p' column into positives (``p > 0``) and negatives (``p == 0``),
    and the larger group is randomly downsampled to the size of the smaller.

    Parameters
    ----------
    raw : pandas.DataFrame
        Numeric frame containing a 'p' column. Not modified.
    clf : bool, default True
        If True, 'p' is replaced by the boolean ``p > 0`` (classification
        target); if False the original 'p' values are kept.
    random_state : int or numpy.random.RandomState, optional
        Passed to ``DataFrame.sample`` so the downsampling is reproducible.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Equal numbers of positive and negative rows, sorted by index.
    """
    non_zero = raw[raw.sum(axis=1) != 0].copy()
    if clf:
        non_zero['p'] = non_zero['p'] > 0

    positive = non_zero[non_zero['p'] > 0]
    negative = non_zero[non_zero['p'] == 0]

    # Downsample whichever class is larger. The original always sampled
    # len(positive) negatives, which raises ValueError as soon as there are
    # fewer negatives than positives.
    n_per_class = min(len(positive), len(negative))
    if len(negative) > n_per_class:
        negative = negative.sample(n_per_class, random_state=random_state)
    if len(positive) > n_per_class:
        positive = positive.sample(n_per_class, random_state=random_state)

    return pd.concat([negative, positive]).sort_index()

In [54]:
# Balanced table with 'p' turned into a boolean target (clf=True is the default).
dtrain = balanced_dtrain(raw=raw_train, clf=True)

In [55]:
# Hold out 20% of the balanced rows for testing. The seed is fixed so that
# the scores reported below can be reproduced on a fresh kernel.
train, test = train_test_split(dtrain, test_size=0.2, random_state=0)

In [56]:
# Random forest with 100 trees; seeded so the fitted model and its scores
# are reproducible across runs.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(train.drop('p', axis=1), train['p'])
# Accuracy on the rows the model was fitted on; compare with the held-out
# score in the next cell.
print(rfc.score(train.drop('p', axis=1), train['p']))

0.921411265899


In [57]:
# Accuracy of the fitted forest on the held-out rows.
rfc.score(X=test.drop('p', axis=1), y=test['p'])

0.58958837772397099

In [72]:
# AdaBoost with 2000 boosting rounds; seeded so the fitted model and its
# scores are reproducible across runs.
adbc = AdaBoostClassifier(n_estimators=2000, random_state=0)
adbc.fit(train.drop('p', axis=1), train['p'])
# Accuracy on the rows the model was fitted on.
print(adbc.score(train.drop('p', axis=1), train['p']))

0.662174439733


In [73]:
# Accuracy of the fitted AdaBoost model on the held-out rows.
print(adbc.score(X=test.drop('p', axis=1), y=test['p']))

0.621670702179


In [84]:
# Mean 5-fold cross-validated accuracy of AdaBoost on the full balanced
# table; the estimator is seeded so the figure is reproducible.
clf = AdaBoostClassifier(n_estimators=2000, random_state=0)
cross_val_score(clf, dtrain.drop('p', axis=1), dtrain['p'], cv=5).mean()

0.62609846650524614

In [None]:
# Mean 5-fold cross-validated accuracy of a 500-tree random forest on the
# full balanced table; the estimator is seeded so the figure is reproducible.
clf = RandomForestClassifier(n_estimators=500, random_state=0)
cross_val_score(clf, dtrain.drop('p', axis=1), dtrain['p'], cv=5).mean()

0.59799691833590141

In [None]:
# Mean 5-fold cross-validated accuracy of a linear-kernel SVM on the full
# balanced table.
# NOTE(review): no output is recorded for this cell -- confirm it finishes in
# reasonable time on the unscaled features.
clf = SVC(kernel='linear')
np.mean(
    cross_val_score(
        clf,
        X=dtrain.drop('p', axis=1),
        y=dtrain['p'],
        cv=5,
    )
)