In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer

%matplotlib inline

In [65]:
# Building coordinates: only the lat/lng columns of the buildings file are kept.
clusters = pd.read_csv('clean/buildings.csv')[['lat', 'lng']]

# Pre-cleaned event tables. NOTE: unpickling can execute arbitrary code, so
# these files must come from a trusted source.
clean_311 = pd.read_pickle('clean/311.pickle')
clean_crime = pd.read_pickle('clean/crime.pickle')
clean_violation = pd.read_pickle('clean/violation.pickle')
clean_permit = pd.read_pickle('clean/permit.pickle')
def cut_date(df, min_year=2005):
    """Return the rows of ``df`` whose ``date`` falls in ``min_year`` or later.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``date`` column whose values expose a ``.year`` attribute.
    min_year : int, optional
        First calendar year to keep (inclusive). Defaults to 2005.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows; ``df`` itself is untouched.
    """
    keep = df['date'].map(lambda stamp: stamp.year) >= min_year
    # Explicit copy: the result gets new columns assigned later on, and a
    # plain boolean slice would trigger SettingWithCopyWarning at that point.
    return df[keep].copy()
# Apply the date cut-off to every event table.
clean_permit, clean_violation, clean_crime, clean_311 = map(cut_date, [clean_permit, clean_violation, clean_crime, clean_311])
# Call form of print works under both Python 2 and Python 3
# (the statement form `print len(...)` is a SyntaxError on Python 3).
print(len(clusters))

96463


In [79]:
# Preview of the building coordinates. nrows=10 parses only the first ten rows
# instead of loading and rendering the whole file a second time.
pd.read_csv('clean/buildings.csv', nrows=10)[['lat','lng']]

Unnamed: 0,lat,lng
0,0.714869,0.903169
1,0.720103,0.750076
2,0.761821,0.905622
3,0.985508,0.903681
4,0.111348,0.371944
5,0.747155,0.904098
6,0.757334,0.900653
7,0.758097,0.900395
8,0.756953,0.900783
9,0.575037,0.935626


In [80]:
# 1-nearest-neighbour index over the building coordinates.
# The coordinate columns are selected explicitly: later cells add feature
# columns to `clusters`, and re-running this cell after them would otherwise fit
# the index on all of those columns, breaking kneighbors() on 2-column queries.
nbrs = NearestNeighbors(n_neighbors=1).fit(clusters[['lat', 'lng']])

In [67]:
def find_nearest(df, model=None):
    """Tag every row of ``df`` with the index of its nearest fitted point.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``lat`` and ``lng`` columns.
    model : object with a ``kneighbors`` method, optional
        Neighbour index to query. Defaults to the module-level ``nbrs``.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus a ``neighbor`` column holding the
        first (closest) index returned by ``kneighbors`` for each row. The
        input frame is not modified.
    """
    if model is None:
        model = nbrs
    _, indices = model.kneighbors(df[['lat', 'lng']])
    # kneighbors returns a 2-D (n_rows, n_neighbors) array; take the closest
    # neighbour explicitly rather than assigning a 2-D array to a column.
    return df.assign(neighbor=indices[:, 0])
# Attach the nearest-building index to every event table.
clean_permit, clean_violation, clean_crime, clean_311 = (
    find_nearest(clean_permit),
    find_nearest(clean_violation),
    find_nearest(clean_crime),
    find_nearest(clean_311),
)

In [68]:
# Per-building violation features, aligned to `clusters` by index:
#   judge     - mean JudgeAmt of the violations matched to the building
#   violation - number of matched violations with a non-null JudgeAmt
# Built-in aggregations on one shared groupby replace the per-group Python
# lambdas that went through .apply().
judge_by_building = clean_violation.groupby('neighbor')['JudgeAmt']
clusters['judge'] = judge_by_building.mean()
clusters['violation'] = judge_by_building.count()

In [69]:
# Number of permits (rows with a non-null addr) matched to each building;
# buildings with no matched permit get 0.
permit_counts = clean_permit.groupby('neighbor')['addr'].count()
clusters['permit'] = permit_counts.reindex(clusters.index).fillna(0)

In [70]:
# One count column per crime category (named crime_<Category>) for each building.
crime_by_category = (
    pd.get_dummies(clean_crime['Category'], prefix='crime')
    .groupby(clean_crime['neighbor'])
    .sum()
)
# Drop any copies of these columns left by an earlier run of this cell, so
# re-executing it does not append duplicate feature columns.
clusters = pd.concat(
    [clusters.drop(crime_by_category.columns, axis=1, errors='ignore'), crime_by_category],
    axis=1,
)
# Zero-fill every remaining NaN in clusters (not only the new columns).
clusters = clusters.fillna(0)

In [71]:
# One count column per 311 category (named 311_<Category>) for each building.
calls_by_category = (
    pd.get_dummies(clean_311['Category'], prefix='311')
    .groupby(clean_311['neighbor'])
    .sum()
)
# Drop any copies of these columns left by an earlier run of this cell, so
# re-executing it does not append duplicate feature columns.
clusters = pd.concat(
    [clusters.drop(calls_by_category.columns, axis=1, errors='ignore'), calls_by_category],
    axis=1,
)
# Zero-fill every remaining NaN in clusters (not only the new columns).
clusters = clusters.fillna(0)

In [72]:
# Total number of crimes (rows with a non-null Category) matched to each
# building; buildings with no matched crime get 0.
crime_totals = clean_crime.groupby('neighbor')['Category'].count()
clusters['crime'] = crime_totals.reindex(clusters.index).fillna(0)

In [78]:
# Show only the first rows of the feature table instead of the whole frame.
clusters.head(10)

Unnamed: 0,lat,lng,judge,violation,permit,crime_0,crime_1,crime_2,crime_3,crime_4,...,311_14,311_15,311_16,311_17,311_18,311_19,311_20,311_21,311_22,crime
0,0.714869,0.903169,360.00,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.720103,0.750076,346.25,8.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.761821,0.905622,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.985508,0.903681,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.111348,0.371944,0.00,0.0,1.0,7.0,2.0,2.0,0.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0
5,0.747155,0.904098,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.757334,0.900653,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.758097,0.900395,0.00,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.756953,0.900783,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.575037,0.935626,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
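# Disabled outlier cap, kept for reference.
# NOTE: as written, the line below would overwrite *every* column of the
# matching rows with 1000, not just the violation count. To cap only that
# column use: clusters.loc[clusters['violation'] > 1000, 'violation'] = 1000
# clusters[clusters['violation']>1000]=1000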

In [74]:
def balanced_dtrain(raw, clf=True, sample=True, target='permit', random_state=None):
    """Build a class-balanced training frame from ``raw``.

    Parameters
    ----------
    raw : pandas.DataFrame
        Feature table that includes the ``target`` column.
    clf : bool, optional
        If True, replace ``target`` with the boolean ``target > 0``; if False,
        keep the original values.
    sample : bool, optional
        If True, randomly downsample the larger of the two groups
        (``target > 0`` vs ``target == 0``) to the size of the smaller one.
    target : str, optional
        Name of the target column.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible sampling.

    Returns
    -------
    pandas.DataFrame
        The selected rows, sorted by index.
    """
    # NOTE(review): the row sum runs over every column of `raw`, so always
    # non-zero columns (e.g. coordinates) make this filter a near no-op —
    # confirm whether those columns were meant to be excluded.
    non_zero = raw[raw.sum(axis=1) != 0].copy()
    if clf:
        non_zero[target] = non_zero[target] > 0
    positive = non_zero[non_zero[target] > 0]
    negative = non_zero[non_zero[target] == 0]
    if sample:
        # Downsample whichever group is larger. Sampling only the negatives
        # raised ValueError whenever positives outnumbered negatives.
        n_keep = min(len(positive), len(negative))
        if len(negative) > n_keep:
            negative = negative.sample(n_keep, random_state=random_state)
        if len(positive) > n_keep:
            positive = positive.sample(n_keep, random_state=random_state)
    return pd.concat([negative, positive]).sort_index()

In [75]:
# Classification data: balanced sample with a boolean permit target; the
# coordinates are dropped so they are not used as features.
dataset = balanced_dtrain(clusters).drop(['lat', 'lng'], axis=1)
y = dataset['permit']
X = dataset.loc[:, dataset.columns != 'permit']

In [76]:
# 1000 depth-1 boosted trees; mean ROC AUC over 5 cross-validation folds.
clf = GradientBoostingClassifier(max_depth=1, n_estimators=1000)
np.mean(cross_val_score(clf, X, y, scoring="roc_auc", cv=5))

0.77828436205013318

In [77]:
# 500 depth-3 boosted trees; mean ROC AUC over 5 cross-validation folds.
clf = GradientBoostingClassifier(max_depth=3, n_estimators=500)
np.mean(cross_val_score(clf, X, y, scoring="roc_auc", cv=5))

0.78866177872044285

In [81]:
# Regression data: same balanced sample, but the permit target keeps its
# original values (clf=False).
dataset = balanced_dtrain(clusters, clf=False).drop(['lat', 'lng'], axis=1)
y = dataset['permit']
X = dataset.loc[:, dataset.columns != 'permit']
# Score = fraction of rows where "prediction >= 1" agrees with "truth >= 1".
scorer = make_scorer(lambda y_true, y_pred: np.mean((y_true >= 1) == (y_pred >= 1)))

In [82]:
# 500 depth-3 boosted regression trees; mean custom score over 5 folds.
rgr = GradientBoostingRegressor(max_depth=3, n_estimators=500)
np.mean(cross_val_score(rgr, X, y, scoring=scorer, cv=5))

0.70072989339019187

In [83]:
# 800 depth-3 boosted regression trees; mean custom score over 5 folds.
rgr = GradientBoostingRegressor(max_depth=3, n_estimators=800)
np.mean(cross_val_score(rgr, X, y, scoring=scorer, cv=5))

0.69699758351101637

In [84]:
# 800 depth-2 boosted regression trees; mean custom score over 5 folds.
rgr = GradientBoostingRegressor(max_depth=2, n_estimators=800)
np.mean(cross_val_score(rgr, X, y, scoring=scorer, cv=5))

0.70051638948116557