In [32]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score, train_test_split

# Get the cleaned data

This cleaned data was produced in several steps. I tried to:
* Get rid of "strange" records, such as points that sit on the default coordinates of the city of Detroit. Some of these were cleaned using their address.
* Get rid of points that seem far from Detroit.
* Normalize lat and lng with min-max scaling.
* Clean the addresses in the building list.

In [19]:
# Load the four pre-cleaned event tables from their pickles, in a fixed order.
clean_permit, clean_violation, clean_crime, clean_311 = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'clean/permit.pickle',
        'clean/violation.pickle',
        'clean/crime.pickle',
        'clean/311.pickle',
    )
)
# Building list: only the coordinate columns are kept.
clusters = pd.read_csv('clean/buildings.csv')[['lat', 'lng']]

In [20]:
# Keep only the records dated 2005 or later.
def _from_2005_onwards(df):
    """Return the rows of `df` whose `date` value has a year of 2005 or later."""
    record_years = df.date.map(lambda stamp: stamp.year)
    return df[record_years >= 2005]


clean_permit, clean_violation, clean_crime, clean_311 = (
    _from_2005_onwards(frame)
    for frame in (clean_permit, clean_violation, clean_crime, clean_311)
)

In [21]:
# Index the building coordinates. Only the coordinate columns are used, so this
# cell can be re-run after `clusters` has gained feature columns.
nbrs = NearestNeighbors(n_neighbors=1).fit(clusters[['lat', 'lng']])


def find_nearest(df):
    """Tag every row of `df` with the position of its nearest building.

    Parameters
    ----------
    df : DataFrame with `lat` and `lng` columns.

    Returns
    -------
    A copy of `df` with an extra integer column `neighbor` holding the row
    position (in the frame `nbrs` was fitted on) of the closest building.
    The input frame is left untouched, so passing a filtered slice does not
    trigger pandas' SettingWithCopy warning.
    """
    _, indices = nbrs.kneighbors(df[['lat', 'lng']])
    tagged = df.copy()
    # kneighbors returns an (n_rows, 1) array here; keep the single column as 1-D.
    tagged['neighbor'] = indices[:, 0]
    return tagged


clean_permit, clean_violation, clean_crime, clean_311 = map(
    find_nearest, [clean_permit, clean_violation, clean_crime, clean_311])

In [22]:
# Per-building violation statistics: mean judgment amount and number of
# non-null judgment amounts. Both are aligned to `clusters` by building index.
clusters['judge'] = clean_violation.groupby('neighbor')['JudgeAmt'].mean()
clusters['violation'] = clean_violation.groupby('neighbor')['JudgeAmt'].count()

In [23]:
# Number of permits (non-null `addr` entries) attached to each building.
clusters['permit'] = clean_permit.groupby('neighbor')['addr'].count()
# Buildings with no permit come out as NaN after alignment; count them as 0.
clusters['permit'] = clusters['permit'].fillna(0)

In [24]:
# One-hot encode the crime category, then sum the indicators per building to
# get one count column per category.
crime_counts = (
    pd.get_dummies(clean_crime['Category'], prefix='crime')
    .groupby(clean_crime['neighbor'])
    .sum()
)
# Append the counts column-wise; every NaN left in the frame becomes 0.
clusters = pd.concat([clusters, crime_counts], axis=1).fillna(0)

In [25]:
# One-hot encode the 311 category, then sum the indicators per building to
# get one count column per category.
calls_311_counts = (
    pd.get_dummies(clean_311['Category'], prefix='311')
    .groupby(clean_311['neighbor'])
    .sum()
)
# Append the counts column-wise; every NaN left in the frame becomes 0.
clusters = pd.concat([clusters, calls_311_counts], axis=1).fillna(0)

In [26]:
# clusters['crime'] = clean_crime.groupby('neighbor').apply(lambda g: g['Category'].count())
# clusters['crime'] = clusters['crime'].fillna(0)

In [31]:
# Preview the first three rows of the per-building feature table.
clusters.head(3)

Unnamed: 0,lat,lng,judge,violation,permit,crime_0,crime_1,crime_2,crime_3,crime_4,...,311_13,311_14,311_15,311_16,311_17,311_18,311_19,311_20,311_21,311_22
0,0.714869,0.903169,360.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.720103,0.750076,346.25,8.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.761821,0.905622,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
def balanced_dtrain(raw, clf=True, sample=True, target='permit', random_state=None):
    """Build a training frame, optionally binarised and class-balanced.

    Parameters
    ----------
    raw : DataFrame of numeric columns that includes `target`.
    clf : if True, replace `target` by the boolean `target > 0`.
    sample : if True, down-sample the rows with `target == 0` to the number
        of rows with `target > 0`.
    target : name of the target column.
    random_state : forwarded to `DataFrame.sample` so the down-sampling can
        be reproduced; None keeps the previous unseeded behaviour.

    Returns
    -------
    DataFrame with the kept negative and positive rows, sorted by index.
    Rows whose values sum to zero across all columns are dropped first.
    """
    # NOTE(review): the row sum spans every column of `raw`, target included.
    # If the caller passes coordinate columns, all-zero rows will be rare;
    # confirm that this is the intended "no data at all" filter.
    non_zero = raw[raw.sum(axis=1) != 0].copy()
    if clf:
        non_zero[target] = non_zero[target] > 0
    positive = non_zero[non_zero[target] > 0]
    negative = non_zero[non_zero[target] == 0]
    # Down-sample only when there is a surplus of negatives: asking `sample`
    # for more rows than exist raises ValueError.
    if sample and len(negative) > len(positive):
        negative = negative.sample(len(positive), random_state=random_state)
    return pd.concat([negative, positive]).sort_index()

In [45]:
# Balanced training frame; the coordinate columns are removed afterwards so
# they are not used as features.
dataset = balanced_dtrain(clusters).drop(['lat', 'lng'], axis=1)
# Target first, then every remaining column as the feature matrix.
y = dataset['permit']
X = dataset.loc[:, dataset.columns != 'permit']

# MLTK is my own toolkit built on top of sklearn; it is still under development.
SkPipeline is a machine-learning pipeline that:
* Scales the data
* Runs a cross-validated parameter search
* Then fits the normal sklearn model

In [46]:
from MLTK import SkPipeline

In [47]:
# Parameter search (method="Grid", cv=5) over n_estimators x max_depth for a
# gradient-boosting classifier, run through the MLTK SkPipeline wrapper.
# NOTE(review): the score on the next line uses the same X, y that were passed
# to fit, so it is presumably an in-sample figure rather than a held-out
# estimate — confirm against SkPipeline.
clf = SkPipeline(GradientBoostingClassifier)(method="Grid",cv=5,n_estimators=[100,300,700,1500],max_depth=[1,2,3,4]).fit(X,y)
clf.score(X,y)

Best: GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=1, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


0.76306248667093202

In [43]:
# Same parameter search as the accuracy run, but with scoring="roc_auc"
# passed to the MLTK SkPipeline wrapper.
# NOTE(review): the score on the next line uses the same X, y that were passed
# to fit, so it is presumably an in-sample figure — confirm against SkPipeline
# (including which metric `score` reports).
clf = SkPipeline(GradientBoostingClassifier)(scoring="roc_auc",method="Grid",cv=5,n_estimators=[100,300,700,1500],max_depth=[1,2,3,4]).fit(X,y)
clf.score(X,y)

Best: GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


0.77010023459159738

In [36]:
# 1000 boosting rounds of depth-1 trees: mean of the 5-fold scores
# (default scoring).
clf = GradientBoostingClassifier(
    max_depth=1,
    n_estimators=1000,
)
np.mean(cross_val_score(clf, X, y, cv=5))

0.75495616140975252

In [37]:
# 100 boosting rounds of depth-1 trees: mean of the 5-fold scores
# (default scoring).
clf = GradientBoostingClassifier(
    max_depth=1,
    n_estimators=100,
)
np.mean(cross_val_score(clf, X, y, cv=5))

0.76081992841100188

In [38]:
# 100 boosting rounds of depth-2 trees: mean of the 5-fold scores
# (default scoring).
clf = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=100,
)
np.mean(cross_val_score(clf, X, y, cv=5))

0.75612921063230887

In [39]:
# 1000 boosting rounds of depth-1 trees: mean ROC AUC over 5 folds.
clf = GradientBoostingClassifier(
    max_depth=1,
    n_estimators=1000,
)
np.mean(cross_val_score(clf, X, y, cv=5, scoring="roc_auc"))

0.7784041026151689

In [40]:
# 100 boosting rounds of depth-1 trees: mean ROC AUC over 5 folds.
clf = GradientBoostingClassifier(
    max_depth=1,
    n_estimators=100,
)
np.mean(cross_val_score(clf, X, y, cv=5, scoring="roc_auc"))

0.78086874947580065

In [41]:
# 500 boosting rounds of depth-3 trees: mean of the 5-fold scores
# (default scoring).
clf = GradientBoostingClassifier(
    max_depth=3,
    n_estimators=500,
)
np.mean(cross_val_score(clf, X, y, cv=5))

0.74823712660967157

# What about unbalanced data?

In [42]:
# 500 boosting rounds of depth-3 trees: mean ROC AUC over 5 folds.
clf = GradientBoostingClassifier(
    max_depth=3,
    n_estimators=500,
)
np.mean(cross_val_score(clf, X, y, cv=5, scoring="roc_auc"))

0.78780872082164344

In [48]:
# Same preparation without down-sampling the negatives; the coordinate
# columns are removed afterwards so they are not used as features.
dataset = balanced_dtrain(clusters, sample=False).drop(['lat', 'lng'], axis=1)
# Target first, then every remaining column as the feature matrix.
y = dataset['permit']
X = dataset.loc[:, dataset.columns != 'permit']

In [49]:
# Parameter search (method="Grid", cv=5) over n_estimators x max_depth on the
# data set built with sample=False, run through the MLTK SkPipeline wrapper.
# NOTE(review): the score on the next line uses the same X, y that were passed
# to fit, so it is presumably an in-sample figure; with unbalanced classes a
# high default score may largely reflect the majority class — confirm.
clf = SkPipeline(GradientBoostingClassifier)(method="Grid",cv=5,n_estimators=[100,300,700,1500],max_depth=[1,2,3,4]).fit(X,y)
clf.score(X,y)

Best: GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=700,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


0.96603879207571819

In [50]:
# Same parameter search on the data set built with sample=False, but with
# scoring="roc_auc" passed to the MLTK SkPipeline wrapper.
# NOTE(review): the score on the next line uses the same X, y that were passed
# to fit, so it is presumably an in-sample figure — confirm against SkPipeline
# (including which metric `score` reports).
clf = SkPipeline(GradientBoostingClassifier)(scoring="roc_auc",method="Grid",cv=5,n_estimators=[100,300,700,1500],max_depth=[1,2,3,4]).fit(X,y)
clf.score(X,y)

Best: GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


0.96400692493494911

# Reset cluster

In [104]:
# Reload the four pre-cleaned event tables from their pickles, discarding the
# columns added to them earlier in the notebook.
clean_permit, clean_violation, clean_crime, clean_311 = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'clean/permit.pickle',
        'clean/violation.pickle',
        'clean/crime.pickle',
        'clean/311.pickle',
    )
)
# Start again from a building table that holds only the coordinate columns.
clusters = pd.read_csv('clean/buildings.csv')[['lat', 'lng']]

In [105]:
n_neighbors = 3
# Index the building coordinates. Only the coordinate columns are used, so this
# cell can be re-run after `clusters` has gained feature columns.
nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(clusters[['lat', 'lng']])


def find_nearest(df, alpha=100):
    """Tag every row of `df` with its `n_neighbors` nearest buildings.

    Parameters
    ----------
    df : DataFrame with `lat` and `lng` columns.
    alpha : decay rate of the proximity weight `exp(-alpha * distance)`.

    Returns
    -------
    A copy of `df` with, for each neighbour rank k (0 = closest):
      * `neighbor_k` — row position of the k-th nearest building in the frame
        `nbrs` was fitted on;
      * `distance_k` — despite the name, the weight `exp(-alpha * distance)`,
        which is 1 at zero distance and decreases as the distance grows.
    The input frame is left untouched.
    """
    distances, indices = nbrs.kneighbors(df[['lat', 'lng']])
    tagged = df.copy()
    # Transposing gives one row per neighbour rank instead of one per event.
    for k, (ind, dist) in enumerate(zip(indices.T, distances.T)):
        tagged['neighbor_%s' % k] = ind
        tagged['distance_%s' % k] = np.exp(-alpha * dist)
    return tagged


clean_permit, clean_violation, clean_crime, clean_311 = map(
    find_nearest, [clean_permit, clean_violation, clean_crime, clean_311])

In [106]:
# Number of permits (non-null `addr` entries) whose closest building is this one.
clusters['permit'] = clean_permit.groupby('neighbor_0')['addr'].count()
# Buildings with no permit come out as NaN after alignment; turn the count
# into a boolean "has at least one permit" flag.
clusters['permit'] = clusters['permit'].fillna(0).gt(0)

In [101]:
# Mean of JudgeAmt * proximity weight per building, for each neighbour rank k.
# The column name must be formatted before indexing: 'distance_%s' % k.
# (Writing g['distance_%s'] % k looks up a literal 'distance_%s' column.)
# NOTE(review): this is a plain mean of the weighted amounts; it is not
# divided by the sum of the weights — confirm that is what is wanted.
weighted_judge = {}
for k in range(n_neighbors):
    weighted_amount = clean_violation['JudgeAmt'] * clean_violation['distance_%s' % k]
    weighted_judge[k] = weighted_amount.groupby(clean_violation['neighbor_%s' % k]).mean()

# Show the result for the closest building (rank 0).
weighted_judge[0]

neighbor_0
0         356.307010
1         341.885959
10        297.001513
12        302.624096
20        398.183833
22        319.443287
27        286.704106
35       1113.768688
42         82.426856
44        299.307232
50       1672.446580
58        860.668878
66        274.839168
67        370.424763
68       1108.003393
78         81.954877
114       275.852533
115       480.542635
126       281.461031
132      1029.359071
135       286.696306
140       243.001911
161       325.409988
172       300.986636
176       162.876012
178        82.173755
183       325.147485
204       246.132893
205      1403.977421
222       565.636964
            ...     
96433     140.000000
96434     140.000000
96435     140.000000
96436     140.000000
96437     140.000000
96438     140.000000
96439     140.000000
96440     140.000000
96441     140.000000
96442     140.000000
96443     140.000000
96444     140.000000
96445     140.000000
96446     140.000000
96447     140.000000
96448     140.000000
96

In [96]:
# After the reload, the event tables carry `neighbor_0`..`neighbor_2` instead
# of the single `neighbor` column; `neighbor_0` is the closest building.
# Per-building mean judgment amount and number of non-null judgment amounts.
clusters['judge'] = clean_violation.groupby('neighbor_0')['JudgeAmt'].mean()
clusters['violation'] = clean_violation.groupby('neighbor_0')['JudgeAmt'].count()

count    6859.000000
mean        0.993807
std         0.033992
min         0.505598
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: distance_0, dtype: float64

In [97]:
# Unweighted mean JudgeAmt per closest building (neighbor_0); presumably shown
# for comparison with the distance-weighted values.
clean_violation.groupby('neighbor_0').apply(lambda g: g['JudgeAmt'].mean())

neighbor_0
0         360.000000
1         346.250000
10        305.000000
12        305.000000
20        415.000000
22        332.500000
27        305.000000
35       1130.000000
42         85.000000
44        305.000000
50       1698.333333
58        900.833333
66        305.000000
67        387.500000
68       1130.000000
78         85.000000
114       305.000000
115       497.500000
126       305.000000
132      1130.000000
135       305.000000
140       266.500000
161       332.500000
172       305.000000
176       167.500000
178        85.000000
183       332.500000
204       250.000000
205      1432.500000
222       580.000000
            ...     
96433     140.000000
96434     140.000000
96435     140.000000
96436     140.000000
96437     140.000000
96438     140.000000
96439     140.000000
96440     140.000000
96441     140.000000
96442     140.000000
96443     140.000000
96444     140.000000
96445     140.000000
96446     140.000000
96447     140.000000
96448     140.000000
96