In [195]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler as Scaler

# Get the cleaned data

The cleaned data were produced in several steps. The cleaning aimed to:
* Remove "strange" records, such as those carrying the default value for the city of Detroit. Some of these were cleaned using their address.
* Remove points that seem to be far from Detroit.
* Normalize `lat` and `lng` with min-max scaling.
* Clean the addresses in the building list.

In [224]:
# Load the previously cleaned tables (permits, violations, crimes, 311 calls).
# NOTE: unpickling can execute arbitrary code; only read pickle files you produced yourself.
clean_permit = pd.read_pickle('clean/permit.pickle')
clean_violation = pd.read_pickle('clean/violation.pickle') 
clean_crime = pd.read_pickle('clean/crime.pickle')
clean_311 = pd.read_pickle('clean/311.pickle')
# Building list: only the coordinate columns are kept; features are added to this frame below.
clusters = pd.read_csv('clean/buildings.csv')[['lat','lng']]

In [225]:
# Keep only the records dated 2005 or later in each table.
clean_permit, clean_violation, clean_crime, clean_311 = [
    frame[frame.date.map(lambda stamp: stamp.year) >= 2005]
    for frame in (clean_permit, clean_violation, clean_crime, clean_311)
]

In [226]:
# Index the building coordinates so each event row can be matched to its closest building.
nbrs = NearestNeighbors(n_neighbors=1).fit(clusters)


def find_nearest(df):
    """Return a copy of ``df`` with a ``neighbor`` column added.

    ``neighbor`` holds, for every row, the index returned by ``nbrs.kneighbors``
    for the closest point of the frame ``nbrs`` was fitted on.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lat`` and ``lng`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.
    """
    _, indices = nbrs.kneighbors(df[['lat','lng']])
    # The frames handed to this function are boolean-mask selections of the
    # loaded tables; writing a column straight into them raises
    # SettingWithCopyWarning and mutates the caller's object, so copy first.
    df = df.copy()
    # kneighbors returns a 2-D (rows, n_neighbors) array; with a single
    # neighbour take its only column so a plain 1-D vector is stored.
    df['neighbor'] = indices[:, 0]
    return df


clean_permit, clean_violation, clean_crime, clean_311 = map(find_nearest,[clean_permit, clean_violation, clean_crime, clean_311])

In [227]:
# Per-building aggregates of the violation table: mean of JudgeAmt and the
# number of non-null JudgeAmt entries. Assignment aligns on the building index.
judge_by_building = clean_violation.groupby('neighbor')['JudgeAmt']
clusters['judge'] = judge_by_building.mean()
clusters['violation'] = judge_by_building.count()

In [228]:
# Number of permit rows (non-null `addr`) matched to each building;
# buildings with no matching permit get 0.
permit_counts = clean_permit.groupby('neighbor')['addr'].count()
clusters['permit'] = permit_counts.reindex(clusters.index).fillna(0)

In [229]:
# One column per crime category, holding the number of crimes of that category
# matched to each building. Re-running this cell appends the columns again.
crime_dummies = pd.get_dummies(clean_crime['Category'], prefix='crime')
crime_counts = crime_dummies.groupby(clean_crime['neighbor']).sum()
clusters = pd.concat([clusters, crime_counts], axis=1).fillna(0)

In [230]:
# One column per 311 category, holding the number of calls of that category
# matched to each building. Re-running this cell appends the columns again.
call_dummies = pd.get_dummies(clean_311['Category'], prefix='311')
call_counts = call_dummies.groupby(clean_311['neighbor']).sum()
clusters = pd.concat([clusters, call_counts], axis=1).fillna(0)

In [231]:
# clusters['crime'] = clean_crime.groupby('neighbor').apply(lambda g: g['Category'].count())
# clusters['crime'] = clusters['crime'].fillna(0)

In [232]:
# Preview the first three rows of the assembled per-building feature table.
clusters.head(3)

Unnamed: 0,lat,lng,judge,violation,permit,crime_0,crime_1,crime_2,crime_3,crime_4,...,311_13,311_14,311_15,311_16,311_17,311_18,311_19,311_20,311_21,311_22
0,0.714869,0.903169,360.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.720103,0.750076,346.25,8.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.761821,0.905622,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
def balanced_dtrain(raw, clf=True, sample=True, target='permit', random_state=None):
    """Build a training frame, optionally down-sampling negatives to balance classes.

    Parameters
    ----------
    raw : pandas.DataFrame
        Numeric table that contains the ``target`` column.
    clf : bool, default True
        If True, ``target`` is replaced by the boolean ``target > 0``.
    sample : bool, default True
        If True, rows with ``target == 0`` are randomly down-sampled (without
        replacement) to the number of rows with ``target > 0``. When there are
        fewer negatives than positives, all negatives are kept.
    target : str, default 'permit'
        Name of the label column.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``DataFrame.sample``; pass a value to make the
        down-sampling reproducible.

    Returns
    -------
    pandas.DataFrame
        Negative and positive rows concatenated and sorted by index.
    """
    # Drop rows whose values sum to exactly zero across all columns.
    # NOTE(review): the sum runs over every column of ``raw`` (including the
    # target and any coordinate columns) -- confirm that is the intended filter.
    non_zero = raw[raw.sum(axis=1) != 0].copy()
    if clf:
        non_zero[target] = non_zero[target] > 0
    positive = non_zero[non_zero[target] > 0]
    negative = non_zero[non_zero[target] == 0]
    if sample:
        # Sampling more rows than exist (without replacement) raises
        # ValueError, so never ask for more negatives than are available.
        n_keep = min(len(positive), len(negative))
        negative = negative.sample(n_keep, random_state=random_state)
    return pd.concat([negative, positive]).sort_index()

In [45]:
# Balanced training set: negatives are down-sampled inside balanced_dtrain
# (no seed is passed, so the sampled rows differ between runs).
# The coordinate columns are dropped so they are not used as features.
dataset = balanced_dtrain(clusters).drop(['lat','lng'],axis=1)
# Feature matrix: every remaining column except the label.
X = dataset.drop('permit',axis=1)
# Label vector.
y = dataset['permit']

# MLTK is my own toolkit built on sklearn (still under development)
SkPipeline is its machine-learning pipeline, which includes:
* Scaling the data
* Cross-validated hyper-parameter search
* Fitting the underlying sklearn model

In [46]:
from MLTK import SkPipeline

In [47]:
# Hyper-parameter search over n_estimators x max_depth through the MLTK wrapper.
# NOTE(review): SkPipeline is defined outside this notebook; `method="Grid"` and `cv=5`
# are presumed to request a 5-fold grid search -- confirm against MLTK.
clf = SkPipeline(GradientBoostingClassifier)(method="Grid",cv=5,n_estimators=[100,300,700,1500],max_depth=[1,2,3,4]).fit(X,y)
# Scored on the same X, y that were passed to fit(), so this is not a held-out estimate.
clf.score(X,y)

Best: GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=1, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


0.76306248667093202

In [43]:
# Same search as the previous cell but with scoring="roc_auc" passed to the wrapper.
# NOTE(review): this cell's execution count (43) precedes the cell that builds X, y (45),
# so the output shown may come from an earlier X, y -- re-run in order to confirm.
clf = SkPipeline(GradientBoostingClassifier)(scoring="roc_auc",method="Grid",cv=5,n_estimators=[100,300,700,1500],max_depth=[1,2,3,4]).fit(X,y)
# Scored on the same X, y that were passed to fit(), so this is not a held-out estimate.
clf.score(X,y)

Best: GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


0.77010023459159738

In [36]:
# 5-fold cross-validation with the estimator's default scoring:
# 1000 boosting stages, depth-1 trees. The mean over the folds is displayed.
clf = GradientBoostingClassifier(n_estimators=1000,max_depth=1)
cross_val_score(clf, X, y, cv=5).mean()

0.75495616140975252

In [37]:
# 5-fold cross-validation with the estimator's default scoring:
# 100 boosting stages, depth-1 trees. The mean over the folds is displayed.
clf = GradientBoostingClassifier(n_estimators=100,max_depth=1)
cross_val_score(clf, X, y, cv=5).mean()

0.76081992841100188

In [38]:
# 5-fold cross-validation with the estimator's default scoring:
# 100 boosting stages, depth-2 trees. The mean over the folds is displayed.
clf = GradientBoostingClassifier(n_estimators=100,max_depth=2)
cross_val_score(clf, X, y, cv=5).mean()

0.75612921063230887

In [39]:
# 5-fold cross-validated ROC AUC: 1000 boosting stages, depth-1 trees.
clf = GradientBoostingClassifier(n_estimators=1000,max_depth=1)
cross_val_score(clf, X, y, cv=5,scoring="roc_auc").mean()

0.7784041026151689

In [40]:
# 5-fold cross-validated ROC AUC: 100 boosting stages, depth-1 trees.
clf = GradientBoostingClassifier(n_estimators=100,max_depth=1)
cross_val_score(clf, X, y, cv=5,scoring="roc_auc").mean()

0.78086874947580065

In [41]:
# 5-fold cross-validation with the estimator's default scoring:
# 500 boosting stages, depth-3 trees. The mean over the folds is displayed.
clf = GradientBoostingClassifier(n_estimators=500,max_depth=3)
cross_val_score(clf, X, y, cv=5).mean()

0.74823712660967157

# What about unbalanced data?

In [42]:
# 5-fold cross-validated ROC AUC: 500 boosting stages, depth-3 trees.
# X, y are not rebuilt in this cell; they are whatever an earlier cell left in the kernel.
clf = GradientBoostingClassifier(n_estimators=500,max_depth=3)
cross_val_score(clf, X, y, cv=5,scoring="roc_auc").mean()

0.78780872082164344

In [48]:
# sample=False skips the negative down-sampling, so the class proportions
# of `clusters` are kept as they are (unbalanced data).
dataset = balanced_dtrain(clusters,sample=False).drop(['lat','lng'],axis=1)
# Feature matrix: every remaining column except the label.
X = dataset.drop('permit',axis=1)
# Label vector.
y = dataset['permit']

In [49]:
# Hyper-parameter search over n_estimators x max_depth on the unbalanced X, y.
# NOTE(review): SkPipeline is defined outside this notebook; its arguments are
# presumed to request a 5-fold grid search -- confirm against MLTK.
clf = SkPipeline(GradientBoostingClassifier)(method="Grid",cv=5,n_estimators=[100,300,700,1500],max_depth=[1,2,3,4]).fit(X,y)
# Scored on the same X, y that were passed to fit(); with unbalanced classes a
# high value here does not by itself show the model separates the classes.
clf.score(X,y)

Best: GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=700,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


0.96603879207571819

In [50]:
# Same search on the unbalanced X, y with scoring="roc_auc" passed to the wrapper.
clf = SkPipeline(GradientBoostingClassifier)(scoring="roc_auc",method="Grid",cv=5,n_estimators=[100,300,700,1500],max_depth=[1,2,3,4]).fit(X,y)
# Scored on the same X, y that were passed to fit(), so this is not a held-out estimate.
clf.score(X,y)

Best: GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


0.96400692493494911

# Reset cluster

In [212]:
# Reload every table from disk so the columns added by the earlier cells are discarded.
# No year filter is applied in this section, so all dates are used from here on.
# NOTE: unpickling can execute arbitrary code; only read pickle files you produced yourself.
clean_permit = pd.read_pickle('clean/permit.pickle')
clean_violation = pd.read_pickle('clean/violation.pickle') 
clean_crime = pd.read_pickle('clean/crime.pickle')
clean_311 = pd.read_pickle('clean/311.pickle')
# Start again from the bare building coordinates.
clusters = pd.read_csv('clean/buildings.csv')[['lat','lng']]

In [213]:
# Number of nearest buildings every event row is attached to.
n_neighbors = 1
# Re-fit the neighbour index (replaces the earlier `nbrs`).
nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(clusters)


def find_nearest(df,alpha=0):
    """Annotate ``df`` in place with its nearest buildings and their weights.

    For each neighbour rank ``k`` two columns are written: ``neighbor_k``
    (the index returned by ``nbrs.kneighbors``) and ``distance_k``, which,
    despite its name, stores the weight ``exp(-alpha * distance)``. With the
    default ``alpha=0`` every weight equals 1.

    This definition replaces the ``find_nearest`` declared earlier in the
    notebook. The same (mutated) ``df`` is returned.
    """
    distances, indices = nbrs.kneighbors(df[['lat','lng']])
    # Transposing yields one vector per neighbour rank.
    for k, (neighbor_ids, neighbor_dist) in enumerate(zip(indices.T, distances.T)):
        df['neighbor_%s'%k] = neighbor_ids
        df['distance_%s'%k] = np.exp(-alpha*neighbor_dist)
    return df


clean_permit, clean_violation, clean_crime, clean_311 = [
    find_nearest(frame)
    for frame in (clean_permit, clean_violation, clean_crime, clean_311)
]

In [214]:
# Boolean target: True when at least one permit row (non-null `addr`)
# has this building as its nearest neighbour.
permit_counts = clean_permit.groupby('neighbor_0')['addr'].count()
clusters['permit'] = permit_counts.reindex(clusters.index).fillna(0) > 0

In [215]:
# Mean weighted JudgeAmt per building, averaged over the n_neighbors ranks.
clusters['judge'] = 0
for k in range(n_neighbors):
    weighted_amount = clean_violation['JudgeAmt'] * clean_violation['distance_%s' % k]
    per_building = weighted_amount.groupby(clean_violation['neighbor_%s' % k]).mean()
    # `.ix` was removed from pandas; reindex aligns the aggregate to every
    # building and yields NaN where a building has no violation, which
    # fillna turns into 0.
    clusters['judge'] += per_building.reindex(clusters.index).fillna(0)
clusters['judge'] /= n_neighbors

In [216]:
# Sum of the violation weights per building, averaged over the n_neighbors ranks.
clusters['violation'] = 0
for k in range(n_neighbors):
    weight_sum = clean_violation['distance_%s' % k].groupby(clean_violation['neighbor_%s' % k]).sum()
    # `.ix` was removed from pandas; reindex aligns the aggregate to every
    # building, and buildings without violations (NaN) become 0.
    clusters['violation'] += weight_sum.reindex(clusters.index).fillna(0)
clusters['violation'] /= n_neighbors

# Use a loop to avoid a memory error

In [217]:
# Weighted count of crimes per category and building, built one category
# column at a time. Re-running this cell appends the crime_* columns again.
crime = pd.get_dummies(clean_crime['Category'],prefix='crime')
cat = pd.DataFrame(0,index=clusters.index,columns=crime.columns)
for col in crime.columns:
    for k in range(n_neighbors):
        weighted = crime[col] * clean_crime['distance_%s' % k]
        per_building = weighted.groupby(clean_crime['neighbor_%s' % k]).sum()
        # `.ix` was removed from pandas; reindex aligns to every building and
        # the NaN produced for buildings without crimes is replaced by 0.
        cat[col] += per_building.reindex(clusters.index).fillna(0)
    cat[col] /= n_neighbors
clusters = pd.concat([clusters,cat],axis=1)

In [218]:
# Weighted count of 311 calls per category and building, built one category
# column at a time. Re-running this cell appends the calls_* columns again.
calls = pd.get_dummies(clean_311['Category'],prefix='calls')
cat = pd.DataFrame(0,index=clusters.index,columns=calls.columns)
for col in calls.columns:
    for k in range(n_neighbors):
        weighted = calls[col] * clean_311['distance_%s' % k]
        per_building = weighted.groupby(clean_311['neighbor_%s' % k]).sum()
        # `.ix` was removed from pandas; reindex aligns to every building and
        # the NaN produced for buildings without calls is replaced by 0.
        cat[col] += per_building.reindex(clusters.index).fillna(0)
    cat[col] /= n_neighbors
clusters = pd.concat([clusters,cat],axis=1)

In [219]:
# Show only the first rows instead of dumping the whole feature table.
clusters.head(10)

Unnamed: 0,lat,lng,permit,judge,violation,crime_0,crime_1,crime_2,crime_3,crime_4,...,calls_13,calls_14,calls_15,calls_16,calls_17,calls_18,calls_19,calls_20,calls_21,calls_22
0,0.714869,0.903169,True,360.00,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.720103,0.750076,True,346.25,8.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.761821,0.905622,True,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.985508,0.903681,True,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.111348,0.371944,True,0.00,0.0,7.0,2.0,2.0,0.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.747155,0.904098,True,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.757334,0.900653,True,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.758097,0.900395,True,0.00,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.756953,0.900783,True,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.575037,0.935626,True,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
# drop = 'crime'
# clusters = clusters.drop(filter(lambda x: x.startswith(drop),clusters.columns),axis=1)

# filter(lambda x: x.startswith(drop),clusters.columns)

In [233]:
# Balanced training set built from `clusters` (negatives down-sampled without a seed).
# NOTE(review): the execution counts show this cell (233) ran right after cells 224-232,
# not after the "Reset cluster" cells above (212-220), so `clusters` here may not be
# the frame built directly above -- re-run the notebook top to bottom to confirm.
dataset = balanced_dtrain(clusters).drop(['lat','lng'],axis=1)
# Feature matrix: every remaining column except the label.
X = dataset.drop('permit',axis=1)
# Label vector.
y = dataset['permit']

In [234]:
class ScaledGBM(GradientBoostingClassifier):
    """GradientBoostingClassifier that standardises its inputs with ``Scaler``.

    The scaler is fitted on the training matrix inside ``fit`` and applied
    again to the inputs of ``predict``.

    NOTE(review): only ``fit`` and ``predict`` are overridden. Other inherited
    prediction methods called directly (e.g. ``predict_proba``) receive ``X``
    unscaled, so do not evaluate this class with probability-based scoring
    without checking that path first.
    """

    def fit(self,X,y,*args,**kwargs):
        """Fit the scaler on ``X``, then fit the boosted model on the scaled data.

        Extra positional and keyword arguments are forwarded unchanged to the
        parent ``fit`` so that options it accepts (such as ``sample_weight``)
        are not lost by this override. Returns ``self``.
        """
        self.__scaler = Scaler().fit(X)
        X = self.__scaler.transform(X)
        super(ScaledGBM,self).fit(X,y,*args,**kwargs)
        return self

    def predict(self,X):
        """Scale ``X`` with the scaler learned in ``fit`` and return the parent's predictions."""
        X = self.__scaler.transform(X)
        return super(ScaledGBM,self).predict(X)

In [235]:
# 5-fold cross-validation of the scaled variant with the estimator's default scoring:
# 100 boosting stages, depth-1 trees. The mean over the folds is displayed.
clf = ScaledGBM(n_estimators=100,max_depth=1)
cross_val_score(clf, X, y, cv=5).mean()

0.75847621929990239

0.34020480064320469