# Step 2: Building Labels

In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [127]:
buildings = pd.read_csv('../data/buildings.csv')
permits = pd.read_csv('../data/permits.csv')

In [128]:
permits.shape

(6315, 57)

In [129]:
buildings.shape

(157193, 9)

In [130]:
permits.head(1)

Unnamed: 0,PERMIT_NO,PERMIT_APPLIED,PERMIT_ISSUED,PERMIT_EXPIRES,SITE_ADDRESS,BETWEEN1,PARCEL_NO,LOT_NUMBER,SUBDIVISION,CASE_TYPE,...,CONTRACTOR_CITY,CONTRACTOR_STATE,CONTRACTOR_ZIP,CONDITION_FOR_APPROVAL,owner_location,contractor_location,geom,addr,lon,lat
0,BLD2015-03955,8/28/15,8/28/15,,4331 BARHAM,BETWEEN VOIGHT AND WAVENEY,2165525-6,S15,,BLD,...,DETROIT,,48227.0,,"65 CADILLAC\nDETROIT, MI\n(42.331741326000042,...","13500 FOLEY\nDETROIT, MI\n(42.379332, -83.177501)",,4331 barham,-82.9474,42.394106


In [131]:
permits = permits[['addr','lon','lat','BLD_PERMIT_TYPE']]  #select only relevant columns

In [132]:
permits['BLD_PERMIT_TYPE'].unique()

array(['Dismantle', 'DISM'], dtype=object)

Any buildings showing up in permits are blighted

In [166]:
buildings['blighted'] = False
permits['by_addr'] = False

In [167]:
permits.head(1)    

Unnamed: 0,addr,lon,lat,BLD_PERMIT_TYPE,by_addr
0,4331 barham,-82.9474,42.394106,Dismantle,False


### By address

In [168]:
buildings['blighted'] = buildings['addr'].apply(lambda x: x in permits['addr'].values.tolist())

In [169]:
buildings['blighted'].value_counts()

False    154637
True       2556
Name: blighted, dtype: int64

In [170]:
blighted_by_addr = buildings[buildings['blighted']==True]
permits['by_addr'] = permits['addr'].apply(lambda x: x in blighted_by_addr['addr'].values.tolist())

In [171]:
permits['by_addr'].value_counts()

False    3409
True     2906
Name: by_addr, dtype: int64

The above numbers reflect that permits sometimes are assigned to the same address (and same coordinates):

In [172]:
permits[permits['by_addr']==True].sort_values(by='addr').head(10)

Unnamed: 0,addr,lon,lat,BLD_PERMIT_TYPE,by_addr
5056,1000 14th,-83.071221,42.322086,DISM,True
5055,1000 14th,-83.071221,42.322086,DISM,True
1964,1000 baldwin,-82.999423,42.352188,DISM,True
6023,1000 scotten,-83.090194,42.315778,DISM,True
6021,1000 scotten,-83.090194,42.315778,DISM,True
6022,1000 scotten,-83.090194,42.315778,DISM,True
5777,10002 nottingham,-82.953115,42.41216,DISM,True
3395,10015 asbury park,-83.207127,42.369328,DISM,True
4655,10016 beaconsfield,-82.954174,42.41233,DISM,True
4654,10016 beaconsfield,-82.954174,42.41233,DISM,True


### By Coordiates:

In [187]:
permits_coord = permits[permits['by_addr'] == False]

In [188]:
epsilon = 1.0e-6
cnt = 0
for i,permit in permits_coord.iterrows():
    lon = permit['lon']
    lat = permit['lat']
    #indices = (buildings['llcrnrlon'] < lon) & (buildings['urcrnrlon'] > lon) \
    #        & (buildings['llcrnrlat'] < lat) & (buildings['urcrnrlat'] > lat)
    indices = (abs(buildings['lon']-lon) < epsilon) & (abs(buildings['lat'] - lat) < epsilon)
    if cnt %100 == 0:
        print(cnt, buildings[indices].index.tolist())
    buildings.loc[indices,'blighted'] = True
    cnt += 1
    

0 []
100 []
200 []
300 []
400 []
500 []
600 []
700 [28603, 28619, 28677, 28906, 28960, 32945, 33627, 33808, 40739, 40806, 40863, 40929, 40950, 40990, 41048, 41851, 41935, 42036, 42135, 42201, 42216, 42295, 42313, 42514, 42552, 42595, 42798, 42926, 42943, 42970, 43064, 43071, 44839, 44853, 44972, 44986, 45033, 45043, 45098, 45121, 45183, 45237, 45257, 45262, 45312, 45450, 45496, 45534, 45575, 45588, 45641, 45751, 45780, 45783, 45793, 45805, 45825, 45846, 45849, 48872, 48960, 49055, 49124, 49193, 49244, 49267, 49284, 49302, 49354, 49489, 49672, 49917, 49938, 50001, 50014, 50070, 50197, 50241, 50302, 50337, 50339, 50359, 50555, 50563, 50627, 50667, 50734, 50788, 50798, 51509, 51613, 51620, 51779, 51876, 52020, 52028, 52110, 52118, 52124, 52141, 52164, 52178, 52237, 52248, 52263, 52281, 52288, 52300, 52338, 52490, 52520, 52546, 52561, 52642, 52683, 52811, 52822, 52870, 52877, 52918, 52930, 52973, 53063, 53100, 53104, 53256, 53365, 53375, 53384, 53476, 53541, 53612, 53624, 54183, 54265, 543

In [175]:
buildings['blighted'].value_counts()

False    144468
True      12725
Name: blighted, dtype: int64

In [176]:
blighted_b = buildings[buildings['blighted'] == True]

In [177]:
blighted_b.sort_values(by='addr')

Unnamed: 0,addr,event_id_list,lat,lon,llcrnrlon,llcrnrlat,urcrnrlon,urcrnrlat,building_id,blighted
3,0 10th,"[371125, 409945, 370505, 184559, 370506, 37576...",42.369786,-83.216326,-83.216532,42.369684,-83.216121,42.369888,3,True
5,0 11th,"[306426, 378616, 373086]",42.411997,-83.167339,-83.167544,42.411895,-83.167133,42.412099,5,True
6,0 15th,[307472],42.441234,-83.219551,-83.219756,42.441132,-83.219345,42.441336,6,True
7,0 18th,[376876],42.341516,-83.087758,-83.087964,42.341414,-83.087553,42.341618,7,True
8,0 21st,[254794],42.323726,-83.081573,-83.081778,42.323624,-83.081367,42.323828,8,True
9,0 22nd,[382405],42.330432,-83.088302,-83.088508,42.330330,-83.088097,42.330534,9,True
11,0 31st st,[440256],42.337560,-83.112723,-83.112928,42.337458,-83.112517,42.337662,11,True
12,0 32nd,"[241853, 436900, 436898]",42.337607,-83.114110,-83.114316,42.337505,-83.113905,42.337709,12,True
13,0 33rd st,[434194],42.337575,-83.115226,-83.115432,42.337473,-83.115021,42.337677,13,True
14,0 35th,"[422701, 387089, 404279]",42.335981,-83.116326,-83.116531,42.335879,-83.116120,42.336083,14,True


In [185]:
buildings.loc[[149292,149498],:]

Unnamed: 0,addr,event_id_list,lat,lon,llcrnrlon,llcrnrlat,urcrnrlon,urcrnrlat,building_id,blighted
149292,8903 eight,"[233671, 181072, 399646, 384895]",42.345279,-83.216524,-83.21673,42.345177,-83.216319,42.345381,149292,True
149498,8931 eight,[219048],42.345279,-83.216524,-83.21673,42.345177,-83.216319,42.345381,149498,True
