# Step 2: Building Labels

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
# Load the building table and the permit table from the shared data directory
# (paths are relative to the notebook's location).
buildings = pd.read_csv('../data/buildings.csv')
permits = pd.read_csv('../data/permits.csv')

In [22]:
# (rows, columns) of the raw permits table.
permits.shape

(6315, 57)

In [23]:
# (rows, columns) of the buildings table.
buildings.shape

(200055, 9)

In [24]:
# Look at a single raw permit record to see which columns are available.
permits.head(1)

Unnamed: 0,PERMIT_NO,PERMIT_APPLIED,PERMIT_ISSUED,PERMIT_EXPIRES,SITE_ADDRESS,BETWEEN1,PARCEL_NO,LOT_NUMBER,SUBDIVISION,CASE_TYPE,...,CONTRACTOR_CITY,CONTRACTOR_STATE,CONTRACTOR_ZIP,CONDITION_FOR_APPROVAL,owner_location,contractor_location,geom,addr,lon,lat
0,BLD2015-03955,8/28/15,8/28/15,,4331 BARHAM,BETWEEN VOIGHT AND WAVENEY,2165525-6,S15,,BLD,...,DETROIT,,48227.0,,"65 CADILLAC\nDETROIT, MI\n(42.331741326000042,...","13500 FOLEY\nDETROIT, MI\n(42.379332, -83.177501)",,4331 barham,-82.9474,42.394106


In [25]:
# Keep only the columns needed for labelling: address, coordinates and the
# permit type.  The explicit .copy() makes `permits` an independent frame, so
# the flag columns added to it later are ordinary column assignments rather
# than writes to a selection of the raw table (the situation pandas reports
# with SettingWithCopyWarning).
permits = permits[['addr', 'lon', 'lat', 'BLD_PERMIT_TYPE']].copy()

In [26]:
# List the distinct permit types that occur in the data.
permits['BLD_PERMIT_TYPE'].unique()

array(['Dismantle', 'DISM'], dtype=object)

Every permit in this file is a dismantle permit, so any building that shows up in `permits` is labelled as blighted.

In [27]:
# Start with every building unlabelled and every permit unmatched; the cells
# that follow switch these flags to True.
buildings['blighted'] = False  # label column: True once a permit is matched to the building
permits['by_addr'] = False   # True once the permit has been assigned to a building by address

In [28]:
# Confirm the trimmed permits table now carries the new `by_addr` flag.
permits.head(1)    

Unnamed: 0,addr,lon,lat,BLD_PERMIT_TYPE,by_addr
0,4331 barham,-82.9474,42.394106,Dismantle,False


In [95]:
# Show the permits with repeated coordinates removed.  Both column labels must
# be passed together as one list (the `subset` argument): written as two
# positional arguments, 'lat' is taken as the `keep` option and pandas raises
# 'ValueError: keep must be either "first", "last" or False'.
permits.drop_duplicates(subset=['lon', 'lat'])

### Assign By Address

In [29]:
# Label a building as blighted when its address occurs among the permit
# addresses.  Series.isin does the membership test for the whole column in one
# hashed pass, instead of rebuilding a Python list of every permit address and
# scanning it linearly once per building.  dropna() guarantees that a missing
# permit address can never be matched to a building whose address is missing.
buildings['blighted'] = buildings['addr'].isin(permits['addr'].dropna())

In [30]:
# How many buildings carry the blighted label (True) versus not (False).
buildings['blighted'].value_counts()

False    197555
True       2500
Name: blighted, dtype: int64

In [31]:
# Buildings currently labelled as blighted.
blighted_by_addr = buildings[buildings['blighted'] == True]
# Flag each permit whose address belongs to one of those buildings.
# Series.isin replaces a per-permit rebuild and linear scan of the address
# list with a single hashed membership test; dropna() keeps a missing address
# from ever counting as a match.
permits['by_addr'] = permits['addr'].isin(blighted_by_addr['addr'].dropna())

In [32]:
# How many permits were assigned to a building by address (True) versus not (False).
permits['by_addr'].value_counts()

False    3673
True     2642
Name: by_addr, dtype: int64

More permits (2,642) than buildings (2,500) are matched by address because several permits are sometimes issued for the same address (and the same coordinates):

In [33]:
# First ten address-matched permits in address order; consecutive identical
# rows are separate permits recorded for one address.
permits.loc[permits['by_addr'].eq(True)].sort_values(by='addr').head(10)

Unnamed: 0,addr,lon,lat,BLD_PERMIT_TYPE,by_addr
5055,1000 14th,-83.071221,42.322086,DISM,True
5056,1000 14th,-83.071221,42.322086,DISM,True
1964,1000 baldwin,-82.999423,42.352188,DISM,True
6021,1000 scotten,-83.090194,42.315778,DISM,True
6022,1000 scotten,-83.090194,42.315778,DISM,True
6023,1000 scotten,-83.090194,42.315778,DISM,True
4104,10000 mettetal,-83.205763,42.369181,DISM,True
5777,10002 nottingham,-82.953115,42.41216,DISM,True
3395,10015 asbury park,-83.207127,42.369328,DISM,True
4654,10016 beaconsfield,-82.954174,42.41233,DISM,True


### Assign By Coordinates

In [48]:
# Permits that were not assigned by address are handled by coordinates instead.
# The explicit .copy() gives `permits_coord` its own data, so adding the flag
# column below is a plain assignment and no longer triggers pandas'
# "A value is trying to be set on a copy of a slice" warning.
permits_coord = permits.loc[permits['by_addr'] == False, :].copy()
permits_coord['by_coord'] = False  # coordinate-match flag, counterpart of `by_addr`


In [76]:
# Tolerances (in degrees) for the fuzzy coordinate match.
# NOTE(review): these look like half the width / height of a building's
# bounding box (urcrnrlon - llcrnrlon, urcrnrlat - llcrnrlat) - confirm.
epsilon1 = 4.011e-4/2.0  # fuzzy match: maximum |lon difference|
epsilon2 = 2.04e-4/2.0   # maximum |lat difference|
cnt = 0                  # permits that matched at least one building
for i, permit in permits_coord.iterrows():
    lon = permit['lon']
    lat = permit['lat']
    # Disabled alternative: exact containment in the building's bounding box.
    #indices = (buildings['llcrnrlon'] < lon) & (buildings['urcrnrlon'] > lon) \
    #        & (buildings['llcrnrlat'] < lat) & (buildings['urcrnrlat'] > lat)
    # A building matches when its lon/lat lies inside the tolerance box
    # centred on the permit's coordinates.
    indices = ((buildings['lon'] - lon).abs() < epsilon1) \
            & ((buildings['lat'] - lat).abs() < epsilon2)
    # Resolve the matching building ids once and reuse them below, instead of
    # materialising the matched sub-frame again for every check.
    matched_ids = buildings.index[indices.values].tolist()
    if not matched_ids:
        continue  # no building close enough to this permit
    buildings.loc[indices, 'blighted'] = True
    cnt += 1
    # Print a sample (one in every ten matching permits) for manual inspection.
    if cnt % 10 == 5:
        print(matched_ids)

[111482, 111541, 111558, 111621, 111624]
[41087, 41149, 41284]
[183778]
[131544, 131565, 131690]
[146147]
[90238]
[22633]
[141292, 141434]
[191211]
[106952, 107033]
[108295]
[94035, 94089, 94113]
[100682, 100735, 100803, 100877]
[151941, 151989]
[141537, 141577, 141655]
[171171, 171231, 171248, 171251]
[82938, 83079]
[137957, 137960, 137967]
[187817, 187880, 187889, 187923]
[84696]
[19262, 19288]
[171591]
[88290, 88305, 88343, 88394, 88427, 88464]
[123274]
[55379, 55412, 55483]
[175229]
[142444, 142486, 142498, 142517, 142548, 142603]
[165588, 165644]
[75489, 75547, 75619]
[59527, 59541, 59559]
[169097]
[142444, 142486, 142498, 142517, 142548, 142603]
[31972]
[138625, 138668, 138754]
[142444, 142486, 142498, 142517, 142548, 142603]
[145993, 146022]
[137080]
[130868]
[119164]
[159383, 159462, 159550]
[43490, 43543, 43579, 43649, 43750, 43778, 43783, 43792]
[114782]
[142444, 142486, 142498, 142517, 142548, 142603]
[110435]
[124918, 124926, 124958, 124960]
[95449, 95543]
[96569]
[97507, 9

In [74]:
# Inspect one of the buildings that appears repeatedly in the printed matches.
buildings.loc[142517]

addr                                              0 aberton avenue
event_id_list    [387080, 387076, 387078, 388121, 387072, 38708...
lat                                                        42.3317
lon                                                        -83.048
llcrnrlon                                                 -83.0482
llcrnrlat                                                  42.3316
urcrnrlon                                                 -83.0478
urcrnrlat                                                  42.3318
building_id                                                 142517
blighted                                                      True
Name: 142517, dtype: object

1301

In [82]:
# Number of permits left after collapsing rows that share the same (lon, lat).
permits.drop_duplicates(subset=['lon', 'lat']).shape[0]

4927

In [88]:
# Toy data containing repeated (A, B) pairs, used to try out drop_duplicates.
dict1 = {
    "A": [1, 2, 3, 4, 5] * 2,
    "B": [5, 2, 3, 4, 5, 5, 4, 3, 2, 1],
}

In [89]:
# Build the toy frame: one column per key of dict1.
df = pd.DataFrame(data=dict1)

In [91]:
# Rows are compared on all columns; the first occurrence of each row is kept.
df.drop_duplicates(keep='first')

Unnamed: 0,A,B
0,1,5
1,2,2
2,3,3
3,4,4
4,5,5
6,2,4
8,4,2
9,5,1
