In [1]:
import numpy as np
import pandas as pd

# Step 1: Building List

## Collecting instances from 311 calls, crimes, blight violations, and demolition permits.

Data already cleaned by [this notebook](./Cleaning_data.ipynb)
The collection of data was saved at __../data/events.csv__

In [49]:
data_events = pd.read_csv('../data/events.csv')

In [50]:
data_events.head(10)

Unnamed: 0,event_id,lon,lat,addr,type
0,0,-83.161039,42.383998,13120-13130 ilene st,1
1,1,-83.080919,42.440471,1485 e outer dr,1
2,2,-82.962038,42.445244,15460 eastburn,1
3,3,-83.166194,42.421043,17541 mendota st,1
4,4,-83.162874,42.402033,griggs,1
5,5,-83.1581,42.399431,14902 kentucky,1
6,6,-83.24074,42.439669,20089 vaughan,1
7,7,-83.053367,42.430693,18663 fenelon st,1
8,8,-83.161803,42.410764,16170 ilene st,1
9,9,-83.152779,42.437114,19530 roselawn st,1


In [51]:
data_events.shape

(453192, 5)

In [52]:
# To get rid of duplicates with same coordinates and possibly different address names
building_pool = data_events.drop_duplicates(subset=['lon','lat'])

In [53]:
building_pool.shape

(219984, 5)

In [68]:
# For building in buildings sorted by 'lon', and then 'lat':
#    if building.lat same as prev_lat & building.lon same as prev_lon:
#        prev_event_list.append(event_id_list)
# Same process can be used for 'lat'
def condense_buildings(data, by=['lon','lat']):
    '''condense very similar values along a (or several) column direction'''
    sorted_data = data.sort_values(by=by, inplace=False)
    
    data_concise = {'addr': [], 'lon': [], 'lat': [], 'event_id_list': [], 'blighted': []}
    prev_llcrnrlon = 0
    prev_llcrnrlat = 0
    prev_urcrnrlon = 0
    prev_urcrnrlat = 0
    length = 4.11e-4 # longitude
    width = 2.04e-4 # latitude
    for i, entry in sorted_data.iterrows():
        lon = entry['lon']
        lat = entry['lat']
        b = entry['type']
        if (lon<prev_llcrnrlon) or (lon>prev_urcrnrlon) or (lat<prev_llcrnrlat) or (lat>prev_urcrnrlat):
            data_concise['addr'].append(entry['addr'])
            data_concise['lon'].append(entry['lon'])
            data_concise['lat'].append(entry['lat'])
            # below line is different from the loop for events_part2
            data_concise['event_id_list'].append([entry['event_id']])
            if b == 4:
                data_concise['blighted'].append(1)
            else:
                data_concise['blighted'].append(0)

            prev_llcrnrlon = lon - length/2
            prev_llcrnrlat = lat - width/2
            prev_urcrnrlon = lon + length/2
            prev_urcrnrlat = lat + width/2
        else:
            data_concise['event_id_list'][-1] = data_concise['event_id_list'][-1]\
                                              + [entry['event_id']]
            if b == 4:
                data_concise['blighted'][-1] = 1
    return pd.DataFrame(data_concise)

In [69]:
buildings_concise = condense_buildings(data_events, by=['lon','lat'])

In [70]:
len(buildings_concise['addr'])  # shorter than before

202522

In [71]:
buildings = buildings_concise

### Get rid of void coordinates

In [72]:
buildings = buildings[(buildings['lat']>42.25) & (buildings['lat']<42.5) & (buildings['lon']>-83.3) & (buildings['lon']<-82.9)]

In [73]:
buildings.shape

(202521, 5)

In [74]:
buildings['blighted'].value_counts()

0    197659
1      4862
Name: blighted, dtype: int64

## Recap of [step 0](./Building_size_estimation.ipynb)

## Adopting building coordinates
It turns out that there is a slight mismatch between real world building coordinates w.r.t given data. So that only median building dimension info is reserved from the building info we got from online open data at data.detroitmi.gov.

In [26]:
data_dir = '../data/'

In [27]:
buildings_step_0 = pd.read_csv(data_dir+'buildings_step_0.csv')
permits = pd.read_csv(data_dir+'permits.csv')

OSError: File b'../data/buildings_step_0.csv' does not exist

In [None]:
permits = permits[['PARCEL_NO', 'BLD_PERMIT_TYPE', 'addr', 'lon', 'lat']]

In [None]:
permits['BLD_PERMIT_TYPE'].unique()

### For example: the very first entry of permit has coordinate:

In [None]:
lon01 = permits.loc[1200,'lon']
lat01 = permits.loc[1200,'lat']
print(lon01,lat01)

### In real world data, this corresponds to:

In [33]:
c = buildings_step_0['addr'].apply(lambda x: x == permits.loc[0,'addr'])

NameError: name 'buildings_step_0' is not defined

In [None]:
buildings_step_0[c][['lon','lat']]

The coordinate of this building from data.detroitmi.gov is different from data given in our course material.

### Only building dimension info is adopted for our analysis.

In [28]:
length = 0.000411
width = 0.000204  # These results come from step 0.

In [30]:
buildings.loc[:,'llcrnrlon'] = buildings['lon'] - length/2
buildings.loc[:,'llcrnrlat'] = buildings['lat'] - width/2
buildings.loc[:,'urcrnrlon'] = buildings['lon'] + length/2
buildings.loc[:,'urcrnrlat'] = buildings['lat'] + width/2

buildings.loc[:,'building_id'] = np.arange(0,buildings.shape[0])
buildings = buildings.reindex()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [31]:
buildings.tail()

Unnamed: 0,addr,event_id_list,lat,lon,llcrnrlon,llcrnrlat,urcrnrlon,urcrnrlat,building_id
200050,00300 neff,[96671],42.3845,-82.91,-82.910206,42.384398,-82.909794,42.384602,200050
200051,"00000 kerby, grosse point farms",[127565],42.4139,-82.909,-82.909205,42.413798,-82.908794,42.414002,200051
200052,00400 calvin ave,[104873],42.4109,-82.9087,-82.908906,42.410798,-82.908494,42.411002,200052
200053,00 mack and renald,[118910],42.4352,-82.9082,-82.908406,42.435098,-82.907994,42.435302,200053
200054,"20000 mack plaza, grosse pointe woods police d...","[26965, 69819]",42.4314,-82.9053,-82.905506,42.431298,-82.905094,42.431502,200054


In [32]:
buildings.to_csv('../data/buildings.csv', index=False)

### Visualization

In [None]:
from bbox import draw_screen_bbox
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
buildings = pd.read_csv('../data/buildings.csv')
bboxes = buildings.loc[:,['llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat']]
bboxes = bboxes.as_matrix()

In [None]:
fig = plt.figure(figsize=(8,6), dpi=2000)
for box in bboxes:     
    draw_screen_bbox(box, fig)
    
plt.xlim(-83.3,-82.9)
plt.ylim(42.25,42.45)
plt.savefig('../data/buildings_distribution.png')
plt.show()