## Recap of step 0

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Relative path of the folder that holds the CSV files read below.
data_dir = '../data/'

In [5]:
# Load the building table written by step 0 and the permit records.
buildings = pd.read_csv(data_dir+'buildings_step_0.csv')
permits = pd.read_csv(data_dir+'permits.csv')

In [6]:
# Preview the first four rows of the step-0 building table.
buildings.head(4)

Unnamed: 0,index,building_id,lon,lat,llcrnrlon,llcrnrlat,urcrnrlon,urcrnrlat,addr,PARCELNO,length,width
0,0,0,-83.040064,42.328955,-83.041659,42.3276,-83.037944,42.33058,400 e jefferson,01000001.,0.003715,0.00298
1,1,1,-83.039415,42.327845,-83.040284,42.327441,-83.038001,42.328499,300 e atwater,01000002-3,0.002283,0.001058
2,2,2,-83.041866,42.328047,-83.042504,42.32768,-83.041037,42.328326,201 randolph,01000004.001,0.001467,0.000646
3,3,3,-83.041562,42.327899,-83.042353,42.327402,-83.040638,42.328329,159 randolph,01000004.002L,0.001715,0.000927


In [7]:
# List the columns available in the permits table.
permits.columns

Index(['PERMIT_NO', 'PERMIT_APPLIED', 'PERMIT_ISSUED', 'PERMIT_EXPIRES',
       'SITE_ADDRESS', 'BETWEEN1', 'PARCEL_NO', 'LOT_NUMBER', 'SUBDIVISION',
       'CASE_TYPE', 'CASE_DESCRIPTION', 'LEGAL_USE', 'ESTIMATED_COST',
       'PARCEL_SIZE', 'PARCEL_CLUSTER_SECTOR', 'STORIES', 'PARCEL_FLOOR_AREA',
       'PARCEL_GROUND_AREA', 'PRC_AKA_ADDRESS', 'BLD_PERMIT_TYPE',
       'PERMIT_DESCRIPTION', 'BLD_PERMIT_DESC', 'BLD_TYPE_USE', 'RESIDENTIAL',
       'DESCRIPTION', 'BLD_TYPE_CONST_COD', 'BLD_ZONING_DIST', 'BLD_USE_GROUP',
       'BLD_BASEMENT', 'FEE_TYPE', 'CSM_CASENO', 'CSF_CREATED_BY', 'SEQ_NO',
       'PCF_AMT_PD', 'PCF_AMT_DUE', 'PCF_UPDATED', 'OWNER_LAST_NAME',
       'OWNER_FIRST_NAME', 'OWNER_ADDRESS1', 'OWNER_ADDRESS2', 'OWNER_CITY',
       'OWNER_STATE', 'OWNER_ZIP', 'CONTRACTOR_LAST_NAME',
       'CONTRACTOR_FIRST_NAME', 'CONTRACTOR_ADDRESS1', 'CONTRACTOR_ADDRESS2',
       'CONTRACTOR_CITY', 'CONTRACTOR_STATE', 'CONTRACTOR_ZIP',
       'CONDITION_FOR_APPROVAL', 'owner_location'

In [8]:
# Keep only the parcel number, permit type, address and coordinates.
permit_columns = ['PARCEL_NO', 'BLD_PERMIT_TYPE', 'addr', 'lon', 'lat']
permits = permits[permit_columns]

In [9]:
# Show the distinct permit-type labels present in the data.
permits['BLD_PERMIT_TYPE'].unique()

array(['Dismantle', 'DISM'], dtype=object)

In [10]:
# Preview the first four rows of the reduced permits table.
permits.head(4)

Unnamed: 0,PARCEL_NO,BLD_PERMIT_TYPE,addr,lon,lat
0,2165525-6,Dismantle,4331 barham,-82.9474,42.394106
1,192312.,Dismantle,9707 bessemore,-83.005077,42.395122
2,2169321.,Dismantle,5315 berkshire,-82.946476,42.40322
3,2125930.,Dismantle,16670 bringard dr,-82.947207,42.44664


In [11]:
# Report how many permit rows (one per blighted-building record) were loaded.
n_permits = len(permits)
print("Total number of blighted buildings: %d" % n_permits)

Total number of blighted buildings: 6315


## Adopting building coordinates
It turns out that the real-world building coordinates differ slightly from the coordinates in the given data. Therefore, only the average building dimensions are retained from the building information obtained from the open data portal at data.detroitmi.gov.

### For example: the very first entry of permit has coordinate:

In [12]:
# Coordinates of the very first permit entry (row label 0) -- the same row whose
# address is compared against the building table in the next cell.
lon01 = permits.loc[0, 'lon']
lat01 = permits.loc[0, 'lat']

In [13]:
# Boolean mask of buildings whose address equals that of the first permit entry.
# The permit address is looked up once and compared column-wise instead of being
# re-read from `permits` for every building row.
first_permit_addr = permits.loc[0, 'addr']
c = buildings['addr'] == first_permit_addr

In [14]:
buildings[c]

Unnamed: 0,index,building_id,lon,lat,llcrnrlon,llcrnrlat,urcrnrlon,urcrnrlat,addr,PARCELNO,length,width
261994,261995,261994,-82.947708,42.393997,-82.947911,42.393874,-82.947455,42.394124,4331 barham,21065525-6,0.000456,0.00025


This does not exactly match the coordinates of the same building in the `permits.csv` file from the course.

__Take the coordinates from the course data files and assign each building the average dimensions taken from the real-world data:__

In [15]:
# Fixed footprint used for every building instead of per-building dimensions.
# NOTE(review): presumably degrees of longitude (length) and latitude (width),
# like the length/width columns of the step-0 table -- confirm.
length = 0.000411
width = 0.000204

# Step 1: Building List

## Collecting instances from 311 calls, crimes, and blight violations

The data was already cleaned by [this notebook](./Cleaning_data.ipynb).
The collected events were saved to __../data/events.csv__.

In [16]:
# Load the cleaned events from the same data folder as the other inputs, so a
# change to `data_dir` applies to every file this notebook reads.
data_events = pd.read_csv(data_dir + 'events.csv')

In [18]:
# Order the events by address; the sorted frame replaces the unsorted one.
data_events = data_events.sort_values(by='addr')

In [21]:
data_events.head(10)

Unnamed: 0,event_id,lon,lat,addr,type
2778,2778,-83.041618,42.408738,5949 casmere st,1
919,919,-83.175329,42.358054,8641 littlefield,1
12722,12722,-83.137666,42.338734,wagner,1
372868,372868,-83.216326,42.369786,0 10th,3
372312,372312,-83.216326,42.369786,0 10th,3
345447,345447,-83.216326,42.369786,0 10th,3
372311,372311,-83.216326,42.369786,0 10th,3
372887,372887,-83.216326,42.369786,0 10th,3
345451,345451,-83.216326,42.369786,0 10th,3
377671,377671,-83.216326,42.369786,0 10th,3


In [129]:
# Group the events by street address. The column is named with a plain string
# rather than a one-element list so that each group key is a scalar address
# (newer pandas yields 1-tuples for list keys), and the deprecated `axis`
# argument is dropped in favour of the default row-wise grouping.
events_grouped = data_events.groupby('addr')

In [152]:
# Accumulator for the merged buildings: four parallel lists, one entry per building.
events_grouped_list = {'addr': [], 'lon': [], 'lat': [], 'event_id_list': []}

# Address groups whose coordinates disagree by more than the tolerance are collected here.
unknown_address = []

# Tolerance used when deciding whether two coordinates refer to the same spot.
epsilon = 1.0e-5 # 1/40 of median length of a building or 1/20 of median width of a building

def similar_vals(lst, epsilon=1.0e-5):
    '''Collapse a list of nearly identical numbers into one representative value.

    A single-element list yields that element unchanged. Otherwise the median
    is returned when every element lies within `epsilon` of it; if any element
    is farther away, the sentinel -9999 is returned instead.
    '''
    if len(lst) == 1:
        return lst[0]

    centre = np.median(lst)
    # One outlier is enough to declare the values dissimilar.
    has_outlier = any(abs(item - centre) > epsilon for item in lst)
    return -9999 if has_outlier else centre

In [None]:
def _resolve_coordinate(values):
    '''Reduce one address group's coordinate column to a single number.

    Returns the only distinct value when there is just one; otherwise the
    median from `similar_vals` when all distinct values agree within its
    tolerance, or None when they disagree.
    '''
    distinct = values.drop_duplicates().values.tolist()
    if len(distinct) == 1:
        return distinct[0]
    merged = similar_vals(distinct)
    # similar_vals reports disagreement with the sentinel -9999.
    return None if merged < -1000 else merged


# Start from empty accumulators so that re-running this cell rebuilds the
# result instead of appending every address a second time.
events_grouped_list = {'addr': [], 'lon': [], 'lat': [], 'event_id_list': []}
unknown_address = []

for address, entry in events_grouped:
    event_id_list = entry['event_id'].values.tolist()

    # A group whose longitudes (or latitudes) disagree cannot be placed; set it aside.
    lon = _resolve_coordinate(entry['lon'])
    if lon is None:
        unknown_address.append(entry)
        continue

    lat = _resolve_coordinate(entry['lat'])
    if lat is None:
        unknown_address.append(entry)
        continue

    # Compare with the most recently stored building: if both coordinates are
    # within epsilon, this address is treated as the same building.
    same_as_previous = (
        len(events_grouped_list['addr']) != 0
        and abs(lon - events_grouped_list['lon'][-1]) < epsilon
        and abs(lat - events_grouped_list['lat'][-1]) < epsilon
    )

    if same_as_previous:
        events_grouped_list['event_id_list'][-1].extend(event_id_list)
    else:
        events_grouped_list['addr'].append(address)
        events_grouped_list['lon'].append(lon)
        events_grouped_list['lat'].append(lat)
        events_grouped_list['event_id_list'].append(event_id_list)

# NOTE: this rebinds `buildings`, replacing the step-0 table loaded earlier.
buildings = pd.DataFrame(events_grouped_list)

In [None]:
# Number of address groups set aside because their coordinates disagreed.
len(unknown_address)

In [None]:
# Rows and columns of the building table built from the grouped events.
buildings.shape

In [None]:
# Preview the first ten buildings of the new table.
buildings.head(10)