# BlightFight Phase 2: Generate Training Set and Testing Set

In [55]:
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
pd.options.display.max_columns = None
from IPython.display import Image
from util import *

## Inspect permits

In [2]:
# Load the permit records; the path is relative to the notebook's working directory.
permits = pd.read_csv('../data/permits.csv')

In [3]:
len(permits)

7133

In [4]:
permits.head(1)

Unnamed: 0,PERMIT_NO,PERMIT_APPLIED,PERMIT_ISSUED,PERMIT_EXPIRES,SITE_ADDRESS,BETWEEN1,PARCEL_NO,LOT_NUMBER,SUBDIVISION,CASE_TYPE,CASE_DESCRIPTION,LEGAL_USE,ESTIMATED_COST,PARCEL_SIZE,PARCEL_CLUSTER_SECTOR,STORIES,PARCEL_FLOOR_AREA,PARCEL_GROUND_AREA,PRC_AKA_ADDRESS,BLD_PERMIT_TYPE,PERMIT_DESCRIPTION,BLD_PERMIT_DESC,BLD_TYPE_USE,RESIDENTIAL,DESCRIPTION,BLD_TYPE_CONST_COD,BLD_ZONING_DIST,BLD_USE_GROUP,BLD_BASEMENT,FEE_TYPE,CSM_CASENO,CSF_CREATED_BY,SEQ_NO,PCF_AMT_PD,PCF_AMT_DUE,PCF_UPDATED,OWNER_LAST_NAME,OWNER_FIRST_NAME,OWNER_ADDRESS1,OWNER_ADDRESS2,OWNER_CITY,OWNER_STATE,OWNER_ZIP,CONTRACTOR_LAST_NAME,CONTRACTOR_FIRST_NAME,CONTRACTOR_ADDRESS1,CONTRACTOR_ADDRESS2,CONTRACTOR_CITY,CONTRACTOR_STATE,CONTRACTOR_ZIP,CONDITION_FOR_APPROVAL,site_location,owner_location,contractor_location,geom,incident_id
0,BLD2015-03955,8/28/15,8/28/15,,4331 BARHAM,BETWEEN VOIGHT AND WAVENEY,21 65525-6,S15,,BLD,Building Permit,I - FAM DWLG,,5184,3,,0,829,,Dismantle,,,One Family Dwelling,RESIDENTIAL,,5B,R2,R3,Y,WPMT,BLD2015-03955,RSA,1,,$238.00,,DETROIT LAND BANK-HHF2,,65 CADILLAC,,DETROIT,MI,,DMC CONSULTANTS INC 2015 (W),,13500 FOLEY,,DETROIT,,48227,,"4331 BARHAM\r\nDetroit, MI\r\n(42.394106, -82....","65 CADILLAC\r\nDETROIT, MI\r\n(42.331741326000...","13500 FOLEY\r\nDETROIT, MI\r\n(42.379332, -83....",,0


In [5]:
permits['BLD_PERMIT_TYPE'].value_counts()

DISM         5859
Dismantle    1274
Name: BLD_PERMIT_TYPE, dtype: int64

In [6]:
# Drop permits without a site_location: the Address/Coordinates cells that follow
# call str.split on this column and would fail on NaN.
permits.dropna(subset=['site_location'], inplace=True)

In [7]:
# site_location looks like "<street>\r\n<city, state>\r\n(<lat>, <lon>)"; the first
# line is the street address. Splitting a '\r\n'-delimited string on '\n' leaves a
# trailing '\r', so strip the piece: this cleans the address and makes a blank
# first line compare equal to '' so it is actually dropped below.
# The result is assigned back to the column rather than using
# `permits['Address'].replace(..., inplace=True)`, which mutates a column selection
# and is not guaranteed to write through to the frame.
permits['Address'] = (
    permits['site_location']
    .map(lambda loc: loc.split('\n')[0].strip())
    .replace('', np.nan)
)
permits.dropna(subset=['Address'], inplace=True)

In [8]:
# The last line of site_location holds the "(<lat>, <lon>)" pair. Strip it so that
# a whitespace-only tail becomes '' and is dropped instead of reaching the float
# parsing later on.
# The result is assigned back to the column rather than using
# `permits['Coordinates'].replace(..., inplace=True)`, which mutates a column
# selection and is not guaranteed to write through to the frame.
permits['Coordinates'] = (
    permits['site_location']
    .map(lambda loc: loc.split('\n')[-1].strip())
    .replace('', np.nan)
)
permits.dropna(subset=['Coordinates'], inplace=True)

In [9]:
len(permits)

6316

In [10]:
# Coordinates is "(<lat>, <lon>)": take the text before the comma minus the leading
# '(' as latitude, and the text after it minus the trailing ')' as longitude.
# Use the builtin `float` as the dtype: `np.float` was only an alias for it and has
# been removed from recent NumPy releases.
permits['Latitude'] = (
    permits['Coordinates']
    .map(lambda pair: pair.split(',')[0][1:].strip())
    .astype(float)
)
permits['Longitude'] = (
    permits['Coordinates']
    .map(lambda pair: pair.split(',')[1][:-1].strip())
    .astype(float)
)
# NOTE(review): presumably removes badly geocoded points far north of the study
# area (the sample row sits at ~42.39) -- confirm the intent of the 45 cutoff.
permits = permits[permits['Latitude'] < 45]

## Map to buildings

In [11]:
# Load the buildings table; the path is relative to the notebook's working directory.
buildings = pd.read_csv('../data/buildings.csv')

In [12]:
buildings.head(3)

Unnamed: 0,bottom_left,top_right,addr,cluster,num,incident_ids,building_id
0,"[42.255267050999997, -83.161064993999986]","[42.255811, -83.16050500000001]",26585 OUTER DRIVE,0,2,"[53453.0, 446027.0]",0
1,"[42.256026999999996, -83.1607722]","[42.256177, -83.1606222]",3808 S Bassett St,1,1,[446640.0],1
2,"[42.256281200000004, -83.160299]","[42.25643120000001, -83.160149]",3795 S. Bassett,2,1,[446574.0],2


**pandas stores list-valued columns as strings when writing to CSV! Be sure to use `literal_eval` to recover them after loading!**

In [13]:
from ast import literal_eval

In [14]:
# read_csv returns each incident_ids cell as the string repr of a list; parse it
# back into a real list. Cells that are already lists are left untouched so this
# cell can be re-run safely (literal_eval raises when handed a list).
buildings['incident_ids'] = buildings['incident_ids'].map(
    lambda ids: ids if isinstance(ids, list) else literal_eval(ids)
)

In [15]:
def get_incident_to_building_mapping(buildings):
    """Invert the buildings table into an incident -> buildings lookup.

    Parameters
    ----------
    buildings : pandas.DataFrame
        Must provide a ``building_id`` column and an ``incident_ids`` column
        whose cells are iterables of incident ids (already parsed into lists,
        not their string representation).

    Returns
    -------
    dict
        Maps each incident id to the list of ``building_id`` values whose
        ``incident_ids`` contain it, in row order. An incident listed by
        several buildings maps to all of them.
    """
    mapping = {}
    # Walk only the two columns that are needed instead of using iterrows(),
    # which builds a Series per row and may upcast values to a common dtype.
    for incident_ids, building_id in zip(buildings['incident_ids'],
                                         buildings['building_id']):
        for incident_id in incident_ids:
            mapping.setdefault(incident_id, []).append(building_id)
    return mapping

In [16]:
# Build the incident id -> [building_id, ...] lookup from the parsed buildings table.
incident_to_building_mapping = get_incident_to_building_mapping(buildings)

In [17]:
# Persist the incident -> buildings lookup (written to the current working directory).
try:
    import cPickle as pickle  # Python 2: C-accelerated implementation
except ImportError:
    import pickle  # Python 3: cPickle no longer exists as a separate module
# Pickle output is bytes, so the file must be opened in binary mode.
with open('incident_to_building_mapping.dump', 'wb') as f:
    pickle.dump(incident_to_building_mapping, f)

## Construct training set

In [18]:
# incident_id of every permit row that survived the cleaning steps above.
dism_permits_ids = permits['incident_id'].tolist()

In [42]:
# Distinct buildings linked to at least one permit incident. A building can be
# reached through several incidents, so collect the ids in a set before
# converting back to a list.
dism_buildings = list({
    building_id
    for incident_id in dism_permits_ids
    for building_id in incident_to_building_mapping[incident_id]
})

In [54]:
# Number of distinct permit-linked buildings; reused later as the size of the
# negative sample.
len_dism_buildings = len(dism_buildings)
len_dism_buildings

5538

In [44]:
# Positive examples: every permit-linked building, labelled blighted = 1.
train_positive = pd.DataFrame(
    {'building_id': sorted(dism_buildings)}
).assign(blighted=1)

In [46]:
# Every building_id that is not linked to a permit.
# NOTE(review): the list order comes from set iteration, and the seeded
# random.sample over this list depends on that order -- keep this expression
# unchanged if the existing sample has to be reproduced.
non_dism_buildings = list(set(buildings['building_id'].tolist()) - set(dism_buildings))

In [47]:
# Sanity check: the permit-linked and remaining buildings together should account
# for every row of `buildings`, so this is expected to display 0.
len(non_dism_buildings) + len(dism_buildings) - len(buildings)

0

In [66]:
# Fixed seed so the draw can be repeated (given the same ordering of
# non_dism_buildings). Sample as many non-permit buildings as there are
# permit-linked ones, so both classes end up the same size.
random.seed(1027)
non_dism_sample = random.sample(non_dism_buildings, len_dism_buildings)

In [67]:
# Negative examples: the sampled non-permit buildings, labelled blighted = 0.
train_negative = pd.DataFrame(
    {'building_id': sorted(non_dism_sample)}
).assign(blighted=0)

In [68]:
# Stack positives on top of negatives and renumber the rows 0..n-1.
train_df = pd.concat([train_positive, train_negative], ignore_index=True)

In [70]:
# Write the labelled building ids without the row index, so the file contains only
# the building_id and blighted columns.
train_df.to_csv('../data/train.csv', index=False)