# Data Preprocessing
This notebook includes all the steps we took to prepare our data for our three models (i.e. Naive Bayes, Random Forest Classification, and Bayesian Networks). 

In [179]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [180]:
# import file
dbn = pd.read_csv('data/all_boilder_year_data_w_ACS.csv')
dbn.drop('Unnamed: 0', axis=1, inplace=True)
print(dbn.shape)
dbn.head()

  interactivity=interactivity, compiler=compiler, result=result)


(1632518, 95)


Unnamed: 0,bin_number,boiler_id,defects_exist,inspection_year,first_inspection,boiler_age,years_since_last_defects,boiler_make,boiler_model,boiler_pressure_type,...,total_population,white_population,median_household_income,state,county,tract,boro,over_65_pct,over_white_pct,tract2
0,1080776,10000000027N0001,0,1993,1993,0,,SLANTFIN,GG-399,Low Pressure,...,3803.0,2978.0,135588.0,36.0,61.0,7400.0,1.0,0.14173,0.783066,7400.0
1,1080776,10000000027N0001,0,1994,1993,1,,SLANTFIN,GG-399,Low Pressure,...,3803.0,2978.0,135588.0,36.0,61.0,7400.0,1.0,0.14173,0.783066,7400.0
2,1080776,10000000027N0001,0,1995,1993,2,,SLANTFIN,GG-399,Low Pressure,...,3803.0,2978.0,135588.0,36.0,61.0,7400.0,1.0,0.14173,0.783066,7400.0
3,1080776,10000000027N0001,0,1996,1993,3,,SLANTFIN,GG-399,Low Pressure,...,3803.0,2978.0,135588.0,36.0,61.0,7400.0,1.0,0.14173,0.783066,7400.0
4,1080776,10000000027N0001,0,1997,1993,4,,SLANTFIN,GG-399,Low Pressure,...,3803.0,2978.0,135588.0,36.0,61.0,7400.0,1.0,0.14173,0.783066,7400.0


## Step 1: Create Dummy Variables

List of variables we want to include from dataset (italics are features we need to create dummy variable for):
* **Defects exist** (what our model will predict)
* Years since last failure (if any)
* Boiler Age
* *Boiler Pressure Type* - turn to 0/1
* *Boiler Make/Model*
* *Census tract*
* Pct White
* Median Income
* *Building Owner*
* Building Area
* Res Units
* Number Floors
* *Land Use (11 discrete values)*

In [181]:
dbn_cropped = dbn[['boiler_id','defects_exist','ct_bc_comb','boiler_age',
                  'years_since_last_defects', 'boiler_make', 'boiler_model', 
                   'boiler_pressure_type','ownername','landuse','bldgarea',
                   'unitstotal','builtfar','median_household_income',
                   'over_65_pct', 'over_white_pct'
                  ]].copy()
dbn_cropped.iloc[13]

boiler_id                       10000000027N0001
defects_exist                                  0
ct_bc_comb                                107400
boiler_age                                    15
years_since_last_defects                       3
boiler_make                             SLANTFIN
boiler_model                              GG-399
boiler_pressure_type                Low Pressure
ownername                   88 MADISON HOTEL FEE
landuse                                        5
bldgarea                                  185785
unitstotal                                     8
builtfar                                   10.38
median_household_income                   135588
over_65_pct                              0.14173
over_white_pct                          0.783066
Name: 13, dtype: object

#### 1. Pressure Type

In [182]:
dbn_cropped.boiler_pressure_type = pd.Categorical(dbn_cropped.boiler_pressure_type)
dbn_cropped['dummy_pressure_type'] = dbn_cropped.boiler_pressure_type.cat.codes
dbn_cropped.dummy_pressure_type.head()

0    1
1    1
2    1
3    1
4    1
Name: dummy_pressure_type, dtype: int8

low pressure = 1, high pressure = 0

#### 2. Boiler Make/Model
find worst performers and combine into a binary variable?

In [183]:
defects = dbn[dbn.defects_exist == 1]
defects.shape, dbn.shape

((191806, 95), (1632518, 95))

In [184]:
def RateFunc(field):
    '''
    this function is used to determine what classes for a given field 
    have the highest rate of failure
    '''
    gb_defects = defects.groupby(field)['bin_number'].count().reset_index()
    gb_all = dbn.groupby(field)['bin_number'].count().reset_index()
    summary = gb_defects.merge(gb_all, on=field)
    summary = summary.rename(columns={'bin_number_x':'CountFailed', 'bin_number_y':'CountTotal'})
    summary['FailureRate'] = (summary.CountFailed/summary.CountTotal)*100
    summary = summary.sort_values(by='FailureRate',ascending=False).reset_index(drop=True)
    return summary

In [185]:
dbn_cropped.boiler_make.nunique()

1851

In [186]:
dbn_cropped.boiler_model.nunique()

12659

*Obviously can't search over 1,851 makes and 12,659 models. Instead, create dummy variables for worst performing model/make combinations*

In [187]:
dbn_cropped.shape[0]

1632518

In [188]:
make = RateFunc(['boiler_make'])
make[make.CountTotal > 200].head()

Unnamed: 0,boiler_make,CountFailed,CountTotal,FailureRate
116,BRYAN,295,1041,28.338136
120,NATIONAL RADIATOR,80,289,27.681661
145,GIBRALTOR,93,369,25.203252
168,LCHINVAR,59,238,24.789916
188,HAMILTON,100,437,22.883295


In [189]:
bad_makes = set()
sum_counts = 0
num_records = dbn_cropped.shape[0]

for i, row in make.iterrows():
    if sum_counts>(num_records/10):
        break
    # only bad makes that account for 10%?
    bad_makes.add(row['boiler_make'])
    sum_counts+=row['CountTotal']
        
bad_makes = list(bad_makes)

In [190]:
len(bad_makes), sum_counts, num_records

(401, 227782, 1632518)

In [191]:
dbn_cropped['dummy_make'] = dbn_cropped.boiler_make.apply(lambda x: x in bad_makes)
dbn_cropped.dummy_make.head()

0    True
1    True
2    True
3    True
4    True
Name: dummy_make, dtype: bool

#### Now for boiler model types

In [192]:
models = RateFunc(['boiler_model'])
bad_models = set()
sum_counts = 0

for i, row in models.iterrows():
    if sum_counts>(num_records/10):
        break
    # only bad models that account for 10%?
    bad_models.add(row['boiler_model'])
    sum_counts+=row['CountTotal']
        
bad_models = list(bad_models)
len(bad_models)

3642

In [193]:
dbn_cropped['dummy_model'] = dbn_cropped.boiler_model.apply(lambda x: x in bad_models)
dbn_cropped.dummy_model.head()

0    True
1    True
2    True
3    True
4    True
Name: dummy_model, dtype: bool

#### 3. Census Tracts
isolate poor performing census tracts

In [194]:
tracts = RateFunc(['ct_bc_comb'])
bad_tracts = set()
sum_counts = 0

for i, row in tracts.iterrows():
    if sum_counts>(num_records/10):
        break
    # only bad models that account for 10%?
    bad_tracts.add(row['ct_bc_comb'])
    sum_counts+=row['CountTotal']
        
bad_tracts = list(bad_tracts)
len(bad_tracts)

317

In [195]:
dbn_cropped['dummy_tract'] = dbn_cropped.ct_bc_comb.apply(lambda x: x in bad_tracts)
dbn_cropped.dummy_tract.head()

0    False
1    False
2    False
3    False
4    False
Name: dummy_tract, dtype: bool

#### 4. Building Owner

In [196]:
owners = RateFunc(['ownername'])
bad_owners = set()
sum_counts = 0

for i, row in owners.iterrows():
    if sum_counts>(num_records/10):
        break
    # only bad models that account for 10%?
    bad_owners.add(row['ownername'])
    sum_counts+=row['CountTotal']
        
bad_owners = list(bad_owners)
len(bad_owners)

5928

In [197]:
dbn_cropped['dummy_owner'] = dbn_cropped.ownername.apply(lambda x: x in bad_owners)
dbn_cropped.dummy_owner.head()

0    True
1    True
2    True
3    True
4    True
Name: dummy_owner, dtype: bool

#### 5. Years since last defects

What to do about nans in years_since_last_defects field?

In [198]:
# more nan values than not, cannot just drop them from table
print(dbn_cropped[dbn_cropped.years_since_last_defects.isnull()].shape,
dbn_cropped[~dbn_cropped.years_since_last_defects.isnull()].shape)

(987465, 21) (645053, 21)


In [199]:
# test below code on subsample of dataset (it takes a long time)
# subsam = dbn_cropped.iloc[:100000].copy()
# dbn_cropped = dbn_cropped.copy()

In [200]:
# create dummy variable for defects history
# track number of defects for given boiler ID

dbn_cropped['defects_history'] = dbn_cropped.groupby('boiler_id')['defects_exist'].cumsum().shift(1).fillna(0).astype(int)
dbn_cropped.iloc[15:22]

Unnamed: 0,boiler_id,defects_exist,ct_bc_comb,boiler_age,years_since_last_defects,boiler_make,boiler_model,boiler_pressure_type,ownername,landuse,...,builtfar,median_household_income,over_65_pct,over_white_pct,dummy_pressure_type,dummy_make,dummy_model,dummy_tract,dummy_owner,defects_history
15,10000000027N0001,0,107400,17,1.0,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,2
16,10000000027N0001,0,107400,19,3.0,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,2
17,10000000027N0001,1,107400,20,4.0,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,2
18,10000000027N0001,1,107400,21,1.0,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,3
19,10000000027N0001,0,107400,23,2.0,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,4
20,10000000027N0001,0,107400,24,3.0,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,4
21,10000000027N0001,0,107400,25,4.0,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,4


In [201]:
# dbn_cropped['boiler_age'] = dbn.boiler_age
# subsam = dbn_cropped.iloc[:100000].copy()

In [202]:
# subsam.head()

In [203]:
# dbn_cropped.drop('boiler_age', axis=1, inplace=True)

In [204]:
# increasing values for nans, until defects seen
dbn_cropped.years_since_last_defects = dbn_cropped.years_since_last_defects.fillna(dbn_cropped.boiler_age).astype(int)
dbn_cropped.iloc[15:25]

Unnamed: 0,boiler_id,defects_exist,ct_bc_comb,boiler_age,years_since_last_defects,boiler_make,boiler_model,boiler_pressure_type,ownername,landuse,...,builtfar,median_household_income,over_65_pct,over_white_pct,dummy_pressure_type,dummy_make,dummy_model,dummy_tract,dummy_owner,defects_history
15,10000000027N0001,0,107400,17,1,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,2
16,10000000027N0001,0,107400,19,3,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,2
17,10000000027N0001,1,107400,20,4,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,2
18,10000000027N0001,1,107400,21,1,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,3
19,10000000027N0001,0,107400,23,2,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,4
20,10000000027N0001,0,107400,24,3,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,4
21,10000000027N0001,0,107400,25,4,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,4
22,10000000027N0002,0,107400,0,0,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,4
23,10000000027N0002,0,107400,1,1,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,0
24,10000000027N0002,0,107400,2,2,SLANTFIN,GG-399,Low Pressure,88 MADISON HOTEL FEE,5.0,...,10.38,135588.0,0.14173,0.783066,1,True,True,False,True,0


#### Landuse _> convert to dummies

In [205]:
dbn_cropped = pd.get_dummies(dbn_cropped, columns=['landuse'])
dbn_cropped.columns

Index(['boiler_id', 'defects_exist', 'ct_bc_comb', 'boiler_age',
       'years_since_last_defects', 'boiler_make', 'boiler_model',
       'boiler_pressure_type', 'ownername', 'bldgarea', 'unitstotal',
       'builtfar', 'median_household_income', 'over_65_pct', 'over_white_pct',
       'dummy_pressure_type', 'dummy_make', 'dummy_model', 'dummy_tract',
       'dummy_owner', 'defects_history', 'landuse_1.0', 'landuse_2.0',
       'landuse_3.0', 'landuse_4.0', 'landuse_5.0', 'landuse_6.0',
       'landuse_7.0', 'landuse_8.0', 'landuse_9.0', 'landuse_10.0',
       'landuse_11.0'],
      dtype='object')

#### Cleanup table (get rid of non continuous/discrete features)

In [206]:
print(dbn.groupby('boiler_pressure_type')['boiler_id'].count())
print(dbn_cropped.groupby('dummy_pressure_type')['defects_exist'].count())

boiler_pressure_type
High Pressure       5546
Low Pressure     1537503
Name: boiler_id, dtype: int64
dummy_pressure_type
-1      89469
 0       5546
 1    1537503
Name: defects_exist, dtype: int64


In [207]:
# drop where no pressure type existed
dbn_cropped = dbn_cropped[dbn_cropped.dummy_pressure_type > -1].reset_index(drop=True)

In [208]:
# Drop variables replaced by dummy variables
dbn_cropped = dbn_cropped.drop(labels=['boiler_id','boiler_age','ct_bc_comb','boiler_make', 'boiler_model',
                        'boiler_pressure_type', 'ownername'], axis=1)
dbn_cropped.head()

Unnamed: 0,defects_exist,years_since_last_defects,bldgarea,unitstotal,builtfar,median_household_income,over_65_pct,over_white_pct,dummy_pressure_type,dummy_make,...,landuse_2.0,landuse_3.0,landuse_4.0,landuse_5.0,landuse_6.0,landuse_7.0,landuse_8.0,landuse_9.0,landuse_10.0,landuse_11.0
0,0,0,185785.0,8.0,10.38,135588.0,0.14173,0.783066,1,True,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,185785.0,8.0,10.38,135588.0,0.14173,0.783066,1,True,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,185785.0,8.0,10.38,135588.0,0.14173,0.783066,1,True,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,3,185785.0,8.0,10.38,135588.0,0.14173,0.783066,1,True,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,4,185785.0,8.0,10.38,135588.0,0.14173,0.783066,1,True,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Get rid of leftover nan values

In [210]:
before = dbn_cropped.shape[0] 
dbn_cropped = dbn_cropped.dropna(axis=0)
after = dbn_cropped.shape[0]
print('{} --> {}: lost {} records'.format(before, after, before-after))

1543049 --> 1535994: lost 7055 records


In [211]:
# dbn_cropped.to_csv('data/dbn_for_models.csv', index=False)