# Exhaustive features

The EPC data contains several categorical variables with many distinct values. To find suitable features that retain the most information, three feature sets are explored:
* data driven
* domain driven
* exhaustive

The exhaustive feature set applies no grouping to the categorical values and feeds the model the maximum information possible. Leaving the categorical values ungrouped allows more advanced machine learning techniques to decide where to split the data during training. Although this should be the most predictive approach, the computational time needed to train the models will be considerably longer.

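This notebook removes features that correlate with each other or are leading, drops other fields that are not needed, and exports the resulting train and test sets.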

In [1]:
import numpy as np
import pandas as pd
import datetime
import os
import glob
import json

In [3]:
# Read the pipeline settings from config.json.
# The config directory is the absolute path of the parent directory with its
# last 7 characters removed.
# NOTE(review): presumably this strips a fixed-length folder name - confirm,
# since the slice silently breaks if the notebook is moved.
config_path = os.path.abspath('..')[:-7]

with open(config_path + '/config.json', 'r') as config_file:
    config = json.load(config_file)

# Every value used by this notebook lives in the DEFAULT section.
default_settings = config['DEFAULT']

# Directory holding the input and output files.
processing_path = default_settings['processing_path']

# Input file names (cleaned train / test sets).
epc_train_clean_fname = default_settings['epc_train_clean_fname']
epc_test_clean_fname = default_settings['epc_test_clean_fname']

# Output file names (exhaustive feature sets).
epc_train_ex_fname = default_settings['epc_train_ex_fname']
epc_test_ex_fname = default_settings['epc_test_ex_fname']

In [4]:
# INSPECTION_DATE is declared as a string column and is also listed in
# parse_dates below.
dtype_dict = {'INSPECTION_DATE': 'str'}

# Both files are read with exactly the same options, so define them once.
csv_read_options = {
    'header': 0,
    'delimiter': ',',
    'dtype': dtype_dict,
    'parse_dates': ['INSPECTION_DATE'],
}

# Load the cleaned train and test sets from the processing directory.
epc_train = pd.read_csv(os.path.join(processing_path, epc_train_clean_fname), **csv_read_options)
epc_test = pd.read_csv(os.path.join(processing_path, epc_test_clean_fname), **csv_read_options)

In [7]:
# Columns removed because they correlate with each other or are leading.
# NOTE(review): LMK_KEY and POSTCODE look like identifiers rather than
# correlated measurements - confirm they belong in this list.
correlated_variables = [
    # emissions, consumption and running costs
    'CO2_EMISS_CURR_PER_FLOOR_AREA', 'CO2_EMISSIONS_CURRENT', 'ENERGY_CONSUMPTION_CURRENT',
    'HEATING_COST_CURRENT',
    # hot water
    'HOT_WATER_COST_CURRENT', 'HOT_WATER_ENERGY_EFF', 'HOT_WATER_ENV_EFF',
    # lighting
    'LIGHTING_COST_CURRENT', 'LIGHTING_ENERGY_EFF', 'LIGHTING_ENV_EFF',
    'LMK_KEY',
    'LOW_ENERGY_LIGHTING',
    # main heating and its controls
    'MAIN_FUEL', 'MAINHEAT_ENERGY_EFF', 'MAINHEAT_ENV_EFF',
    'MAINHEATC_ENERGY_EFF', 'MAINHEATC_ENV_EFF', 'MAINHEATCONT_DESCRIPTION',
    'MECHANICAL_VENTILATION', 'MULTI_GLAZE_PROPORTION', 'NUMBER_HEATED_ROOMS',
    'POSTCODE',
    # roof, secondary heating, walls and windows
    'ROOF_ENERGY_EFF', 'ROOF_ENV_EFF',
    'SECONDHEAT_DESCRIPTION',
    'WALLS_ENERGY_EFF', 'WALLS_ENV_EFF',
    'WINDOWS_ENERGY_EFF', 'WINDOWS_ENV_EFF',
]

# Remaining columns that are not needed in the exhaustive feature set.
fields_to_drop = [
    'floors_average_thermal_transmittance',
    'low_energy_lighting_perc',
    'roof_average_thermal_transmittance',
    'walls_average_thermal_transmittance',
    'INSPECTION_DATE',
    'CURRENT_ENERGY_RATING',
]

In [8]:
# Remove both column lists from the train set, then from the test set.
# The drops mutate the frames in place, so re-running this cell without
# reloading the data raises a KeyError (the columns are already gone).
for epc_frame in (epc_train, epc_test):
    epc_frame.drop(columns=correlated_variables, inplace=True)
    epc_frame.drop(columns=fields_to_drop, inplace=True)

### Export

In [11]:
# Write the reduced train and test sets to the processing directory,
# without the DataFrame index.
for epc_frame, output_fname in ((epc_train, epc_train_ex_fname), (epc_test, epc_test_ex_fname)):
    epc_frame.to_csv(os.path.join(processing_path, output_fname), index=False)