# Data Driven Features

The EPC data contains several categorical variables with a large number of distinct values. To find suitable features that retain the most information, three feature sets are explored:
* data driven
* domain driven
* exhaustive
The first approach, termed data driven, uses statistical methods to reduce the number of variables. Because the variables containing textual descriptions of the property were entered free-hand, many contain a large number of unique values, some of which are recorded for only one property. The data driven approach uses a single-level Chi-square Automatic Interaction Detector (CHAID) to group the levels within each categorical variable into a smaller number of groups. CHAID groups values with a similar response rate, which in this context is the Energy Efficiency Rating (EER).

This script applies the groupings from the CHAID and bins the numerical fields.

In [1]:
import numpy as np
import pandas as pd
import datetime
import os
import glob
import json

In [2]:
# Read the project configuration. config_path is the absolute path of the parent
# directory with its last seven characters sliced off; config.json is opened
# directly beneath that location.
config_path = os.path.abspath('..')[:-7]

with open(config_path + '/config.json', 'r') as config_file:
    config = json.load(config_file)

# Every setting used in this notebook comes from the 'DEFAULT' section.
defaults = config['DEFAULT']
processing_path = defaults['processing_path']
# file names of the cleaned inputs
epc_train_clean_fname = defaults['epc_train_clean_fname']
epc_test_clean_fname = defaults['epc_test_clean_fname']
# file names of the outputs (taken from the '*_dd_fname' config keys)
epc_train_data_fname = defaults['epc_train_dd_fname']
epc_test_data_fname = defaults['epc_test_dd_fname']

In [3]:
# Both files are read with identical options: first row is the header, comma
# separated, and INSPECTION_DATE is declared as str as well as listed in parse_dates.
dtype_dict = {'INSPECTION_DATE':'str'}
read_options = {
    'header': 0,
    'delimiter': ',',
    'dtype': dtype_dict,
    'parse_dates': ['INSPECTION_DATE'],
}

epc_train = pd.read_csv(os.path.join(processing_path, epc_train_clean_fname), **read_options)
epc_test = pd.read_csv(os.path.join(processing_path, epc_test_clean_fname), **read_options)

In [4]:
# Restore chaid_dict from IPython's %store database (presumably saved by the CHAID
# notebook - confirm). From its use below it is indexed as
# chaid_dict[<COLUMN NAME>]['node<k>'] and each entry is a list of category levels.
%store -r chaid_dict

## Combine bins of categorical variables into a smaller number of bins

### Built_form

In [5]:
# Merge the two terrace forms and the two detached forms; any other BUILT_FORM
# value is carried over unchanged.
built_dict = {
    'Mid-Terrace': 'terraced',
    'End-Terrace': 'terraced',
    'Semi-Detached': 'detached',
    'Detached': 'detached',
}
for frame in (epc_train, epc_test):
    frame['built_form'] = frame['BUILT_FORM'].replace(built_dict)

### Energy_tariff

In [6]:
# 'off-peak 18 hour' is missing from the CHAID output because of its small volume,
# so it is added to node2 by hand. The '<missing>' placeholder is taken out of node1.
# ('Unknown' is not removed from node1 here.)
tariff_nodes = chaid_dict['ENERGY_TARIFF']
tariff_nodes['node2'].append('off-peak 18 hour')
tariff_nodes['node1'].remove('<missing>')

In [7]:
# node1 levels -> 'single/dual', node2 levels -> 'off-peak', 'Unknown' -> missing.
# Later entries win on duplicate keys, so the precedence is node1 < 'Unknown' < node2.
tariff_nodes = chaid_dict['ENERGY_TARIFF']
energy_dict = {
    **dict.fromkeys(tariff_nodes['node1'], 'single/dual'),
    'Unknown': np.nan,
    **dict.fromkeys(tariff_nodes['node2'], 'off-peak'),
}
for frame in (epc_train, epc_test):
    frame['energy_tariff'] = frame['ENERGY_TARIFF'].replace(energy_dict)

### Floor description

In [8]:
# Take the '<missing>' placeholder out of the first FLOOR_DESCRIPTION node
# (list.remove raises ValueError if it is not present).
floor_node1 = chaid_dict['FLOOR_DESCRIPTION']['node1']
floor_node1.remove('<missing>')

In [9]:
# Map every level in CHAID node k (k = 1..5) to the label 'floor group k'.
floor_desc_dict = {}
for node_no in range(1, 6):
    node_levels = chaid_dict['FLOOR_DESCRIPTION']['node{}'.format(node_no)]
    floor_desc_dict.update(dict.fromkeys(node_levels, 'floor group {}'.format(node_no)))

for frame in (epc_train, epc_test):
    frame['floor_description'] = frame['FLOOR_DESCRIPTION'].replace(floor_desc_dict)

Tidying up levels not included in the chaid

In [10]:
# Hand-assigned groups for floor descriptions that the CHAID mapping did not cover.
floor_desc_dict_extra = {
    'average thermal transmittance 1.4 w/m²k': 'floor group 2',
    'solid': 'floor group 2',
    'average thermal transmittance 1.9 w/m²k': 'floor group 2',
    'above unheated space or full exposed': 'floor group 3',
    'average thermal transmittance 2.5 w/m²k': 'floor group 4',
    'to unheated space': 'floor group 4',
}
for frame in (epc_train, epc_test):
    frame['floor_description'] = frame['floor_description'].replace(floor_desc_dict_extra)

### Floor Level

In [13]:
# Collapse FLOOR_LEVEL into four bands; levels not listed here pass through unchanged.
floor_level_bands = {
    'ground floor': ['Ground','ground floor','Basement'],
    'low floors': ['1st','2nd','3rd','4th'],
    'mid floors': ['mid floor','5th','6th','7th','8th','9th','10th','11th'],
    # This band used to be labelled 'mid floors' as well, which silently merged it
    # with the band above even though it was defined as a separate group.
    # NOTE(review): confirm the label with whoever consumes 'floor_level' downstream.
    'top floors': ['top floor','12th','13th','14th','15th','16th','17th','18th','19th','20th',
                   '21st or above'],
}
floor_level_dict = {
    level: band
    for band, levels in floor_level_bands.items()
    for level in levels
}
for frame in (epc_train, epc_test):
    frame['floor_level'] = frame['FLOOR_LEVEL'].replace(floor_level_dict)

### Glazed Type

In [14]:
# Reduce GLAZED_TYPE to four groups; the literal 'INVALID!' is turned into a
# missing value. Levels not listed are left as they are.
glazed_dict = {
    'double glazing installed before 2002': 'old double glazing',
    'double glazing, unknown install date': 'old double glazing',
    'triple, known data': 'triple glazing',
    'triple glazing': 'triple glazing',
    'secondary glazing': 'old glazing',
    'not defined': 'old glazing',
    'single glazing': 'old glazing',
    'double, known data': 'double glazing',
    'double glazing installed during or after 2002': 'double glazing',
    'INVALID!': np.nan,
}
for frame in (epc_train, epc_test):
    frame['glazed_type'] = frame['GLAZED_TYPE'].replace(glazed_dict)

### Hot water description

In [15]:
# Map every level in CHAID node k (k = 1, 2) to the label 'water group k'.
hot_water_dict = {}
for node_no in range(1, 3):
    node_levels = chaid_dict['HOTWATER_DESCRIPTION']['node{}'.format(node_no)]
    hot_water_dict.update(dict.fromkeys(node_levels, 'water group {}'.format(node_no)))

for frame in (epc_train, epc_test):
    frame['hotwater_description'] = frame['HOTWATER_DESCRIPTION'].replace(hot_water_dict)

In [16]:
# Hand-assigned groups for hot water descriptions. The strings are matched exactly,
# so they are kept verbatim.
water_group_1_levels = [
    '7-hour tariff (on-peak)',
    'From community scheme, plus solar',
    'From main system, 7-hour tariff (on-peak)',
    'Solid fuel range cooker, plus solar, no cylinder thermostat',
    'Gas boiler/circulator, plus solar, no cylinder thermostat',
    'Electric immersion, standard tariff, plus solar, waste water heat recovery',
    'Solid fuel boiler/circulator, plus solar',
    'Community scheme with CHP',
    'Oil boiler/circulator, waste water heat recovery',
    'From secondary system, no cylinder thermostat, plus solar',
    'From secondary system, no cylinderstat, plus solar',
    'From second main heating system, plus solar',
    'Gas range cooker, plus solar',
    'Gas range cooker, plus solar, no cylinder thermostat',
    'Heat pump, waste water heat recovery',
    'SAP:Hot-Water',
    'From main system, no cylinder thermostat, flue gas heat recovery',
]
water_group_2_levels = [
    'O system eilaidd',
    'Room heaters, anthracite',
    'Oil range cooker, plus solar',
    'Oil range cooker, plus solar, no cylinder thermostat',
    'From main system, no cylinderstat, no cylinderstat',
    'No system present : electric immersion assumed',
    'Electric immersion',
    'From secondary heater, standard tariff',
    'From secondary heater',
    'No system present?electric immersion assumed',
    ', no cylinderstat',
    'Single-point gas water heater',
    'Point gas water heater, no cylinderstat',
    'Back boiler (hot water only), gas',
    'Gas multipoint, no cylinder thermostat',
    'Electric instantaneous at point of use, no cylinder thermostat',
    'No hot water system present - electric immersion assumed, plus solar',
    'No system present: electric immersion assumed, no cylinder thermostat',
    'Solid fuel range cooker, no cylinderstat',
    'Gas boiler/circulator, no cylinderstat',
]
water_dict_extra = {
    level: group
    for group, levels in (('water group 1', water_group_1_levels),
                          ('water group 2', water_group_2_levels))
    for level in levels
}
for frame in (epc_train, epc_test):
    frame['hotwater_description'] = frame['hotwater_description'].replace(water_dict_extra)

In [17]:
# Pool every value that has not yet been given a 'water group' label - in either
# train or test - into 'water group 3'. 'From main system' is NOT excluded from this
# step (the exclusion that once kept it separate is no longer applied).
# NOTE(review): missing values also fail the 'water group' test and so end up as
# keys of this mapping - confirm that relabelling them is intended.
hotwater_leftover_dict = {}
for frame in (epc_train, epc_test):
    for level in set(frame['hotwater_description']):
        if 'water group' not in str(level):
            hotwater_leftover_dict[level] = 'water group 3'

for frame in (epc_train, epc_test):
    frame['hotwater_description'] = frame['hotwater_description'].replace(hotwater_leftover_dict)

###  Lighting Description

In [18]:
# Make low_energy_lighting_perc agree with the two unambiguous lighting descriptions:
# 'low energy lighting in all fixed outlets' -> 100.0 and 'no low energy lighting' -> 0.
# Every other row keeps its existing value. Boolean-mask assignment applies each rule
# in a single vectorised pass instead of calling a Python lambda once per row.
# NOTE(review): low_energy_lighting_perc is listed in fields_to_drop further down, so
# confirm these values are still needed before export.
for frame in (epc_train, epc_test):
    lighting = frame['LIGHTING_DESCRIPTION']
    frame.loc[lighting == 'low energy lighting in all fixed outlets', 'low_energy_lighting_perc'] = 100.0
    frame.loc[lighting == 'no low energy lighting', 'low_energy_lighting_perc'] = 0

In [19]:
# Take the '<missing>' placeholder out of the first LIGHTING_DESCRIPTION node
# (list.remove raises ValueError if it is not present).
lighting_node1 = chaid_dict['LIGHTING_DESCRIPTION']['node1']
lighting_node1.remove('<missing>')

In [20]:
# Map every level in CHAID node k (k = 1..3) to the label 'lighting group k'.
lighting_dict = {}
for node_no in range(1, 4):
    node_levels = chaid_dict['LIGHTING_DESCRIPTION']['node{}'.format(node_no)]
    lighting_dict.update(dict.fromkeys(node_levels, 'lighting group {}'.format(node_no)))

for frame in (epc_train, epc_test):
    frame['lighting_description'] = frame['LIGHTING_DESCRIPTION'].replace(lighting_dict)

### Main heating controls

In [21]:
# Take the '<missing>' placeholder out of the first MAIN_HEATING_CONTROLS node
# (list.remove raises ValueError if it is not present).
heating_controls_node1 = chaid_dict['MAIN_HEATING_CONTROLS']['node1']
heating_controls_node1.remove('<missing>')

In [22]:
# Map every level in CHAID node k (k = 1..5) to 'main heating controls group k'.
main_heat_dict = {}
for node_no in range(1, 6):
    node_levels = chaid_dict['MAIN_HEATING_CONTROLS']['node{}'.format(node_no)]
    main_heat_dict.update(
        dict.fromkeys(node_levels, 'main heating controls group {}'.format(node_no)))

for frame in (epc_train, epc_test):
    frame['mainheat_controls'] = frame['MAIN_HEATING_CONTROLS'].replace(main_heat_dict)

In [23]:
# Hand-assigned groups for heating-control descriptions. The strings are matched
# exactly, so they are kept verbatim.
main_heat_dict_extra = {
    'Charging system linked to use of communit heating, TRVs': 'main heating controls group 3',
    "Thermostat ystafell yn unig": 'main heating controls group 2',
    "Rhaglennydd ac o leiaf ddau thermostat ystafell": 'main heating controls group 4',
    'Rheoli gwefr drydanol yn awtomatig': 'main heating controls group 4',
    'Programmer + appliance thermostats': 'main heating controls group 4',
    'Rhaglennydd a thermostatau ar y cyfarpar': 'main heating controls group 4',
    'Programmer + TRVs + boiler energy manager': 'main heating controls group 4',
    'Programmer + TRVs + flow switch': 'main heating controls group 4',
    'Programmer + room thermostats': 'main heating controls group 4',
    'Charging system linked to use of communit heating, programmer and TRVs': 'main heating controls group 1',
    'Programmer and delayed start thermostat': 'main heating controls group 1',
}
for frame in (epc_train, epc_test):
    frame['mainheat_controls'] = frame['mainheat_controls'].replace(main_heat_dict_extra)

### Property type

In [24]:
# Bungalows and park homes share one label; other property types are left unchanged.
prop_type_dict = {
    'Bungalow': 'one storey building',
    'Park home': 'one storey building',
}
for frame in (epc_train, epc_test):
    frame['property_type'] = frame['PROPERTY_TYPE'].replace(prop_type_dict)

### Roof description

In [25]:
# Take the '<missing>' placeholder out of the first ROOF_DESCRIPTION node
# (list.remove raises ValueError if it is not present).
roof_node1 = chaid_dict['ROOF_DESCRIPTION']['node1']
roof_node1.remove('<missing>')

In [26]:
# Map every level in CHAID node k (k = 1..6) to the label 'roof group k'.
roof_dict = {}
for node_no in range(1, 7):
    node_levels = chaid_dict['ROOF_DESCRIPTION']['node{}'.format(node_no)]
    roof_dict.update(dict.fromkeys(node_levels, 'roof group {}'.format(node_no)))

for frame in (epc_train, epc_test):
    frame['roof_description'] = frame['ROOF_DESCRIPTION'].replace(roof_dict)

In [27]:
# Pool every roof description that has not yet been given a 'roof group' label - in
# either train or test - into 'roof group 7'.
# NOTE(review): missing values also fail the 'roof group' test and so end up as keys
# of this mapping - confirm that relabelling them is intended.
roof_leftover_dict = {}
for frame in (epc_train, epc_test):
    for level in set(frame['roof_description']):
        if 'roof group' not in str(level):
            roof_leftover_dict[level] = 'roof group 7'

for frame in (epc_train, epc_test):
    frame['roof_description'] = frame['roof_description'].replace(roof_leftover_dict)

### Transaction type

In [28]:
# Take the '<missing>' placeholder out of the first TRANSACTION_TYPE node
# (list.remove raises ValueError if it is not present).
transaction_node1 = chaid_dict['TRANSACTION_TYPE']['node1']
transaction_node1.remove('<missing>')

In [29]:
# The four CHAID nodes are folded into two labels (nodes 1 and 3 share one, nodes 2
# and 4 the other). node5 is not mapped, and 'unknown' becomes a missing value.
trans_node_labels = (
    ('node1', 'private rental and sale'),
    ('node2', 'social rental and new build'),
    ('node3', 'private rental and sale'),
    ('node4', 'social rental and new build'),
)
trans_dict = {}
for node_name, label in trans_node_labels:
    trans_dict.update(dict.fromkeys(chaid_dict['TRANSACTION_TYPE'][node_name], label))
trans_dict['unknown'] = np.nan

for frame in (epc_train, epc_test):
    frame['transaction_type'] = frame['TRANSACTION_TYPE'].replace(trans_dict)


### Walls description

In [30]:
# Take the '<missing>' placeholder out of the first WALLS_DESCRIPTION node
# (list.remove raises ValueError if it is not present).
walls_node1 = chaid_dict['WALLS_DESCRIPTION']['node1']
walls_node1.remove('<missing>')

In [31]:
# Map every level in CHAID node k (k = 1..5) to the label 'walls group k'.
walls_dict = {}
for node_no in range(1, 6):
    node_levels = chaid_dict['WALLS_DESCRIPTION']['node{}'.format(node_no)]
    walls_dict.update(dict.fromkeys(node_levels, 'walls group {}'.format(node_no)))

for frame in (epc_train, epc_test):
    frame['walls_description'] = frame['WALLS_DESCRIPTION'].replace(walls_dict)

In [32]:
# Pool every wall description that has not yet been given a 'walls group' label - in
# either train or test - into 'walls group 6'.
# NOTE(review): missing values also fail the 'walls group' test and so end up as keys
# of this mapping - confirm that relabelling them is intended.
walls_leftover_dict = {}
for frame in (epc_train, epc_test):
    for level in set(frame['walls_description']):
        if 'walls group' not in str(level):
            walls_leftover_dict[level] = 'walls group 6'

for frame in (epc_train, epc_test):
    frame['walls_description'] = frame['walls_description'].replace(walls_leftover_dict)

### Windows Description

In [33]:
# Map every level in CHAID node k (k = 1..3) to the label 'window group k'.
windows_dict = {}
for node_no in range(1, 4):
    node_levels = chaid_dict['WINDOWS_DESCRIPTION']['node{}'.format(node_no)]
    windows_dict.update(dict.fromkeys(node_levels, 'window group {}'.format(node_no)))

for frame in (epc_train, epc_test):
    frame['window_description'] = frame['WINDOWS_DESCRIPTION'].replace(windows_dict)

In [34]:
# One extra description is assigned by hand to 'window group 3'.
for frame in (epc_train, epc_test):
    frame['window_description'] = frame['window_description'].replace(
        'multiple glazing throughout double glazing', 'window group 3')

### Region

In [35]:
# Classify each region as 'rural' or 'suburban'; any region not listed is carried
# over to 'locality' unchanged.
rural_regions = [
    'Blaenau Gwent','Neath Port Talbot','Pembrokeshire','Rhondda Cynon Taf','Caerphilly',
    'Flintshire','Carmarthenshire','Powys','Conwy','Ceredigion',
    # The list previously contained only the misspelling 'Debighshire', which can never
    # match the correctly spelled region. The old key is kept in case the upstream
    # data carries the same typo.
    'Denbighshire','Debighshire',
    'Gwynedd','Isle of Anglesey',
]
suburban_regions = [
    'Bridgend','Monmouthshire','Wrexham','Merthyr Tydfil','Vale of Glamorgan','Cardiff',
    'Torfaen','Newport','Swansea',
]
region_dict = dict.fromkeys(rural_regions, 'rural')
region_dict.update(dict.fromkeys(suburban_regions, 'suburban'))

for frame in (epc_train, epc_test):
    frame['locality'] = frame['region'].replace(region_dict)

## Binning Numeric Fields

In [36]:
def numberic_bins(var,bin_boundaries,bin_labels):
    """Bin a numeric series into labelled intervals.

    Thin wrapper around ``pd.cut``: ``bin_boundaries`` is passed as the bin edges
    and ``bin_labels`` as the names of the resulting bins. Whatever ``pd.cut``
    produces is returned unchanged.
    """
    return pd.cut(var, bins=bin_boundaries, labels=bin_labels)

### Extension count

In [37]:
# Bin edges for EXTENSION_COUNT: (-1, 0] -> '0', (0, 1] -> '1', (1, inf] -> '2+'.
# The top edge is open-ended rather than the training-set maximum, so that larger
# values in the test set still land in '2+' instead of becoming NaN, and so that the
# edges stay strictly increasing whatever the training maximum is.
extension_bins = [-1, 0, 1, np.inf]
extension_labels = ['0','1','2+']

In [38]:
# Bin EXTENSION_COUNT into the 'extension' feature, using the same edges for both frames.
for frame in (epc_train, epc_test):
    frame['extension'] = numberic_bins(frame['EXTENSION_COUNT'], extension_bins, extension_labels)

### Floor height

In [39]:
# Bin edges for FLOOR_HEIGHT. The narrow (2.4999, 2.5] bin gives the value 2.5 its own
# label. The top edge is open-ended rather than the training-set maximum, so that
# larger values in the test set still land in '2.7+' instead of becoming NaN.
floor_bins = [0, 2.3, 2.4, 2.4999, 2.5, 2.7, np.inf]
floor_labels = ['0-2.3','2.3-2.4','2.4-2.5','2.5','2.5-2.7','2.7+']

In [40]:
# Bin FLOOR_HEIGHT into the 'floor_height' feature, using the same edges for both frames.
for frame in (epc_train, epc_test):
    frame['floor_height'] = numberic_bins(frame['FLOOR_HEIGHT'], floor_bins, floor_labels)

### Number of habitable rooms

In [41]:
# Bin edges for NUMBER_HABITABLE_ROOMS: one bin per count from 1 to 5, then '6+'.
# The top edge is open-ended rather than the training-set maximum, so that larger
# values in the test set still land in '6+' instead of becoming NaN, and so that the
# edges stay strictly increasing whatever the training maximum is.
room_bins = [0, 1, 2, 3, 4, 5, np.inf]
room_labels = ['1','2','3','4','5','6+']

In [42]:
# Bin NUMBER_HABITABLE_ROOMS into 'habitable_rooms', using the same edges for both frames.
for frame in (epc_train, epc_test):
    frame['habitable_rooms'] = numberic_bins(frame['NUMBER_HABITABLE_ROOMS'], room_bins, room_labels)

### Number of open fireplaces

In [43]:
# Bin edges for NUMBER_OPEN_FIREPLACES: (-1, 0] -> '0', (0, 1] -> '1', (1, inf] -> '2+'.
# The top edge is open-ended rather than the training-set maximum, so that larger
# values in the test set still land in '2+' instead of becoming NaN, and so that the
# edges stay strictly increasing whatever the training maximum is.
fire_bins = [-1, 0, 1, np.inf]
fire_labels = ['0','1','2+']

In [44]:
# Bin NUMBER_OPEN_FIREPLACES into 'open_fireplaces', using the same edges for both frames.
for frame in (epc_train, epc_test):
    frame['open_fireplaces'] = numberic_bins(frame['NUMBER_OPEN_FIREPLACES'], fire_bins, fire_labels)

## Dropping fields

In [45]:
# Columns that correlate with each other or are "leading"
# (presumably: they give away the target - confirm).
correlated_variables = [
    'CO2_EMISS_CURR_PER_FLOOR_AREA',
    'CO2_EMISSIONS_CURRENT',
    'ENERGY_CONSUMPTION_CURRENT',
    'HEATING_COST_CURRENT',
    'HOT_WATER_COST_CURRENT',
    'HOT_WATER_ENERGY_EFF',
    'HOT_WATER_ENV_EFF',
    'LIGHTING_COST_CURRENT',
    'LIGHTING_ENERGY_EFF',
    'LIGHTING_ENV_EFF',
    'LMK_KEY',
    'LOW_ENERGY_LIGHTING',
    'MAIN_FUEL',
    'MAINHEAT_ENERGY_EFF',
    'MAINHEAT_ENV_EFF',
    'MAINHEATC_ENERGY_EFF',
    'MAINHEATC_ENV_EFF',
    'MAINHEATCONT_DESCRIPTION',
    'MECHANICAL_VENTILATION',
    'MULTI_GLAZE_PROPORTION',
    'NUMBER_HEATED_ROOMS',
    'POSTCODE',
    'ROOF_ENERGY_EFF',
    'ROOF_ENV_EFF',
    'SECONDHEAT_DESCRIPTION',
    'WALLS_ENERGY_EFF',
    'WALLS_ENV_EFF',
    'WINDOWS_ENERGY_EFF',
    'WINDOWS_ENV_EFF',
]

# Original columns that have been replaced by grouped or binned features.
replace_features = [
    'region',
    'CURRENT_ENERGY_RATING',
    'PROPERTY_TYPE',
    'BUILT_FORM',
    'INSPECTION_DATE',
    'TRANSACTION_TYPE',
    'ENERGY_TARIFF',
    'FLOOR_LEVEL',
    'GLAZED_TYPE',
    'EXTENSION_COUNT',
    'NUMBER_HABITABLE_ROOMS',
    'NUMBER_OPEN_FIREPLACES',
    'HOTWATER_DESCRIPTION',
    'FLOOR_DESCRIPTION',
    'MAIN_HEATING_CONTROLS',
    'WINDOWS_DESCRIPTION',
    'WALLS_DESCRIPTION',
    'ROOF_DESCRIPTION',
    'LIGHTING_DESCRIPTION',
    'FLOOR_HEIGHT',
]

# Other columns that are not needed in the output.
fields_to_drop = [
    'floors_average_thermal_transmittance',
    'low_energy_lighting_perc',
    'LODGEMENT_DATE',
    'roof_average_thermal_transmittance',
    'walls_average_thermal_transmittance',
]

In [46]:
# Remove the three column lists from each frame in place, train first and then test.
# drop raises KeyError if a listed column is absent.
for frame in (epc_train, epc_test):
    for column_list in (correlated_variables, replace_features, fields_to_drop):
        frame.drop(column_list, axis=1, inplace=True)

## Exporting data

In [47]:
# Write both feature sets to the processing directory, without the index column.
for frame, out_fname in ((epc_train, epc_train_data_fname), (epc_test, epc_test_data_fname)):
    frame.to_csv(os.path.join(processing_path, out_fname), index=False)