# Categorical cleanup

Within the feature set, many of the categorical fields contain a large number of unique values. For example, “roof description” alone has over 500 unique values. To reduce the number of distinct values, the following steps are applied:

* Welsh descriptions are translated to English; for example  
    * ‘To gwellt, gydag inswleiddio ychwanegol’ to ‘Thatched, with additional insulation’
* bilingual sentences are reduced to English only; for example  
    * ‘High performance glazing|Ffenestri perfformiad uchel’ to ‘High performance glazing’
* the unit for average thermal transmittance is standardised to ‘W/m²K’
* the value of average thermal transmittance is rounded to 1 decimal place
* special characters and unnecessary spaces are removed
* spellings are corrected; for example  
    * ‘eneregy’ to ‘energy’
* the word ‘assumed’ is removed; for example  
    * ‘Solid no insulation (assumed)’ to ‘Solid no insulation’

In [1]:
import numpy as np
import pandas as pd
import datetime
import os
import glob
import json
import re

In [2]:
# Load the pipeline settings (working directory and input/output file names)
# from config.json.
# NOTE(review): the [:-7] slice drops the last seven characters of the parent
# directory's absolute path - presumably to reach the folder that holds
# config.json; confirm against the repository layout.
config_path = os.path.abspath('..')[:-7]

with open(config_path + '/config.json', 'r') as config_file:
    config = json.load(config_file)

# every setting used below lives in the DEFAULT section
defaults = config['DEFAULT']
processing_path = defaults['processing_path']
epc_train_fname = defaults['epc_train_fname']
epc_test_fname = defaults['epc_test_fname']
epc_train_clean_fname = defaults['epc_train_clean_fname']
epc_test_clean_fname = defaults['epc_test_clean_fname']

In [3]:
dtype_dict = {'INSPECTION_DATE':'str'}

# both files are read with exactly the same parsing options
read_options = dict(header=0,
                    delimiter=',',
                    dtype=dtype_dict,
                    parse_dates=['INSPECTION_DATE'])

epc_train = pd.read_csv(os.path.join(processing_path, epc_train_fname), **read_options)

epc_test = pd.read_csv(os.path.join(processing_path, epc_test_fname), **read_options)

In [10]:
# frequency of each BUILT_FORM category in the test set
epc_test['BUILT_FORM'].value_counts()

Semi-Detached           48479
Detached                41609
Mid-Terrace             40361
End-Terrace             19692
Enclosed End-Terrace     1250
Enclosed Mid-Terrace     1249
Name: BUILT_FORM, dtype: int64

### Remove bilingual sentences

In [None]:
descriptive_columns = ['FLOOR_DESCRIPTION','LIGHTING_DESCRIPTION','ROOF_DESCRIPTION','WALLS_DESCRIPTION',
                       'WINDOWS_DESCRIPTION','HOTWATER_DESCRIPTION','MAIN_HEATING_CONTROLS']

# Bilingual entries look like 'English text|Welsh text': keep only the part
# before the first '|'. The pattern is a regular expression, so regex=True is
# passed explicitly - since pandas 2.0 str.replace treats patterns as literal
# text by default and this step would otherwise silently do nothing.
for col in descriptive_columns:
    epc_train[col] = epc_train[col].str.replace(r"\|(.*)", "", regex=True)
    epc_test[col] = epc_test[col].str.replace(r"\|(.*)", "", regex=True)

### Cleaning up units for average thermal transmittance

In [None]:
def thermal_cleanup(df):
    
    ''' 
    standardising the wording in the variables containing average thermal transmittance. Specifically, 
    collapsing double spaces, removing =, translating the welsh version, and standardising units.
    The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing variables called 'FLOOR_DESCRIPTION','WALLS_DESCRIPTION','ROOF_DESCRIPTION'
    Returns a dataframe 
    '''
    
    # (pattern, replacement, is_regex) applied in this order to every column.
    # The regex flag is always explicit because the default of str.replace
    # changed from regex=True to regex=False in pandas 2.0.
    rules = [
        ('  ', ' ', False),
        # a bare integer 1 is rewritten with decimals so that it reads like the other values
        ('Average thermal transmittance 1 ', 'Average thermal transmittance 1.00 ', False),
        ('Average thermal transmittance =', 'Average thermal transmittance', False),
        ('Trawsyriannedd thermol cyfartalog', 'Average thermal transmittance', False),
        # standardising the unit used: anything between a 'W' and the next 'K'
        (r"W(.*?)K", 'W/m²K', True),
    ]

    for c in ['FLOOR_DESCRIPTION','WALLS_DESCRIPTION','ROOF_DESCRIPTION']:
        cleaned = df[c]
        for pattern, replacement, is_regex in rules:
            cleaned = cleaned.str.replace(pattern, replacement, regex=is_regex)
        df[c] = cleaned
        
    return df

In [None]:
# standardise the thermal transmittance wording in the training set, then the test set
epc_train, epc_test = (thermal_cleanup(frame) for frame in (epc_train, epc_test))

### Floor description

In [None]:
def thermal_floor(df):
    
    ''' 
    rounds the average thermal transmittance figure which is within a str field to 1 decimal place.
    Only descriptions containing 'Average' are parsed and rewritten; for every other row the
    description is left untouched and 'floors_average_thermal_transmittance' is NaN.
    An 'Average' description in which no number can be found is also left untouched.
    The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'FLOOR_DESCRIPTION'
    Returns a dataframe with an extra float column 'floors_average_thermal_transmittance'
    '''
    
    descriptions = df['FLOOR_DESCRIPTION']
    is_thermal = descriptions.str.contains('Average', regex=False, na=False)

    # first number in the text; the decimal point is escaped so that text such
    # as '5mm' can no longer be captured as '5m' and break the float conversion
    values = (descriptions.where(is_thermal)
                          .str.extract(r'(\d+(?:\.\d*)?)', expand=False)
                          .astype(float)
                          .round(1))
    df['floors_average_thermal_transmittance'] = values

    # rewrite only the rows for which a value was actually found
    relabel = is_thermal & values.notna()
    labels = values.map(lambda v: 'average thermal transmittance %.1f W/m²K' % v, na_action='ignore')
    df['FLOOR_DESCRIPTION'] = descriptions.mask(relabel, labels)
    
    return df

In [None]:
# round the floor thermal transmittance in the training set, then the test set
epc_train, epc_test = (thermal_floor(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct FLOOR_DESCRIPTION values in the training set at this point
len(set(epc_train['FLOOR_DESCRIPTION']))

In [None]:
def floor_cleanup(df):
    
    ''' 
    cleans-up the FLOOR_DESCRIPTION feature from the EPC dataset: lower-cases the text,
    translates welsh phrases, removes '(assumed)', standardises wording and strips
    stray punctuation. The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'FLOOR_DESCRIPTION'
    Returns a dataframe 
    '''
    
    # (pattern, replacement, is_regex) applied strictly in this order.
    # The regex flag is always explicit because the default of str.replace
    # changed from regex=True to regex=False in pandas 2.0.
    rules = [
        # translating welsh sentences
        ("anheddiad arall islaw", 'another dwelling below', False),
        ("(eiddo arall islaw)", 'other premises below', False),
        (r"wedi(.*?)i inswleiddio", "insulated", True),
        ("dim inswleiddio", "no insulation", False),
        # brackets are kept, so '(rhagdybiaeth)' becomes '(assumed)' and is removed below
        ("rhagdybiaeth", "assumed", False),
        ("crog", "suspended", False),
        ("heb ei inswleiddio", "no insulation", False),
        ("i ofod heb ei wresogi", "to unheated space", False),
        ("solet", "solid", False),
        ("inswleiddio cyfyngedig", "limited insulation", False),
        # removing assumed
        ("(assumed)", "", False),
        # standardising language
        ("(another dwelling below)", 'other premises below', False),
        ("(other premises below)", 'other premises below', False),
        # literal full stop only: 'solid.' -> 'solid,' ('solid ' is left alone)
        ("solid.", 'solid,', False),
        ("uninsulated", 'no insulation,', False),
        ("insulation=100mm", '100 mm insulation', False),
        ("insulation=25mm", '25 mm insulation', False),
        ("insulation=75mm", '75 mm insulation', False),
        ("limited insulated", 'limited insulation', False),
        # cleanup: trailing blanks first, so that a trailing comma really is last
        (r"[ \t]+$", "", True),
        (r",$", "", True),
        (r"^, ", "", True),
        ("?", "", False),
    ]

    cleaned = df['FLOOR_DESCRIPTION'].str.lower()
    for pattern, replacement, is_regex in rules:
        cleaned = cleaned.str.replace(pattern, replacement, regex=is_regex)
    df['FLOOR_DESCRIPTION'] = cleaned
    
    return df

In [None]:
# clean the floor descriptions in the training set, then the test set
epc_train, epc_test = (floor_cleanup(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct FLOOR_DESCRIPTION values in the training set at this point
len(set(epc_train['FLOOR_DESCRIPTION']))

### Lighting description

In [None]:
# lighting and second heat descriptions have some entries the wrong way around
# Some records hold the lighting text in SECONDHEAT_DESCRIPTION. A record is
# treated as swapped when its SECONDHEAT_DESCRIPTION contains the word
# 'lighting'; the corrected values are written to temporary '...1' columns.
for frame in (epc_train, epc_test):
    frame['SECONDHEAT_DESCRIPTION1'] = frame.apply(
        lambda rec: rec['LIGHTING_DESCRIPTION']
        if 'lighting' in str(rec['SECONDHEAT_DESCRIPTION'])
        else rec['SECONDHEAT_DESCRIPTION'],
        axis=1)
    frame['LIGHTING_DESCRIPTION1'] = frame.apply(
        lambda rec: rec['SECONDHEAT_DESCRIPTION']
        if 'lighting' in str(rec['SECONDHEAT_DESCRIPTION'])
        else rec['LIGHTING_DESCRIPTION'],
        axis=1)

In [None]:
# replace the original columns with the corrected '...1' versions
corrected_names = {'SECONDHEAT_DESCRIPTION1':'SECONDHEAT_DESCRIPTION',
                   'LIGHTING_DESCRIPTION1':'LIGHTING_DESCRIPTION'}
for frame in (epc_train, epc_test):
    frame.drop(columns=['SECONDHEAT_DESCRIPTION','LIGHTING_DESCRIPTION'], axis=1, inplace=True)
    frame.rename(columns=corrected_names, inplace=True)

In [None]:
# number of distinct LIGHTING_DESCRIPTION values in the training set at this point
len(set(epc_train['LIGHTING_DESCRIPTION']))

In [None]:
def lighting_cleanup(df):
    
    ''' 
    cleans-up the LIGHTING_DESCRIPTION feature from the EPC dataset: lower-cases the text,
    translates welsh phrases, drops decimal places from percentages, fixes a spelling error
    and merges the different ways of saying that all fixed outlets are low energy.
    The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'LIGHTING_DESCRIPTION'
    Returns a dataframe 
    '''
    
    # (pattern, replacement, is_regex) applied strictly in this order.
    # The regex flag is always explicit because the default of str.replace
    # changed from regex=True to regex=False in pandas 2.0.
    rules = [
        # welsh for 'in all fixed outlets'; must run before the generic
        # 'o.r mannau gosod' rule below ('.' stands for the apostrophe)
        (r'goleuadau ynni-isel ym mhob un o.r mannau gosod', 'low energy lighting in all fixed outlets', True),
        # translating welsh sentences
        (r"\|goleuadau(.*)", '', True),
        (r"o.r mannau gosod", 'of fixed outlets', True),
        ('% fixed', '% of fixed', False),
        ('goleuadau ynni-isel mewn', 'low energy lighting in', False),
        # drop the decimal part of percentages such as 33.3%
        (r"\.\d*", '', True),
        ("dim goleuadau ynni-isel", 'no low energy lighting', False),
        # clean up
        ("eneregy", 'energy', False),
        # combining ways of saying all lights are low energy. Done last so that
        # translated welsh rows are covered too; the optional 'in ' makes the
        # usual wording '... in 100% of fixed outlets' match.
        (r'low energy lighting (?:in )?1[02]0% of fixed outlets', 'low energy lighting in all fixed outlets', True),
    ]

    cleaned = df['LIGHTING_DESCRIPTION'].str.lower()
    for pattern, replacement, is_regex in rules:
        cleaned = cleaned.str.replace(pattern, replacement, regex=is_regex)
    df['LIGHTING_DESCRIPTION'] = cleaned
    
    return df

In [None]:
# clean the lighting descriptions in the training set, then the test set
epc_train, epc_test = (lighting_cleanup(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct LIGHTING_DESCRIPTION values in the training set at this point
len(set(epc_train['LIGHTING_DESCRIPTION']))

In [None]:
def lighting_perc_cleanup(df):
    
    ''' 
    rounds the low energy lighting percentage figure which is within a str field to the
    nearest 10 and rewrites the description around the rounded figure. Descriptions
    without a '<number>%' are left untouched and get NaN in 'low_energy_lighting_perc'.
    The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'LIGHTING_DESCRIPTION'
    Returns a dataframe with an extra float column 'low_energy_lighting_perc'
    '''
    
    descriptions = df['LIGHTING_DESCRIPTION']

    # at least one digit is required in front of '%', so a stray '%' can no
    # longer produce an empty capture that breaks the float conversion
    percentages = (descriptions.str.extract(r'(\d+)%', expand=False)
                               .astype(float)
                               .round(-1))
    df['low_energy_lighting_perc'] = percentages

    labels = percentages.map(lambda p: 'low energy lighting %d%% of fixed outlets' % p, na_action='ignore')
    df['LIGHTING_DESCRIPTION'] = descriptions.mask(percentages.notna(), labels)

    return df

In [None]:
# round the lighting percentages in the training set, then the test set
epc_train, epc_test = (lighting_perc_cleanup(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct LIGHTING_DESCRIPTION values in the training set at this point
len(set(epc_train['LIGHTING_DESCRIPTION']))

### Roof Description

In [None]:
# number of distinct ROOF_DESCRIPTION values in the training set at this point
len(set(epc_train['ROOF_DESCRIPTION']))

In [None]:
def roof_cleanup(df):
    
    ''' 
    cleans-up the ROOF_DESCRIPTION feature from the EPC dataset: standardises units,
    translates welsh phrases, standardises wording, removes '(assumed)' and strips
    stray punctuation. The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'ROOF_DESCRIPTION'
    Returns a dataframe 
    '''
    
    # (pattern, replacement, is_regex) applied strictly in this order.
    # The regex flag is always explicit because the default of str.replace
    # changed from regex=True to regex=False in pandas 2.0.
    rules = [
        # standardising units
        (" mm", 'mm', False),
        (">= 300mm", ">=300mm", False),
        (">=300mm", "300+mm", False),
        # translating welsh sentences
        ("|(eiddo arall uwchben)", "", False),
        ("Ar oleddf", "Pitched", False),
        ("dim inswleiddio", "no insulation", False),
        (r"wedi(.*?)i inswleiddio", "insulated", True),
        # brackets are kept, so '(rhagdybiaeth)' becomes '(assumed)' and is removed below
        ("rhagdybiaeth", "assumed", False),
        (r"lo inswleiddio yn y llof.*", "loft insulation", True),
        (r"o inswleiddio yn y llof.*", "loft insulation", True),
        ("Ystafell(oedd) to", "Roof room(s)", False),
        (r"wedi(.*?)i hinswleiddio", "insulated", True),
        ("nenfwd", "ceiling", False),
        # the optional final 'u' is consumed, so no stray letter is left behind
        (r"wrth y trawstiau?", "at rafters", True),
        ("inswleiddio cyfyngedig", "limited insulation", False),
        ("To gwellt, gydag inswleiddio ychwanegol", "Thatched, with additional insulation", False),
        # for a roof 'yn wastad' means flat
        ("Yn wastad", "Flat", False),
        # standardising descriptions
        ("Roof room,", "Roof room(s),", False),
        ("annedd arall uwchben", "other premises above", False),
        ("another dwelling above", "other premises above", False),
        ("Dwelling Above", "(other premises above)", False),
        # a bare integer 0 is given a decimal; the look-behind leaves values
        # such as '10 W/m²K' and '0.10 W/m²K' untouched
        (r"(?<![\d.])0 W/m²K", "0.0 W/m²K", True),
        ("  W/m²K", " 0.0 W/m²K", False),
        ("Roof room(s), no insulation(assumed)", "Roof room(s), no insulation (assumed)", False),
        ("Other premises above", "(other premises above)", False),
        ("(assumed)", "", False),
        # cleanup: trailing blanks are stripped before and after the trailing
        # punctuation so that 'x, (assumed)' ends up as 'x'
        (r"[ \t]+$", "", True),
        (r"\.$", "", True),
        (r",$", "", True),
        ("  +", " +", False),
        (r"[ \t]+$", "", True),
        ("mmmm", "mm", False),
        ("Thatchedinsulated", "Thatched, insulated", False),
        ("*** INVALID INPUT Code : 57 ***", "", False),
    ]

    cleaned = df['ROOF_DESCRIPTION']
    for pattern, replacement, is_regex in rules:
        cleaned = cleaned.str.replace(pattern, replacement, regex=is_regex)
    df['ROOF_DESCRIPTION'] = cleaned
    
    return df

In [None]:
# clean the roof descriptions in the training set, then the test set
epc_train, epc_test = (roof_cleanup(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct ROOF_DESCRIPTION values in the training set at this point
len(set(epc_train['ROOF_DESCRIPTION']))

In [None]:
def thermal_roof(df):
    
    ''' 
    rounds the average thermal transmittance figure which is within a str field to 1 decimal place.
    The figure is the number directly in front of ' W/m²K'. Descriptions containing 'Average'
    for which a figure was found are rewritten; every other description is left untouched.
    The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'ROOF_DESCRIPTION'
    Returns a dataframe with an extra float column 'roof_average_thermal_transmittance'
    '''
    
    descriptions = df['ROOF_DESCRIPTION']

    # whole number with an optional decimal part: also accepts '2', '12.5' and '0.'
    values = (descriptions.str.extract(r'(\d+(?:\.\d*)?) W/m²K', expand=False)
                          .astype(float)
                          .round(1))
    df['roof_average_thermal_transmittance'] = values

    # rewrite only 'Average' rows with a parsed figure, so no 'nan' text is produced
    relabel = descriptions.str.contains('Average', regex=False, na=False) & values.notna()
    labels = values.map(lambda v: 'average thermal transmittance %.1f w/m²k' % v, na_action='ignore')
    df['ROOF_DESCRIPTION'] = descriptions.mask(relabel, labels)
    
    return df

In [None]:
# round the roof thermal transmittance in the training set, then the test set
epc_train, epc_test = (thermal_roof(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct ROOF_DESCRIPTION values in the test set at this point
len(set(epc_test['ROOF_DESCRIPTION']))

### Walls description

In [None]:
# number of distinct WALLS_DESCRIPTION values in the training set at this point
len(set(epc_train['WALLS_DESCRIPTION']))

In [None]:
def walls_cleanup(df): 
    
    ''' 
    cleans-up the WALLS_DESCRIPTION feature from the EPC dataset: lower-cases the text,
    removes 'as built' and '(assumed)', translates welsh phrases, standardises wording
    and strips stray punctuation. The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'WALLS_DESCRIPTION'
    Returns a dataframe 
    '''
    
    # (pattern, replacement, is_regex) applied strictly in this order.
    # The regex flag is always explicit because the default of str.replace
    # changed from regex=True to regex=False in pandas 2.0.
    rules = [
        # removing phrases like as built which aren't adding anything
        (r"fel y(.*?)u hadeiladwyd, ", "", True),
        ("as built, ", "", False),
        ("cavity.", "cavity wall,", False),
        ("(rhagdybiaeth)", "(assumed)", False),
        ("(assumed)", "", False),
        # translating welsh sentences
        ("waliau ceudod", "cavity wall", False),
        ("dim inswleiddio", "no insulation", False),
        ("tywodfaen", "sandstone", False),
        (r"ceudod wedi(.*?)i lenwi", "filled cavity", True),
        (r"wedi(.*?)u hinswleiddio", "insulated", True),
        (r"ffr(.*?)m bren", "timber frame", True),
        ("briciau solet", "solid brick", False),
        (r"wedi(.*?)u hadeiladu yn (.*?)l system", "system built", True),
        ("inswleiddio rhannol", "partial insulation", False),
        ("gydag inswleiddio allanol", "with external insulation", False),
        ("gwenithfaen neu risgraig", "granite or whinstone", False),
        ("gydag inswleiddio mewnol", "with internal insulation", False),
        # standardising punctuation
        ("solid brick.", "solid brick,", False),
        # standardising language
        ("granite or whin,", "granite or whinstone,", False),
        ("stone (granite or whin).", "granite or whinstone,", False),
        ("with external insulation", "insulated", False),
        ("with internal insulation", "insulated", False),
        ("with additional insulation", "insulated", False),
        ("with insulation", "insulated", False),
        # clean up
        (r"\.$", "", True),
        ("+ chr(13) +", "+", False),
        ("  +", " +", False),
        (r"[ \t]+$", "", True),
        ("timber frame.", "timber frame,", False),
        ("?", "", False),
        ("system built.", "system built,", False),
    ]

    cleaned = df['WALLS_DESCRIPTION'].str.lower()
    for pattern, replacement, is_regex in rules:
        cleaned = cleaned.str.replace(pattern, replacement, regex=is_regex)
    df['WALLS_DESCRIPTION'] = cleaned
    
    return df

In [None]:
# clean the wall descriptions in the training set, then the test set
epc_train, epc_test = (walls_cleanup(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct WALLS_DESCRIPTION values in the training set at this point
len(set(epc_train['WALLS_DESCRIPTION']))

In [None]:
def thermal_walls(df):
    
    ''' 
    rounds the average thermal transmittance figure which is within a str field to 1 decimal place.
    Only descriptions containing 'average' are parsed and rewritten; for every other row the
    description is left untouched and 'walls_average_thermal_transmittance' is NaN.
    An 'average' description in which no number can be found is also left untouched.
    The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'WALLS_DESCRIPTION'
    Returns a dataframe with an extra float column 'walls_average_thermal_transmittance'
    '''
    
    descriptions = df['WALLS_DESCRIPTION']
    is_thermal = descriptions.str.contains('average', regex=False, na=False)

    # first number in the text, with any number of decimals (including none),
    # so a value such as '0.3' is parsed instead of ending up as 'nan'
    values = (descriptions.where(is_thermal)
                          .str.extract(r'(\d+(?:\.\d*)?)', expand=False)
                          .astype(float)
                          .round(1))
    df['walls_average_thermal_transmittance'] = values

    # rewrite only the rows for which a value was actually found
    relabel = is_thermal & values.notna()
    labels = values.map(lambda v: 'average thermal transmittance %.1f w/m²k' % v, na_action='ignore')
    df['WALLS_DESCRIPTION'] = descriptions.mask(relabel, labels)
    
    return df

In [None]:
# round the wall thermal transmittance in the training set, then the test set
epc_train, epc_test = (thermal_walls(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct WALLS_DESCRIPTION values in the training set at this point
len(set(epc_train['WALLS_DESCRIPTION']))

### windows description

In [None]:
# number of distinct WINDOWS_DESCRIPTION values in the training set at this point
len(set(epc_train['WINDOWS_DESCRIPTION']))

In [None]:
def windows_cleanup(df): 
    
    ''' 
    cleans-up the WINDOWS_DESCRIPTION feature from the EPC dataset: lower-cases the text,
    translates welsh phrases, standardises wording and removes surplus spaces.
    The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'WINDOWS_DESCRIPTION'
    Returns a dataframe 
    '''
    
    # literal (pattern, replacement) pairs applied strictly in this order
    literal_rules = [
        # translating welsh sentences
        ("ffenestri perfformiad uchel", "high performance glazing"),
        ("gwydrau dwbl gan mwyaf", "mostly double glazing"),
        ("rhai gwydrau dwbl", "partial double glazing"),
        ("gwydrau sengl", "single glazing"),
        ("gwydrau dwbl rhannol", "partial double glazing"),
        ("gwydrau dwbl llawn", "fully double glazing"),
        ("gwydrau lluosog ym mhobman", "multiple glazing throughout"),
        ("gwydrau eilaidd llawn", "full secondary glazing"),
        # standardising language
        ("glazed", "glazing"),
        ("fully", "full"),
        # cleanup of descriptions that were run together
        ("single glazingsingle glazing", "single glazing"),
        ("single glazingdouble glazing", "single glazing and double glazing"),
        ("single glazingsecondary glazing", "single glazing and secondary glazing"),
    ]

    cleaned = df['WINDOWS_DESCRIPTION'].str.lower()
    for pattern, replacement in literal_rules:
        cleaned = cleaned.str.replace(pattern, replacement, regex=False)
    # trailing blanks need a regular expression; the flag is explicit because
    # the default of str.replace became regex=False in pandas 2.0
    cleaned = cleaned.str.replace(r"[ \t]+$", "", regex=True)
    cleaned = cleaned.str.replace("  ", " ", regex=False)
    df['WINDOWS_DESCRIPTION'] = cleaned
    
    return df

In [None]:
# clean the window descriptions in the training set, then the test set
epc_train, epc_test = (windows_cleanup(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct WINDOWS_DESCRIPTION values in the training set at this point
len(set(epc_train['WINDOWS_DESCRIPTION']))

### hotwater description

In [None]:
# number of distinct HOTWATER_DESCRIPTION values in the training set at this point
len(set(epc_train['HOTWATER_DESCRIPTION']))

In [None]:
def hotwater_cleanup(df):
    
    ''' 
    cleans-up the HOTWATER_DESCRIPTION feature from the EPC dataset: strips trailing blanks,
    translates welsh phrases, standardises wording and turns the '***SAMPLE***' placeholder
    into a missing value. The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'HOTWATER_DESCRIPTION'
    Returns a dataframe 
    '''
    
    # (pattern, replacement, is_regex) applied strictly in this order.
    # The regex flag is always explicit because the default of str.replace
    # changed from regex=True to regex=False in pandas 2.0.
    rules = [
        # removing spaces at the end
        (r"[ \t]+$", "", True),
        # translating welsh sentences
        (r"O(.*?)r brif system", "From main system", True),
        ("Trochi trydan", "Electric immersion", False),
        ("an-frig", "off-peak", False),
        ("O system eilaidd", "From secondary system", False),
        ("Nwy wrth fwy nag un pwynt", "Gas multipoint", False),
        ("Popty estynedig olew", "Oil range cooker", False),
        ("tarriff safonol", "standard tariff", False),
        ("Dim system ar gael", "No system present", False),
        ("adfer gwres nwyon ffliw", "flue gas heat recovery", False),
        (r"gydag ynni(.*?)r haul", "plus solar", True),
        ("dim thermostat ar y silindr", "no cylinderstat", False),
        ("rhagdybir bod twymwr tanddwr trydan", "electric immersion assumed", False),
        # 'twymwr tanddwr' is an immersion heater (see the rule above), not
        # underfloor heating
        ("Twymwr tanddwr trydan", "Electric immersion", False),
        ("Twymwr tanddwr", "Immersion heater", False),
        # standardising language
        ("cylinder thermostat", "cylinderstat", False),
        ("No system present :", "No system present:", False),
        ("No hot water system present -", "No system present:", False),
        ("From community scheme", "Community scheme", False),
        ("Community heat pump", "Community scheme with CHP", False),
        ("From secondary heater", "From secondary system", False),
        ("SAP05:Hot-Water", "SAP:Hot-Water", False),
        ("community scheme", "Community scheme", False),
        ("plus solar, no cylinderstat", "no cylinderstat, plus solar", False),
        ("From second main heating system", "From secondary system", False),
        ("none", "No system present: electric immersion assumed", False),
        ("plus solar, flue gas heat recovery", "flue gas heat recovery, plus solar", False),
        ("no cylinderstat, no cylinderstat", "no cylinderstat", False),
        # cleanup
        ("No system present?electric immersion assumed", "No system present: electric immersion assumed", False),
    ]

    cleaned = df['HOTWATER_DESCRIPTION']
    for pattern, replacement, is_regex in rules:
        cleaned = cleaned.str.replace(pattern, replacement, regex=is_regex)
    # whole-value match: the placeholder becomes a missing value
    df['HOTWATER_DESCRIPTION'] = cleaned.replace("***SAMPLE***", np.nan)

    return df

In [None]:
# clean the hot water descriptions in the training set, then the test set
epc_train, epc_test = (hotwater_cleanup(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct HOTWATER_DESCRIPTION values in the training set at this point
len(set(epc_train['HOTWATER_DESCRIPTION']))

### main heating controls

In [None]:
# number of distinct MAIN_HEATING_CONTROLS values in the training set at this point
len(set(epc_train['MAIN_HEATING_CONTROLS']))

In [None]:
def heat_control_cleanup(df):
    
    ''' 
    cleans-up the MAIN_HEATING_CONTROLS feature from the EPC dataset: lower-cases the text,
    translates welsh phrases and standardises abbreviations, symbols and word order.
    The dataframe is modified in place and also returned.
    Parameters
      df: a dataframe containing a variable called 'MAIN_HEATING_CONTROLS'
    Returns a dataframe 
    '''
    
    # (pattern, replacement, is_regex) applied strictly in this order.
    # The regex flag is always explicit because the default of str.replace
    # changed from regex=True to regex=False in pandas 2.0.
    rules = [
        # translating welsh sentences ('.' stands for an apostrophe or accented letter)
        (r"rheoli.r t.l . llaw", "manual charge control", True),
        ("rhaglennydd, dim thermostat ystafell", "programmer, no room thermostat", False),
        ("rheolaeth amser a rheolaeth parthau tymheredd", "time and temperature zone control", False),
        ("rhaglennydd a thermostat ystafell", "programmer and room thermostat", False),
        ("rhaglennydd a thermostatau ar y cyfarpar", "programmer and appliance thermostats", False),
        ("rhaglennydd ac o leiaf ddau thermostat ystafell", "programmer and at least two room thermostats", False),
        ("thermostat ystafell yn unig", "room thermostat only", False),
        ("dim rheolaeth thermostatig ar dymheredd yr ystafell", "no thermostatic control of room temperature", False),
        ("rheoli gwefr drydanol yn awtomatig", "automatic charge control", False),
        ("dim rheolaeth amser na rheolaeth thermostatig ar dymheredd yr ystafell", "no time or thermostatic control of room temperature", False),
        ("trvs a falf osgoi", "trvs and bypass", False),
        ("rhaglennydd", "programmer", False),
        ("thermostat ystafell a trvs", "room thermostat and trvs", False),
        # at most three non-blank characters between 't' and 'l', so the match
        # cannot start at an earlier 't' and swallow the text in between
        (r"t\S{0,3}?l un gyfradd", "flat rate charging", True),
        ("thermostatau ar y cyfarpar", "appliance thermostat", False),
        # whole word only, so 'dim' inside another word is not touched
        (r"\bdim\b", "none", True),
        # standardising language
        ("programmer, no thermostat", "programmer, no room thermostat", False),
        ("flat rate charging*", "flat rate charging", False),
        ("+", "and", False),
        ("&", "and", False),
        (r"trv.s", "trvs", True),
        ("thermostats", "thermostat", False),
        ("communit ", "community ", False),
        ("to the use of community heating", "to use of community heating", False),
        (" stat", " thermostat", False),
        ("controls", "control", False),
        ("prog ", "programmer ", False),
        ("program ", "programmer ", False),
        ("programmerand", "programmer and", False),
        ("thermostatic", "thermostat", False),
        ("flat rate charging, programmer no room thermostat", "flat rate charging, programmer, no room thermostat", False),
        (" 2 ", " two ", False),
        ("roomstat", "room thermostat", False),
        ("programmer and room thermostat and trvs", "programmer, room thermostat and trvs", False),
        ("programmer and trvs and boiler energy manager", "programmer, trvs and boiler energy manager", False),
        ("programmer and trvs and bypass", "programmer, trvs and bypass", False),
        ("programmer and trvs and flow switch", "programmer, trvs and flow switch", False),
        (r"temp+$", "temperature", True),
        ("no thermostat control of room temperature", "no thermostat control", False),
        ("appliance thermostat and programmer", "programmer and appliance thermostat", False),
        ("delayed start thermostat and programmer and trvs", "delayed start thermostat, programmer and trvs", False),
        ("no time or thermostat control of temperature", "no time or thermostat control of room temperature", False),
        ("programmer?and", "programmer and", False),
    ]

    cleaned = df['MAIN_HEATING_CONTROLS'].str.lower()
    for pattern, replacement, is_regex in rules:
        cleaned = cleaned.str.replace(pattern, replacement, regex=is_regex)
    df['MAIN_HEATING_CONTROLS'] = cleaned
    
    return df

In [None]:
# clean the main heating controls in the training set, then the test set
epc_train, epc_test = (heat_control_cleanup(frame) for frame in (epc_train, epc_test))

In [None]:
# number of distinct MAIN_HEATING_CONTROLS values in the training set at this point
len(set(epc_train['MAIN_HEATING_CONTROLS']))

In [None]:
# strip the legacy note from TRANSACTION_TYPE and treat the 'INVALID!'
# GLAZED_TYPE marker as a missing value, in both data sets
legacy_note = " - this is for backwards compatibility only and should not be used"
for frame in (epc_train, epc_test):
    frame['TRANSACTION_TYPE'] = frame['TRANSACTION_TYPE'].str.replace(legacy_note, "")
    frame['GLAZED_TYPE'] = frame['GLAZED_TYPE'].replace('INVALID!', np.nan)

### exporting data

In [None]:
# Show the training rows that contain at least one missing value. Indexing
# with the full boolean frame (epc_train[epc_train.isnull()]) keeps every row
# and blanks out every non-null cell, so it only ever displays NaN.
epc_train[epc_train.isnull().any(axis=1)]

In [None]:
# write the cleaned training and test sets to the processing folder, without the index
for frame, clean_fname in ((epc_train, epc_train_clean_fname),
                           (epc_test, epc_test_clean_fname)):
    frame.to_csv(os.path.join(processing_path, clean_fname), index=False)