In [1]:
# imports
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels
import statsmodels.api as sm

# Read data

In [7]:
# read data into a DataFrame
df = pd.read_csv("Data/need_2021_anon_dataset_50k.csv")

print(df.shape)

(50000, 75)


In [8]:
df.head()

Unnamed: 0,PROP_TYPE,PROP_AGE_BAND,FLOOR_AREA_BAND,CONSERVATORY_FLAG,COUNCIL_TAX_BAND,IMD_BAND_ENG,IMD_BAND_WALES,REGION,LI_FLAG,LI_DATE,...,ElecValFlag2014,ElecValFlag2013,ElecValFlag2012,ElecValFlag2011,ElecValFlag2010,ElecValFlag2009,ElecValFlag2008,ElecValFlag2007,ElecValFlag2006,ElecValFlag2005
0,Mid terrace,2,3,,B,2.0,,E12000005,0,,...,V,E,E,V,E,V,V,V,V,V
1,Flat,3,2,,A,1.0,,E12000008,0,,...,V,V,V,V,V,V,V,V,V,V
2,Mid terrace,1,3,,A,2.0,,E12000001,0,,...,V,V,V,V,V,V,V,V,V,V
3,Flat,4,1,0.0,A,3.0,,E12000004,0,,...,O,O,O,O,O,O,O,O,O,O
4,Semi detached,2,2,,D,5.0,,E12000008,0,,...,V,V,V,V,V,V,V,V,V,V


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 75 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PROP_TYPE          50000 non-null  object 
 1   PROP_AGE_BAND      50000 non-null  int64  
 2   FLOOR_AREA_BAND    50000 non-null  int64  
 3   CONSERVATORY_FLAG  7418 non-null   float64
 4   COUNCIL_TAX_BAND   50000 non-null  object 
 5   IMD_BAND_ENG       47410 non-null  float64
 6   IMD_BAND_WALES     2590 non-null   float64
 7   REGION             50000 non-null  object 
 8   LI_FLAG            50000 non-null  int64  
 9   LI_DATE            8568 non-null   float64
 10  CWI_FLAG           50000 non-null  int64  
 11  CWI_DATE           8412 non-null   float64
 12  PV_FLAG            50000 non-null  int64  
 13  PV_DATE            561 non-null    float64
 14  MAIN_HEAT_FUEL     50000 non-null  int64  
 15  Gcons2019          40329 non-null  float64
 16  Gcons2018          403

# Wrangle data
## Select rows

In [10]:
# Select rows where the 'IMD_BAND_ENG' column is not NaN
df = df[df['IMD_BAND_ENG'].notna()]

print(df.shape)

(47410, 75)


## Select columns

In [16]:
columns_of_interest = [
    'PROP_TYPE', 'PROP_AGE_BAND', 'FLOOR_AREA_BAND', 'COUNCIL_TAX_BAND',
    'IMD_BAND_ENG', 'REGION', 'LI_FLAG', 'CWI_FLAG', 'PV_FLAG', 'MAIN_HEAT_FUEL',
    'Gcons2019', 'Gcons2018', 'Gcons2017', 'Gcons2016', 'Gcons2015', 'Gcons2014', 'Gcons2013', 'Gcons2012', 'Gcons2011', 'Gcons2010', 'Gcons2009', 'Gcons2008', 'Gcons2007', 'Gcons2006', 'Gcons2005',
    'Econs2019', 'Econs2018', 'Econs2017', 'Econs2016', 'Econs2015', 'Econs2014', 'Econs2013', 'Econs2012', 'Econs2011', 'Econs2010', 'Econs2009', 'Econs2008', 'Econs2007', 'Econs2006', 'Econs2005'
]

df_filtered = df[columns_of_interest].copy()
print(df_filtered.shape)

(47410, 40)


## Handle outliers

In [18]:
numerical_cols = [f'Gcons{i}' for i in range(2019, 2004, -1)] + [f'Econs{i}' for i in range(2019, 2004, -1)]
for col in numerical_cols:
    Q1 = df_filtered[col].quantile(0.25)
    Q3 = df_filtered[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df_filtered.loc[df_filtered[col] < lower_bound, col] = lower_bound
    df_filtered.loc[df_filtered[col] > upper_bound, col] = upper_bound

## Define the data type

In [19]:
categorical_cols = ['PROP_TYPE', 'COUNCIL_TAX_BAND', 'REGION']
other_cols = ['PROP_AGE_BAND', 'FLOOR_AREA_BAND', 'IMD_BAND_ENG', 'LI_FLAG', 'CWI_FLAG', 'PV_FLAG', 'MAIN_HEAT_FUEL']

## Converters and zoomers

In [20]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

## Applicating line converter

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('other', 'passthrough', other_cols)
    ])

## Applicating conversion

In [22]:
df_processed = preprocessor.fit_transform(df_filtered)

In [24]:
# Displays the shape of the processed data
df_processed_shape = df_processed.shape
print (df_processed_shape)

(47410, 60)


In [31]:
# Convert the converted Numpy array back to Pandas DataFrame
# Because the unique thermal encoding creates additional columns, the column names need to be redefined

# Gets the class column name after the unique heat encoding
categories = [f"{col}_{cat}" for col, categories in zip(categorical_cols, preprocessor.named_transformers_['cat'].named_steps['onehot'].categories_) for cat in categories]

# The numeric column and other columns remain unchanged
new_columns = numerical_cols + categories + other_cols

# Create DataFrame
df_processed = pd.DataFrame(df_processed, columns=new_columns)

# Look at the first few lines
print(df_processed.head)

<bound method NDFrame.head of        Gcons2019  Gcons2018  Gcons2017  Gcons2016  Gcons2015  Gcons2014  \
0       0.016052  -0.353765   0.047426  -1.108477  -1.108466  -0.131642   
1      -0.357304  -1.287228  -0.780180  -0.757788  -0.446439  -0.115052   
2      -0.276140  -0.222753  -0.118095   0.227484  -0.976061   0.133803   
3      -1.152715  -1.139839  -1.177431  -0.139905  -0.131977  -0.131642   
4      -0.146277  -0.140870  -0.134648  -0.139905  -0.131977  -0.131642   
...          ...        ...        ...        ...        ...        ...   
47405  -0.422236  -0.386518  -0.267065  -0.223403   0.215588   0.117212   
47406   0.129682  -0.173623  -0.068439  -0.707689  -0.711250  -1.707723   
47407  -0.146277  -0.140870  -1.690547  -1.659561  -1.555335  -1.575000   
47408   1.006257   0.661580   0.626750   0.611573   0.761760   0.697873   
47409   1.606873   1.431277   0.808823   0.778568   0.761760   1.394667   

       Gcons2013  Gcons2012  Gcons2011  Gcons2010  ...  REGION_E12000