In [1]:
# Optional environment setup, intentionally left commented out.
# Uncommenting these lines clones the course repository, changes into it and
# installs its requirements plus the package itself
# (presumably only needed on a hosted runtime such as Colab — TODO confirm).
# NOTE(review): if re-enabled, consider `%pip` instead of `!pip` so the
# packages are installed into the running kernel's environment.
#!git clone https://github.com/darioka/impactdeal-2022.git
#%cd impactdeal-2022
#!pip install -r requirements.txt
#!pip install .

# Boosting - EPC Rating

In this notebook you will fit a `HistGradientBoostingClassifier` on the EPC dataset.

The first part of the pipeline, devoted to loading and cleaning the data, has already been written for you.
The class `impactdeal.cleaning.Cleaner` applies the same steps we saw in previous notebooks. Take a look at its docstring if you are curious.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from impactdeal.config.column_names import TARGET, NUMERICAL, CATEGORICAL
from impactdeal.cleaning import Cleaner

# Load the compressed EPC extract, parsing INSPECTION_DATE as a datetime so it
# can be used for chronological ordering below.
# NOTE(review): `infer_datetime_format` is deprecated since pandas 2.0 (it has
# no effect there); kept as-is for older pandas — confirm the pinned version.
df = pd.read_csv(
    "../data/known_epc_ratings.csv.gz",
    parse_dates=["INSPECTION_DATE"],
    infer_datetime_format=True,
)

# One row per building: order by building and inspection date, then keep the
# last row of each BUILDING_REFERENCE_NUMBER (i.e. its latest inspection).
df = df.sort_values(["BUILDING_REFERENCE_NUMBER", "INSPECTION_DATE"])
df = df.drop_duplicates(subset="BUILDING_REFERENCE_NUMBER", keep="last")

# Detach the target column from the features (`pop` removes it from `df`),
# then make a 70/30 split stratified on the target with a fixed seed.
y = df.pop(TARGET)
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit the cleaner on the training rows only and reuse the fitted cleaner on
# the test rows, so nothing is learned from the test set.
cleaner = Cleaner()
X_train = cleaner.fit_transform(X_train)
X_test = cleaner.transform(X_test)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
# Display the column names stored on the fitted cleaner
# (presumably the features it saw during fit, following the scikit-learn
# `feature_names_in_` convention — confirm in the Cleaner docstring).
cleaner.feature_names_in_

['TOTAL_FLOOR_AREA',
 'MULTI_GLAZE_PROPORTION',
 'EXTENSION_COUNT',
 'NUMBER_HABITABLE_ROOMS',
 'NUMBER_HEATED_ROOMS',
 'LOW_ENERGY_LIGHTING',
 'NUMBER_OPEN_FIREPLACES',
 'WIND_TURBINE_COUNT',
 'FLOOR_HEIGHT',
 'CITY',
 'PROPERTY_TYPE',
 'BUILT_FORM',
 'CONSTRUCTION_AGE_BAND',
 'ENERGY_TARIFF',
 'MAINS_GAS_FLAG',
 'FLOOR_LEVEL',
 'FLAT_TOP_STOREY',
 'MAIN_HEATING_CONTROLS',
 'GLAZED_TYPE',
 'GLAZED_AREA',
 'MAIN_FUEL',
 'SOLAR_WATER_HEATING_FLAG',
 'MECHANICAL_VENTILATION']

In [4]:
# Preview the first five rows of the cleaned training features
# (five rows is the default for `DataFrame.head`).
X_train.head()

Unnamed: 0,TOTAL_FLOOR_AREA,MULTI_GLAZE_PROPORTION,EXTENSION_COUNT,NUMBER_HABITABLE_ROOMS,NUMBER_HEATED_ROOMS,LOW_ENERGY_LIGHTING,NUMBER_OPEN_FIREPLACES,WIND_TURBINE_COUNT,FLOOR_HEIGHT,CITY,...,ENERGY_TARIFF,MAINS_GAS_FLAG,FLOOR_LEVEL,FLAT_TOP_STOREY,MAIN_HEATING_CONTROLS,GLAZED_TYPE,GLAZED_AREA,MAIN_FUEL,SOLAR_WATER_HEATING_FLAG,MECHANICAL_VENTILATION
548618,79.92,78.0,0.0,4.0,4.0,89.0,0.0,0.0,2.62,Birmingham,...,Single,Y,,,2104,double glazing installed before 2002,Normal,mains gas - this is for backwards compatibilit...,N,natural
501535,121.0,100.0,1.0,7.0,7.0,0.0,1.0,0.0,,Birmingham,...,Single,Y,,,2107,double glazing installed during or after 2002,Normal,mains gas (not community),N,natural
813073,67.7,100.0,0.0,2.0,1.0,7.0,0.0,0.0,2.43,Manchester,...,dual,N,1st,Y,2402,double glazing installed before 2002,Normal,electricity - this is for backwards compatibil...,N,natural
519055,108.0,100.0,0.0,6.0,6.0,75.0,0.0,0.0,,Birmingham,...,Single,Y,,,2107,double glazing installed before 2002,Normal,mains gas (not community),,natural
353227,49.0,100.0,0.0,2.0,2.0,100.0,0.0,0.0,,Birmingham,...,Single,N,1st,Y,2601,"double glazing, unknown install date",Normal,electricity (not community),N,natural


## Preprocessing

It is time now to apply our usual preprocessing steps, namely:

* missing value handling,
* categorical encoding.

But this time it's up to you. Should you replace missing values or not? Which categorical encoder would you try? You are free to make choices, given the flexibility of `HistGradientBoostingClassifier`, but some choices may be better than others...

In [None]:
# Write your code here!


## Training

Finally, train and evaluate the model!

In [3]:
# Write your code here!
