# The Ashrae project

> Building models for the Ashrae prediction challenge.

TODO: update with recent changes to `loading` etc

In [1]:
#hide
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to the project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
#hide
from ashrae import inspection, preprocessing, modelling
import plotly.express as px
from fastcore.foundation import L, Path
from fastai.tabular.all import *

In [3]:
#hide
# Make pandas' `.plot` accessors draw with plotly.
pd.set_option("plotting.backend", "plotly")

## Configuring

Defining whether to process the test set (warning: this alone takes 12+ minutes) and whether to submit the results to Kaggle (you will need your credentials set up).

In [None]:
# When True, the test set is loaded, processed and predicted on as well.
do_test = False
# When True, the submission csv is written and (together with `do_test`)
# uploaded to Kaggle. The submission frame `y_out` is only built when
# `do_test` is True.
do_submit = True

Defining where the csv files are located

In [4]:
# Relative path of the directory that holds the competition csv files.
data_path = Path("../data")

In [6]:
# !kaggle competitions download -c ashrae-energy-prediction -p {data_path}

Downloading ashrae-energy-prediction.zip to ../data
 97%|██████████████████████████████████████▉ | 369M/379M [00:11<00:00, 20.5MB/s]
100%|████████████████████████████████████████| 379M/379M [00:11<00:00, 33.4MB/s]


In [7]:
# !unzip {data_path}/ashrae-energy-prediction.zip -d {data_path}/

Archive:  ../data/ashrae-energy-prediction.zip
  inflating: ../data/building_metadata.csv  
  inflating: ../data/sample_submission.csv  
  inflating: ../data/test.csv        
  inflating: ../data/train.csv       
  inflating: ../data/weather_test.csv  
  inflating: ../data/weather_train.csv  


In [8]:
# !kaggle competitions leaderboard -c ashrae-energy-prediction -p {data_path} --download

Downloading ashrae-energy-prediction.zip to ../data
100%|███████████████████████████████████████| 70.8k/70.8k [00:00<00:00, 331kB/s]
100%|███████████████████████████████████████| 70.8k/70.8k [00:00<00:00, 331kB/s]


In [9]:
# !unzip {data_path}/ashrae-energy-prediction.zip -d {data_path}/

Archive:  ../data/ashrae-energy-prediction.zip
  inflating: ../data/ashrae-energy-prediction-publicleaderboard.csv  


## Loading

In [None]:
# Look up the competition csv files via `inspection.get_csvs`; later cells
# index the result by keys such as 'train', 'test' and 'building'.
csvs = inspection.get_csvs(data_path)
csvs

In [None]:
%%time
train = inspection.get_core_Xy(csvs['train'])
display(train.head(), train.info())

In [None]:
%%time
if do_test:
    test = inspection.get_core_Xy(csvs['test'])
    display(test.head(), test.info())

In [None]:
%%time
building = inspection.get_building_X(csvs['building'])
display(building.head(), building.info())

In [None]:
%%time
weather_train = inspection.get_weather_X(csvs['weather_train'])
display(weather_train.head(), weather_train.info())

In [None]:
%%time
if do_test:
    weather_test = inspection.get_weather_X(csvs['weather_test'])
    display(weather_test.head(), weather_test.info())

## Building features

In [None]:
# Keyword arguments forwarded to every `process(...)` call below; the
# building and training-weather tables are handed over alongside the flags.
process_config = {
    'add_time_features': True,
    'add_dep_var_stats': True,
    'df_building': building,
    'df_weather': weather_train,
}
process = preprocessing.Processor()

In [None]:
%%time
df, var_names = process(train.copy(), 
                        **process_config)

In [None]:
%%time
if do_test:
    df_test, _ = process(test.copy(), 
                         **process_config)

## Sampling from `df`

In [None]:
%%time
n = len(df)

if True: # per building_id and meter sampling
    n_sample_per_bid = 500
    replace = True

    df = (df.groupby(['building_id', 'meter'])
         .sample(n=n_sample_per_bid, replace=replace))

if False: # general sampling
    frac_samples = .05
    replace = False

    df = (df.sample(frac=frac_samples, replace=replace))

print(f'using {len(df)} samples = {len(df)/n*100:.2f} %')

## Preparing the data for modelling

In [None]:
%%time
split_kind = 'random'
#split_kind = 'time'
splits = modelling.split_dataset(df, split_kind=split_kind, train_frac=.8)
#splits=None

In [None]:
%%time
# Bundle `df`, the variable names returned by `process` and the splits into
# the tabular object from which the dataloaders are built.
to = preprocessing.get_tabular_object(df, var_names, splits=splits)

In [None]:
%%time
train_bs = 256*8
val_bs = 256*8

dls = to.dataloaders(bs=train_bs, val_bs=val_bs)

In [None]:
%%time
test_bs = 1024*4

if do_test:
    test_dl = dls.test_dl(df_test, bs=test_bs) 

## Training a neural net using `tabular_learner`

In [None]:
# Output range handed to the learner: from 0 up to the largest target value
# seen in either the training or the validation part.
train_max = to.train.ys.values.max()
valid_max = to.valid.ys.values.max()
y_range = [0, np.max([train_max, valid_max])]
y_range

In [None]:
# Sizes of the three hidden layers.
layers = [50, 25, 12]

# Dropout settings: `embed_p` for the embeddings, `ps` with one entry per hidden layer.
config = tabular_config(ps=[.1, .1, .1], embed_p=.1)

# Single-output regression learner trained with the project's own loss.
learn = tabular_learner(
    dls,
    layers=layers,
    config=config,
    n_out=1,
    y_range=y_range,
    loss_func=modelling.evaluate_torch,
)

In [None]:
# Run fastai's learning-rate finder to help pick `lr_max` for training.
learn.lr_find()

In [None]:
# Train for 5 epochs with the one-cycle schedule and a peak learning rate of 1e-2.
learn.fit_one_cycle(5, lr_max=1e-2)

In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_loss()

## Inspecting the predictions

### Basic score

In [None]:
%%time
# Without a `dl` argument, fastai's `get_preds` runs on the validation set
# and returns (predictions, targets).
y_valid_pred, y_valid_true = learn.get_preds()

In [None]:
%%time
if do_test:
    y_test_pred, _ = learn.get_preds(dl=test_dl)
    y_test_pred = cnr(y_test_pred)

In [None]:
# Score the validation predictions with the same function that served as
# the training loss, and keep the result as a plain Python number.
valid_loss = modelling.evaluate_torch(y_valid_true, y_valid_pred)
nb_score = valid_loss.item()
print(f'fastai loss {nb_score:.4f}')

In [None]:
# Pass both validation tensors through `modelling.cnr` before they are
# handed to the plotting / inspection helpers.
y_valid_pred = modelling.cnr(y_valid_pred)
y_valid_true = modelling.cnr(y_valid_true)

### Histogram of  `dep_var`

In [None]:
# Histogram of validation truth vs. validation predictions. Each series is
# passed through `modelling.pick_random` separately before plotting.
preprocessing.hist_plot_preds(modelling.pick_random(y_valid_true), 
                              modelling.pick_random(y_valid_pred), 
                              label0='truth', label1='prediction')

In [None]:
# The test set has no known targets here, so the test predictions are
# compared against the validation truth instead.
if do_test:
    preprocessing.hist_plot_preds(modelling.pick_random(y_valid_true), 
                                  modelling.pick_random(y_test_pred), 
                                  label0='truth (validation)', 
                                  label1='prediction (test set)').show()

### Confidently wrong predictions by `building_id`

In [None]:
%%time
bwt = preprocessing.BoldlyWrongTimeseries(to.valid.xs, y_valid_true, y_valid_pred,
                                          t=df.iloc[splits[1]].loc[:,['timestampElapsed']].copy())

In [None]:
# Run the "boldly wrong" inspection on the `BoldlyWrongTimeseries` object.
bwt.run_boldly()

## Submission

In [None]:
%%time
if do_test:
    y_test_pred_original = torch.exp(tensor(y_test_pred)) - 1

    y_out = pd.DataFrame(cnr(y_test_pred_original),
                         columns=['meter_reading'],
                         index=df_test.index)
    display(y_out.head())

    assert len(y_out) == 41697600

In [None]:
%%time
if do_submit:
    y_out.to_csv(data_path/'my_submission.csv')

In [None]:
# Submission message describing the model trained in this notebook (the
# fastai `tabular_learner`), so the Kaggle entry is labelled correctly.
# Messages used for experiments with other models:
# message = ['random forest', '500 obs/bid', 'all features', f'nb score {nb_score:.4f}']
# message = ['lightgbm', '500 obs/bid', '100 rounds', '42 leaves', 'lr .5', f'nb score {nb_score:.4f}']
message = ['tabular_learner', '500 obs/bid', 'all features', f'layers {layers}, embed_p .1, ps [.1,.1,.1]', f'nb score {nb_score:.4f}']
message = ' + '.join(message)
message

In [None]:
# Upload the written csv through the Kaggle CLI, only when test predictions
# were produced and submitting is enabled. `data_path` and `message` are
# interpolated into the shell command.
if do_test and do_submit:
    print('Submitting...')
    !kaggle competitions submit -c ashrae-energy-prediction -f '{data_path}/my_submission.csv' -m '{message}'