In [None]:
!pip install -r requirements.txt

In [None]:
import os
import subprocess
import sys
from pathlib import Path

import joblib
import pandas as pd
from xgboost import XGBRegressor

from preprocessing.preprocess_data import preprocess

In [None]:
# Folder checked before running the download script and passed to `preprocess` as input.
data_dir = "data"

# Folder passed to `preprocess` as output and read back when loading the training CSVs.
output_dir = "data_processed"

# File the fitted model is dumped to with joblib.
model_file = "model.joblib"

# Download data

Run the script `download_data.py` to fetch the original data.

In [None]:
# Fetch the raw data only when `data_dir` does not exist yet.
# The script is launched with the kernel's own interpreter (sys.executable) so it
# runs in the same environment as this notebook. Its output is captured to keep
# the notebook quiet, but a non-zero exit status raises with the script's stderr
# instead of letting later cells fail on missing data.
if not os.path.exists(data_dir):
    download = subprocess.run(
        [sys.executable, "download_data.py"],
        capture_output=True,
        text=True,
    )
    if download.returncode != 0:
        raise RuntimeError(
            f"download_data.py failed with exit code {download.returncode}:\n"
            f"{download.stderr}"
        )

# Preprocess the data

Run the preprocessing of the data and save the output in `output_dir`.

In [None]:
# Preprocessing is skipped when `output_dir` is already present on disk;
# otherwise `preprocess` is called with the raw and output folders as Paths.
if not Path(output_dir).exists():
    preprocess(Path(data_dir), Path(output_dir))

# Load data

Load preprocessed data and labels from CSVs

In [None]:
# Read the training features and the training labels from `output_dir`.
# Both CSVs are loaded the same way, with their "level_0" column as the index.
data_train, labels_train = (
    pd.read_csv(Path(output_dir) / csv_name, index_col="level_0")
    for csv_name in (
        "train/X_train_processed.csv",
        "train/labels_train_processed.csv",
    )
)

# Limit data for faster models

In [None]:
# One row cap shared by the features and the labels, so the two subsets are
# always cut at the same position. Set to None to keep every row.
n_train_rows = 300_000

# .iloc makes the intent explicit: keep the first `n_train_rows` rows by position.
data_train_sub = data_train.iloc[:n_train_rows]
labels_train_sub = labels_train.iloc[:n_train_rows]

# Train

Fit the model on the train data and labels

## Best parameters

Here are the best parameters we found:

```python
{
    "max_depth":10,
    "n_estimators":935,
    "eta":0.061178,
    "colsample_bytree":0.546984
}
```

In [None]:
# Lighter settings than the best parameters listed above (shallower trees,
# fewer estimators) so that the fit finishes faster.
params = dict(
    max_depth=6,
    n_estimators=135,
    eta=0.061178,
    colsample_bytree=0.546984,
)

# n_jobs=-1 is passed alongside the tuned parameters.
xgb = XGBRegressor(n_jobs=-1, **params)

# Fit on the reduced training set; the target is the
# "energy_consumption_per_annum" column of the labels frame.
xgb.fit(
    X=data_train_sub,
    y=labels_train_sub["energy_consumption_per_annum"].squeeze(),
)

In [None]:
# Serialize the fitted regressor to `model_file` with joblib.
joblib.dump(value=xgb, filename=model_file)