# EnsembleSwarm calorie burn regression

In [None]:
import pickle
import logging
import h5py
import glob
from pathlib import Path
from logging.handlers import RotatingFileHandler

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ensembleset.dataset import DataSet
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor

%cd ..
import examples.functions.helper_functions as helper_funcs
from ensembleswarm.swarm import Swarm


# Label for this example (used in the log file name) and where logs are kept.
example_name='calories'
log_dir='examples/logs'

# Create the log directory if needed, then hand it to the project helper
# (presumably removes logs left over from earlier runs of this example -
# confirm against helper_functions.delete_old_logs).
Path(log_dir).mkdir(parents=True, exist_ok=True)
helper_funcs.delete_old_logs(log_dir, example_name)

# Send messages issued through the warnings module to the logging system.
logging.captureWarnings(True)

logger = logging.getLogger()

# Size-limited log file: rolls over at 1,000,000 bytes, keeps 10 backups.
rotating_log_handler = RotatingFileHandler(
    f'{log_dir}/{example_name}.log',
    maxBytes=1000000,
    backupCount=10
)

# Configure the root logger to write DEBUG and above to the rotating file.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)s - %(name)s - %(message)s',
    handlers=[rotating_log_handler]
)

## 1. Load and prepare data

In [None]:
# Read the example calorie data into a dataframe and print a summary of
# its columns.
calories_csv='examples/example_data/calories.csv'
data_df=pd.read_csv(calories_csv)
data_df.info()

In [None]:
# Remove the 'id' column if the file has one (errors='ignore' makes this a
# no-op otherwise).
data_df.drop(columns='id', inplace=True, errors='ignore')

# Random 50/50 split into training and testing halves.
train_df, test_df=train_test_split(data_df, test_size=0.5)

# Renumber each half from zero, discarding the row labels inherited from
# the full dataframe.
for split_df in (train_df, test_df):
    split_df.reset_index(inplace=True, drop=True)

train_df.info()

## 2. EnsembleSwarm model

### 2.1. Generate ensembleset

In [None]:
# Number of datasets requested from the ensembleset generator.
ensemble_members=100

# Describe the data to ensembleset: 'Calories' is the label column and
# 'Sex' is declared as a string feature. Output is directed to
# examples/ensemblesets/calories.h5.
data_ensemble=DataSet(
    train_data=train_df,
    test_data=test_df,
    label='Calories',
    string_features=['Sex'],
    data_directory='examples/ensemblesets',
    ensembleset_name='calories.h5'
)

# NOTE(review): frac_features and n_steps are passed through unchanged;
# their exact meaning is defined by ensembleset - see its documentation.
data_ensemble.make_datasets(n_datasets=ensemble_members, frac_features=0.7, n_steps=3)

### 2.2. Train swarm

In [None]:
%%time

# Point the swarm at the HDF5 ensembleset written to
# examples/ensemblesets/calories.h5 and at the directory that holds the
# swarm's files.
swarm = Swarm(
    swarm_directory = 'examples/swarms/calories',
    ensembleset = 'examples/ensemblesets/calories.h5'
)

# NOTE(review): 'sample' is presumably the number of rows used during
# optimization - confirm against Swarm.optimize_swarm.
swarm.optimize_swarm(sample = 10000)

### 2.3. Swarm predictions

In [None]:
# Build the level II dataset: one column of test-set predictions per fitted
# swarm model, plus the true test labels. Each model's individual RMSLE on
# the test labels is recorded alongside.
level_two_dataset={}
swarm_rmsle={
    'model': [],
    'ensemble': [],
    'RMSLE': []
}

with h5py.File('examples/ensemblesets/calories.h5', 'r') as hdf:

    # One entry under 'train' is not a feature dataset (presumably the
    # labels, mirroring 'test/labels'), hence the -1.
    num_datasets=len(hdf['train'].keys()) - 1
    print(f"Testing datasets: {list(hdf['test'].keys())}")
    print(f'Will generate swarm output for {num_datasets} datasets')

    # The labels are the same for every model, so read them once.
    test_labels = hdf['test/labels'][:]

    # Visit every ensemble dataset announced above, not a fixed subset.
    for i in range(num_datasets):

        # glob returns paths in arbitrary order; sorting keeps the level II
        # column order stable between runs.
        model_paths=sorted(glob.glob(f'examples/swarms/calories/swarm/{i}/*.pkl'))

        # Read lazily so an ensemble with no usable model never touches
        # its test features, then reuse them for every model in the ensemble.
        test_features = None

        for model_path in model_paths:

            # File name up to the first '.', e.g. '<model_type>.pkl'.
            model_type = Path(model_path).name.split('.')[0]

            # NOTE: pickle.load can execute arbitrary code. Only load swarm
            # files produced by this notebook / a trusted source.
            with open(model_path, 'rb') as input_file:
                model = pickle.load(input_file)

            # Skip pickles that hold None or a dict rather than an object
            # with a predict method.
            if model is None or isinstance(model, dict):
                continue

            print(f'Generating test output for ensemble {i}, {model_type}')

            if test_features is None:
                test_features = hdf[f'test/{i}'][:]

            predictions = model.predict(test_features)
            level_two_dataset[f'{i}_{model_type}']=predictions.flatten()

            # mean_squared_log_error rejects negative values, so negative
            # predictions are replaced by zero for scoring only; the stored
            # level II feature keeps the raw prediction.
            rmsle = np.sqrt(
                mean_squared_log_error(
                    test_labels,
                    np.where(predictions >= 0, predictions, 0)
                )
            )

            swarm_rmsle['ensemble'].append(i)
            swarm_rmsle['model'].append(model_type)
            swarm_rmsle['RMSLE'].append(rmsle)

    level_two_dataset['label'] = test_labels
    level_two_df = pd.DataFrame.from_dict(level_two_dataset)

swarm_rmsle_df = pd.DataFrame.from_dict(swarm_rmsle)

### 2.4. Swarm RMSLE distribution

In [None]:
# Histogram of the per-model RMSLE values collected for the swarm, drawn
# through the explicit Axes interface.
fig, ax = plt.subplots()
ax.set_title('Distribution of swarm RMSLE')
ax.hist(swarm_rmsle_df['RMSLE'], color='black', bins=30)
ax.set_xlabel('RMSLE')
ax.set_ylabel('Swarm models (n)')
plt.show()

### 2.5. Individual model scores

### 2.6. Level II model cross-validation

In [None]:
# Level II model: gradient boosting with gamma loss, fitted on the swarm's
# predictions (every column except 'label') to predict the true label.
level_two_model=HistGradientBoostingRegressor(loss='gamma')
level_two_features=level_two_df.drop('label', axis=1)
level_two_labels=level_two_df['label']

# 7-fold cross-validation on all available cores. The scorer returns the
# negated mean squared log error, one value per fold.
scores=cross_val_score(
    level_two_model,
    level_two_features,
    level_two_labels,
    cv=7,
    scoring='neg_mean_squared_log_error',
    n_jobs=-1
)

# Flip the sign and take the square root to get RMSLE per fold.
fold_rmsle=np.sqrt(-scores)

print(f'Cross-validation RMSLE: {np.mean(fold_rmsle):.4f} +/- {np.std(fold_rmsle):.4f}')