# EnsembleSwarm calorie burn regression

In [1]:
import pickle
import h5py
import glob
import numpy as np
import pandas as pd
from ensembleset.dataset import DataSet
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor

%cd ..
from ensembleswarm.swarm import Swarm

/workspaces/ensembleswarm


## 1. Load and prepare data

In [2]:
# Read the example calorie data into a DataFrame and print its column summary.
calories_csv='examples/example_data/calories.csv'
data_df=pd.read_csv(calories_csv)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB


In [3]:
# Remove the row identifier column if it is present. Assigning the result
# (instead of mutating in place) keeps the cell safe to re-run.
data_df=data_df.drop(columns='id', errors='ignore')

# 50/50 split. A fixed seed makes the split, and everything derived from it,
# reproducible across kernel restarts.
train_df, test_df=train_test_split(data_df, test_size=0.5, random_state=42)

# Renumber rows from zero in each half; the shuffled original index is discarded.
train_df=train_df.reset_index(drop=True)
test_df=test_df.reset_index(drop=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375000 entries, 0 to 374999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Sex         375000 non-null  object 
 1   Age         375000 non-null  int64  
 2   Height      375000 non-null  float64
 3   Weight      375000 non-null  float64
 4   Duration    375000 non-null  float64
 5   Heart_Rate  375000 non-null  float64
 6   Body_Temp   375000 non-null  float64
 7   Calories    375000 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 22.9+ MB


## 2. EnsembleSwarm model

### 2.1. Generate ensembleset

In [None]:
# Number of datasets to generate; passed to make_datasets() as n_datasets.
ensemble_members=3

# Build the ensembleset from the train/test halves. 'Calories' is the label
# column and 'Sex' is declared as a string feature; output goes under
# 'ensembleset_data'.
data_ensemble=DataSet(
    train_data=train_df,
    test_data=test_df,
    label='Calories',
    string_features=['Sex'],
    data_directory='ensembleset_data'
)

# NOTE(review): frac_features and n_steps are ensembleset parameters; their
# exact semantics are not visible here — confirm against the ensembleset docs.
data_ensemble.make_datasets(n_datasets=ensemble_members, frac_features=0.7, n_steps=3)

### 2.2. Train swarm

In [None]:
# Point the swarm at the HDF5 ensembleset file and run its optimization step
# with sample=10000.
swarm=Swarm(ensembleset='ensembleset_data/dataset.h5')
swarm.optimize_swarm(sample=10000)

Starting worker 0
Training datasets: ['0', '1', '2', 'labels'])
Have 3 sets of training features.
Optimizing Linear regression, ensemble 0
Optimizing Quantile regression, ensemble 0

### 2.3. Swarm predictions

In [None]:
# Run every pickled swarm model on its matching test dataset. Collects:
#   level_two_dataset - one column of raw predictions per (ensemble, model),
#                       plus the test labels under 'label'
#   swarm_rmsle       - per-model RMSLE, with negative predictions clipped to 0
level_two_dataset={}
swarm_rmsle={
    'model': [],
    'ensemble': [],
    'RMSLE': []
}

with h5py.File('ensembleset_data/dataset.h5', 'r') as hdf:

    # The 'train' group holds one entry per dataset plus 'labels', hence the -1.
    num_datasets=len(list(hdf['train'].keys())) - 1
    print(f"Testing datasets: {list(hdf['test'].keys())}")
    print(f'Will generate swarm output for {num_datasets} datasets')

    # Labels are the same for every model: read them from disk once.
    test_labels=hdf['test/labels'][:]

    # Iterate over the datasets actually present in the file, so this cell
    # does not depend on a variable set in the dataset-generation cell and
    # agrees with the count printed above.
    for i in range(num_datasets):

        # glob returns paths in arbitrary order; sort so the level-two columns
        # come out in the same order on every run.
        models=sorted(glob.glob(f'ensembleswarm_models/swarm/{i}/*.pkl'))

        # Test features for this dataset are shared by all of its models.
        test_features=hdf[f'test/{i}'][:]

        for model_path in models:
            # Model type is the file name up to its first '.'.
            model_type = model_path.split('/')[-1].split('.')[0]

            # SECURITY: pickle.load can execute arbitrary code. Only load model
            # files produced by your own swarm training run.
            with open(model_path, 'rb') as input_file:
                model = pickle.load(input_file)

            # NOTE(review): 'lightgbm' models are skipped; the reason is not
            # visible in this notebook — confirm whether this is still needed.
            if model is not None and model_type != 'lightgbm':
                print(f'Generating test output for ensemble {i}, {model_type}')

                predictions = model.predict(test_features)
                level_two_dataset[f'{i}_{model_type}']=predictions.flatten()

                # Mean squared log error is undefined for negative values, so
                # clip negative predictions to zero for scoring only.
                rmsle = np.sqrt(
                    mean_squared_log_error(
                        test_labels,
                        np.where(predictions >= 0, predictions, 0)
                    )
                )

                swarm_rmsle['ensemble'].append(i)
                swarm_rmsle['model'].append(model_type)
                swarm_rmsle['RMSLE'].append(rmsle)

    level_two_dataset['label'] = test_labels
    level_two_df = pd.DataFrame.from_dict(level_two_dataset)

swarm_rmsle_df = pd.DataFrame.from_dict(swarm_rmsle)

### 2.4. Swarm RMSLE distribution

In [None]:
import matplotlib.pyplot as plt

# Histogram of per-model RMSLE: the x-axis is the RMSLE value and the y-axis
# is the number of swarm models that fall in each bin.
plt.title('Distribution of swarm RMSLE')
plt.hist(swarm_rmsle_df['RMSLE'], color='black', bins=30)
plt.xlabel('RMSLE')
plt.ylabel('Swarm models (n)')
plt.show()

### 2.5. Level II model cross-validation

In [None]:
# Cross-validate a gradient-boosting level II model that takes the swarm
# predictions as features and the test labels as target.
# random_state pins the estimator's internal randomness (binning subsample and
# the early-stopping validation split), so scores are repeatable for a given
# level_two_df.
scores=cross_val_score(
    HistGradientBoostingRegressor(loss='gamma', random_state=42),
    level_two_df.drop('label', axis=1),
    level_two_df['label'],
    scoring='neg_mean_squared_log_error',
    n_jobs=-1,
    cv=7
)

# The scorer returns negated MSLE per fold; negate and take the square root to
# get per-fold RMSLE.
rmsle_scores=np.sqrt(-scores)

print(f'Cross-validation RMSLE: {np.mean(rmsle_scores):.4f} +/- {np.std(rmsle_scores):.4f}')