# Replace BG Dataset

## 1. Download and configure dataset
- Download `REPLACE-BG-Dataset.zip` from this [link](https://public.jaeb.org/datasets/diabetes);
- Create a folder in `fedbiomed/data/` named `replace-bg`;
- Unzip `REPLACE-BG-Dataset.zip` and move its `Data Tables` folder into `${FEDBIOMED_DIR}/data/replace-bg`;
- Run `python ${FEDBIOMED_DIR}/docs/tutorials/advanced/client-selection/preprocessing.py`

## 2. Add 202 nodes and start `--num_nodes` of them (up to 202)
- Open a terminal and run `python ${FEDBIOMED_DIR}/docs/tutorials/client-selection/add_nodes.py`
- Run `python ${FEDBIOMED_DIR}/docs/tutorials/sec-agg/client-selection/start_nodes.py --num_nodes 10` (`--num_nodes` can be any value up to 202)

# 3. FL Training

In [9]:
import datetime
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import numpy as np
import pandas as pd

from fedbiomed.common.training_plans import TorchTrainingPlan
from fedbiomed.common.data import DataManager
# Root of the Fed-BioMed checkout, read from the environment (None when the variable is unset).
FEDBIOMED_DIR = os.getenv('FEDBIOMED_DIR')
# Base seed reused by the training arguments and by the node-sampling strategy.
SEED = 42

# Folder that is scanned for the per-patient CSV files when building the test set.
DATA_DIR = "{}/data/replace-bg/raw/patients".format(FEDBIOMED_DIR)


class MyTrainingPlan(TorchTrainingPlan):
    """Training plan that fits a CNN-LSTM glucose forecaster on Replace-BG data.

    Each example is a window of 16 consecutive rows with three channels
    (glucose, bolus, carbs). The first ``input_length`` rows are the observed
    history and the last ``pred_length`` rows are the forecast horizon; the
    loss is the L1 error on the glucose channel over that horizon.
    """

    def __init__(self):
        super().__init__()
        self.input_length = 12  # number of observed steps (the past 3 hours)
        self.pred_length = 4  # number of forecast steps (the next hour)

    # Defines and returns the model
    def init_model(self, model_args):
        return self.CNN_LSTM(model_args=model_args)

    # Defines and returns the optimizer
    def init_optimizer(self, optimizer_args):
        return torch.optim.SGD(self.model().parameters(), lr=optimizer_args["lr"])

    # Declares and returns the import statements required by this plan
    def init_dependencies(self):
        deps = ["import pandas as pd", "from torch.utils.data import Dataset", "import numpy as np", "import datetime"]
        return deps

    class CNN_LSTM(nn.Module):
        """Conv1d feature extractor, followed by a 2-layer LSTM and an MLP head.

        The head outputs one value per entry of ``predict_channels`` (only
        channel 0 here). ``model_args`` is accepted but not read.
        """

        def __init__(self, model_args):
            super().__init__()
            # Channels for prediction (assuming one-dimensional time-series data)
            predict_channels = [0]
            self.predict_channels = predict_channels

            # Convolutional layers: 3 input channels, two MaxPool1d(2) stages
            # (the temporal length is divided by 4, rounded down).
            self.conv_layers = nn.Sequential(
                nn.Conv1d(in_channels=3, out_channels=32, kernel_size=7, padding=3),
                nn.ReLU(),
                nn.Conv1d(in_channels=32, out_channels=64, kernel_size=7, padding=3),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=2),  # MaxPool reduces dimensionality
                nn.Conv1d(in_channels=64, out_channels=64, kernel_size=5, padding=2),  # padding keeps the length
                nn.ReLU(),
                nn.Conv1d(in_channels=64, out_channels=128, kernel_size=5, padding=2),  # padding keeps the length
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=2),  # MaxPool reduces dimensionality
            )

            # LSTM layers operating on the 128 convolutional feature maps
            self.lstm = nn.LSTM(input_size=128, hidden_size=100, num_layers=2, batch_first=True, dropout=0.2)

            # Fully connected head
            self.fc_layers = nn.Sequential(
                nn.Linear(100, 64),
                nn.Tanh(),
                nn.Linear(64, 6),
                nn.Tanh(),
                nn.Linear(6, len(predict_channels))
            )

        def _forward(self, x):
            """
            Perform a single forward pass through the CNN, LSTM and head.

            Args:
                x: Input tensor with shape (batch_size, input_length, num_features)

            Returns:
                Tensor of shape (batch_size, len(predict_channels)) computed
                from the output of the last LSTM step
            """
            x = x.permute(0, 2, 1)  # (batch_size, num_features, input_length) for Conv1d
            x = self.conv_layers(x)
            x = x.permute(0, 2, 1)  # back to (batch_size, length, channels) for the LSTM
            x, _ = self.lstm(x)
            x = self.fc_layers(x[:, -1, :])  # Use the output of the last LSTM cell
            return x

        def forward(self, whole_example, input_length):
            """
            Perform sequential one-step-ahead prediction using teacher forcing.

            Args:
                whole_example: Input tensor with shape (batch_size, total_length, num_features)
                input_length: Length of the sequence to use as input initially

            Returns:
                Tensor with the same shape as whole_example, where the values of
                the predicted channels from input_length onward have been
                replaced with model predictions
            """
            whole_example_clone = whole_example.clone().detach()  # predictions are written into this copy
            total_length = whole_example_clone.shape[1]
            assert input_length < total_length, "input_length should be less than total_length"

            # Teacher forcing: the prediction for position t is computed from the
            # TRUE values whole_example[:, :t, :]; earlier predictions are only
            # stored in the clone and are never fed back as input.
            # NOTE(review): the original comment claimed that predictions are fed
            # back for positions past input_length, which the code does not do.
            # Confirm which behaviour is intended, in particular at test time.
            while input_length < total_length:
                # Use the (ground-truth) sequence up to the current position as input
                x = whole_example[:, :input_length, :]
                # Predict the next time step
                y_hat = self._forward(x)
                # Store the prediction at the current position
                whole_example_clone[:, input_length, self.predict_channels] = y_hat[:, self.predict_channels]
                # Move to the next position
                input_length += 1

            return whole_example_clone

    class ReplaceBGDataset(Dataset):
        """The Replace-BG dataset for Torch training.

        Reads the columns "time", "GlucoseValue", "Normal" and "CarbInput" of a
        dataframe and exposes every window of ``example_len`` consecutive rows
        that contains no missing value and spans a short enough time interval.
        """

        def __init__(self, raw_df, example_len, unimodal=False):
            """
            Args
                raw_df: dataframe
                    Values equal to -1 are treated as missing. The dataframe
                    passed by the caller is left untouched.
                example_len: int
                    Number of rows per example
                unimodal: bool
                    If True, data contains glucose only (stored, but not read
                    anywhere in this class)
            """
            # Work on a copy: replacing in place would silently modify the caller's dataframe.
            raw_df = raw_df.replace(to_replace=-1, value=np.nan)
            self.example_len = example_len
            self.unimodal = unimodal
            self.data, self.times = self._initial(raw_df)  # (len, n_features)
            self.example_indices = self._example_indices(self.times)
            # Fixed per-channel statistics, in channel order (glucose, bolus, carbs).
            # NOTE(review): presumably computed over the training population - confirm.
            self.external_mean = [160.87544032,   0.21893523,   1.49783614]
            self.external_std = [63.60143682,  1.14457581,  8.92042825]
            self._standardise(self.external_mean, self.external_std)
            print("Dataset loaded, total examples: {}.".format(len(self)))

        @staticmethod
        def str2dt(s):
            """Parse a "%Y-%m-%d %H:%M:%S" timestamp string into a datetime."""
            return datetime.datetime.strptime(s, "%Y-%m-%d %H:%M:%S")

        def _initial(self, raw_df):
            """Return the (len, 3) float32 array (glucose, bolus, carbs) and the row timestamps.

            Missing bolus and carbs values are replaced with 0; missing glucose
            values stay NaN so that the affected rows are excluded from examples.
            """
            times = [self.str2dt(s) for s in raw_df["time"]]
            glucose = raw_df["GlucoseValue"].to_numpy(dtype=np.float32)
            bolus = raw_df["Normal"].to_numpy(dtype=np.float32)
            carbs = raw_df["CarbInput"].to_numpy(dtype=np.float32)

            bolus[np.isnan(bolus)] = 0.0
            carbs[np.isnan(carbs)] = 0.0

            return (
                np.array(
                    [
                        glucose,
                        bolus,
                        carbs,
                    ],
                    dtype=np.float32,
                ).T,
                times,
            )

        def _example_indices(self, times):
            """Extract every possible example from the dataset, such that no entry in the example is missing.

            Returns:
                [(start_row, end_row)]
                    Starting and ending (inclusive) indices for each possible example.
            """
            examples = []
            total_len = self.data.shape[0]
            # One vectorised NaN scan instead of re-testing every row inside the loops.
            row_has_nan = np.isnan(self.data).any(axis=1)
            # Longest time span accepted for one example (example_len * 15 minutes).
            max_span = datetime.timedelta(minutes=self.example_len * 15)

            def look_ahead(start):
                """Collect the windows of the NaN-free run starting at `start`; also return where the run stops."""
                end = start
                windows = []
                while end < total_len:
                    if row_has_nan[end]:
                        break
                    if end - start + 1 >= self.example_len:
                        first = end - self.example_len + 1
                        # Reject windows whose timestamps are spread over too long an interval.
                        if (times[end] - times[first]) <= max_span:
                            windows.append((first, end))
                    end += 1
                return windows, end

            i = 0
            while i < total_len:
                if not row_has_nan[i]:
                    windows, run_end = look_ahead(i)
                    examples += windows
                    # run_end is either a row with a missing value or total_len: resume after it.
                    i = run_end + 1
                else:
                    i += 1
            return examples

        def _standardise(self, external_mean=None, external_std=None):
            """Standardise every channel of `self.data` in place.

            Args:
                external_mean: per-channel means; computed from the data (ignoring NaN) when None
                external_std: per-channel standard deviations; computed from the data (ignoring NaN) when None
            """
            n_channels = self.data.shape[1]
            # Each statistic falls back to the data independently, so passing only
            # one of the two no longer fails on a None value.
            if external_mean is None:
                mean = [np.nanmean(self.data[:, i]) for i in range(n_channels)]
            else:
                mean = external_mean
            if external_std is None:
                std = [np.nanstd(self.data[:, i]) for i in range(n_channels)]
            else:
                std = external_std
            self.mean = mean
            self.std = std
            print("Standardising with mean: {} and std: {}.".format(mean, std))
            # Column-wise in-place assignment keeps the array dtype (float32).
            for i in range(n_channels):
                self.data[:, i] = (self.data[:, i] - mean[i]) / std[i]

        def __len__(self):
            return len(self.example_indices)

        def __getitem__(self, idx):
            """
            Args:
                idx: int
            Returns:
                Tuple of a (example_len, channels) tensor and a dummy target 0
            """
            start_row, end_row = self.example_indices[idx]
            res = torch.from_numpy(self.data[start_row : end_row + 1, :])
            return res, 0  # Dummy variable to be compatible with TorchTrainingPlan

    def training_data(self):
        """Build the DataManager over the node's CSV file, using 16-row examples and shuffling."""
        dataset1 = self.ReplaceBGDataset(raw_df=pd.read_csv(self.dataset_path), example_len=16)
        train_kwargs = {'shuffle': True}
        return DataManager(dataset=dataset1, **train_kwargs)

    def training_step(self, data, target):
        """Return the L1 loss on the glucose channel over the last `pred_length` steps.

        The `target` argument (the dataset's dummy value) is ignored: the real
        target is taken from `data` itself.
        """
        target = data[:, -self.pred_length :, 0]
        # Call the module rather than .forward() so that registered hooks run.
        output = self.model()(data, self.input_length)[:, -self.pred_length :, 0]
        loss = torch.nn.functional.l1_loss(output, target)
        return loss


2024-05-26 20:10:07,714 fedbiomed DEBUG - Node: NODE_3e1e57c8-1364-41cc-8854-15c82e9d5b93 polling for the tasks

2024-05-26 20:10:15,949 fedbiomed DEBUG - Node: NODE_b14a8bc6-ff16-4c92-8772-4a755ad2855f polling for the tasks

2024-05-26 20:10:18,324 fedbiomed DEBUG - Node: NODE_61dbf902-17f9-4bc9-b9f2-d46bfdca0ec5 polling for the tasks

In [3]:
# The model takes no hyper-parameters (CNN_LSTM does not read model_args).
model_args = dict()

# Arguments of the local training performed on each node.
training_args = dict(
    loader_args=dict(batch_size=64),
    optimizer_args=dict(lr=1e-3),  # read by MyTrainingPlan.init_optimizer
    num_updates=10,
    dry_run=False,
    random_seed=SEED,
    log_interval=5,
)

In [4]:
from fedbiomed.researcher.strategies import DefaultStrategy
import random
class UniformSelectStrategy(DefaultStrategy):
    """Node selection strategy that draws a uniform random subset of the nodes at each round."""

    def __init__(self, sampling_fraction = 1.0):
        """
        Initialize the class with a sampling fraction.

        Args:
            sampling_fraction (float): Fraction of nodes to sample each round. Must be between 0 and 1.

        Raises:
            ValueError: If `sampling_fraction` is outside [0, 1].
        """
        super().__init__()
        # Fail at construction time instead of in the middle of a round.
        if not 0.0 <= sampling_fraction <= 1.0:
            raise ValueError(
                "sampling_fraction must be between 0 and 1, got {}".format(sampling_fraction)
            )
        self.sampling_fraction = sampling_fraction

    def sample_nodes(self, from_nodes, round_i):
        """
        Samples and selects nodes on which to train the local model. This strategy considers a subsample of existing nodes.

        Args:
            from_nodes: List of node ids which may be sampled.
            round_i: Number of the round.

        Returns:
            List of node ids considered for training during this round `round_i`.
        """
        # A dedicated generator seeded per round gives the same draw as seeding the
        # module-level generator, without altering the process-wide random state.
        rng = random.Random(SEED + round_i)
        # The count is truncated, so small node lists can yield an empty selection.
        num_nodes_to_sample = int(len(from_nodes) * self.sampling_fraction)
        sampled_nodes = rng.sample(from_nodes, num_nodes_to_sample)

        self._sampling_node_history[round_i] = sampled_nodes

        return sampled_nodes

In [None]:
from fedbiomed.researcher.federated_workflows import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage


# Only datasets registered with this tag take part in the experiment.
tags = ['replace-bg']
# Number of federated rounds; reused later when iterating over the results.
rounds = 5

# Gather the experiment configuration first, then build the experiment from it.
experiment_kwargs = {
    'tags': tags,
    'model_args': model_args,
    'training_plan_class': MyTrainingPlan,
    'training_args': training_args,
    'round_limit': rounds,
    'aggregator': FedAverage(),
    # Half of the candidate nodes are sampled at each round.
    'node_selection_strategy': UniformSelectStrategy(0.5),
}
exp = Experiment(**experiment_kwargs)
# Keep the results of every round, not only the last one.
exp.set_retain_full_history(True)


In [None]:
# Launch the federated training configured above.
# NOTE(review): presumably runs until `round_limit` rounds are done - confirm in the Fed-BioMed Experiment docs.
exp.run()

In [None]:
# Fetch the replies once instead of once per round.
training_replies = exp.training_replies()
print("\nList the training rounds : ", training_replies.keys())

# Every round is listed (the previous header wrongly announced only the last one).
print("\nList the nodes for each training round and their timings : ")
for rnd in range(rounds):
    for reply in training_replies[rnd].values():
        timing = reply['timing']
        # Adjacent f-strings are used instead of backslash continuations inside one
        # literal, which leaked the source indentation into the printed text.
        print(
            f"\t- {reply['node_id']} :"
            f"\n\t\trtime_training={timing['rtime_training']:.2f} seconds"
            f"\n\t\tptime_training={timing['ptime_training']:.2f} seconds"
            f"\n\t\trtime_total={timing['rtime_total']:.2f} seconds"
        )
    print('\n')

# 4. Test the global model of each round on the testing nodes: 200, 201, 203

In [None]:
# Collect the per-patient CSV files. The names are sorted as strings
# (lexicographic order) before being converted to integer patient ids.
csv_names = sorted(name for name in os.listdir(DATA_DIR) if name.endswith('.csv'))
patients = [int(name.replace(".csv", "")) for name in csv_names]

# Every patient from position 200 onward is used for testing.
patients_testing = patients[200:]
# One dataset per testing patient, concatenated into a single test set.
test_dataset = torch.utils.data.ConcatDataset(
    [
        MyTrainingPlan.ReplaceBGDataset(
            raw_df=pd.read_csv(os.path.join(DATA_DIR, "{}.csv".format(patient_id))),
            example_len=16,
        )
        for patient_id in patients_testing
    ]
)

In [16]:
import torch.nn.functional as F
from torch.utils.data import DataLoader

def testing_rmse(model, data_loader):
    model.eval()
    total_samples = 0
    input_length = 12
    pred_length = 4

    device = "gpu"
    model.to(device)

    # Initialize lists to collect predictions and targets
    predictions = []
    targets = []

    with torch.no_grad():
        for idx, (data, target) in enumerate(data_loader):
            target = data[:, -pred_length:, 0]
            target = target * 63.60143682 + 160.87544032
            data, target = data.to(device), target.to(device)
            output = model(data, input_length)[:, -pred_length:, 0]
            output = output * 63.60143682 + 160.87544032

            # Collect predictions and targets
            predictions.extend(output.cpu().tolist())
            targets.extend(target.cpu().tolist())

            # Only uses 10% of the dataset for faster results
            if idx >= len(data_loader) / 10:
                break
    
    # Convert lists to tensors
    predictions = torch.tensor(predictions)
    targets = torch.tensor(targets)

    # Calculate the Root Mean Squared Error
    final_rmse = torch.sqrt(torch.mean((predictions - targets) ** 2))

    return final_rmse.item()



# Evaluation batch size; adjust as needed.
batch_size = 32
# The test set is read in order (no shuffling).
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)


In [None]:
# Evaluate the aggregated (global) parameters obtained after each round.
for round_idx in range(rounds):
    fed_model = exp.training_plan().model()
    round_params = exp.aggregated_params()[round_idx]['params']
    fed_model.load_state_dict(round_params)
    rmse = testing_rmse(fed_model, test_loader)
    print(f"Round{round_idx}, RMSE{rmse}")

143.43421936035156
143.565185546875
143.51353454589844
143.60012817382812
143.7473907470703
143.89306640625
143.95703125
143.8656005859375
143.8738555908203
144.01394653320312
143.94305419921875
143.94717407226562
144.00003051757812
