In [402]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
from tqdm import tqdm
import torch.nn as nn
import gc
import torch
from numpy import array
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.

Let's load the CSV file into a pandas DataFrame.

In [403]:
# Location of the simulation trace CSV. The default is the original author's
# local copy; set the SURROGATE_DATA_CSV environment variable to point at a
# different copy (e.g. '/kaggle/input/data-csv/data.csv' on Kaggle) instead of
# editing this cell.
DATA_CSV = os.environ.get(
    'SURROGATE_DATA_CSV',
    '/Users/ffjla/PyNetSim/tutorials/surrogate/data/data.csv')
# Every column is read as a string; numeric and list-valued cells are parsed
# later, when the sliding windows are built.
data = pd.read_csv(DATA_CSV, dtype=str)

Let's check the shape of the DataFrame.


In [404]:
# Show the (rows, columns) dimensions of the loaded frame.
frame_shape = data.shape
print(frame_shape)

(76460, 16)


Let's keep only the last 10% of the data; this notebook uses it as the validation set.

In [405]:
num_samples = len(data)
# Validation is the last 10% of samples. The slice is taken by position and
# re-indexed into a *new* frame (no in-place mutation of a slice of `data`),
# so its rows are numbered from 0 again and the cell is safe to re-run.
validation_start = int(num_samples * 0.9)
validation_set = data.iloc[validation_start:].reset_index(drop=True)
# print shapes
print('Validation set shape: ', validation_set.shape)

Validation set shape:  (7646, 16)


Proportion of the data that is used for validation.

In [406]:
# Share of all loaded rows that ended up in the validation split, as a percentage.
print(f"Proportion of validation set: {validation_set.shape[0]/len(data)*100}%")

Proportion of validation set: 10.0%


In [407]:
# Preview the first rows of the validation split, followed by its (rows, columns) shape.
print(f"{validation_set.head()},{validation_set.shape}")

               alpha               beta               gamma  \
0  6.396204291486541  3.317884312892152  2.3993866139171405   
1  6.396204291486541  3.317884312892152  2.3993866139171405   
2  6.396204291486541  3.317884312892152  2.3993866139171405   
3  6.396204291486541  3.317884312892152  2.3993866139171405   
4  6.396204291486541  3.317884312892152  2.3993866139171405   

     remaining_energy alive_nodes         cluster_heads  \
0   2.845430725207556          89  [33, 49, 56, 70, 88]   
1   2.801312292226756          89     [2, 4, 8, 25, 72]   
2   2.757201859245956          89    [4, 8, 72, 78, 88]   
3   2.713493629049157          89   [4, 22, 58, 59, 72]   
4  2.6706714707487573          89  [17, 25, 30, 78, 91]   

                                       energy_levels  \
0  [0.03218247005397702, 0.02710971871302542, 0.0...   
1  [0.02885879005397704, 0.026880038713025422, 0....   
2  [0.02863811005397704, 0.026650358713025422, 0....   
3  [0.02841523005397704, 0.026420678713025

In [408]:
def _scaled_scalars(raw_values, divisor, label):
    """Convert scalar cells to floats divided by ``divisor`` and check they lie in [-1, 1]."""
    values = [float(x) / divisor for x in raw_values]
    assert all(
        -1 <= x <= 1 for x in values), f"Incorrect values of {label}: {values}"
    return values


def _scaled_lists(raw_values, divisor, label):
    """Parse list-valued cells, divide every element by ``divisor`` and check the [-1, 1] range."""
    # NOTE(review): eval() executes whatever expression the cell contains. It is
    # kept because the cell format cannot be confirmed from this notebook (the
    # file imports numpy's `array`, possibly so that eval can parse such cells).
    # Only feed trusted CSV files; switch to ast.literal_eval if every cell is a
    # plain Python literal.
    values = [[float(x) / divisor for x in eval(raw)] for raw in raw_values]
    assert all(
        -1 <= x <= 1 for sublist in values for x in sublist), f"Incorrect values of {label}: {values}"
    return values


def split_sequence(sequence, n_steps):
    """Turn a simulation trace into sliding-window samples for cluster-head forecasting.

    Each sample flattens ``n_steps`` consecutive rows into one feature vector and
    uses the ``cluster_heads`` list of the row that follows the window as target.

    Feature layout of one sample:
      * step 0: alpha, beta, gamma, remaining energy, alive nodes, cluster heads,
        energy levels, distances to cluster head, membership, then the constants
        (eelect, pkt_size, eamp, efs, eda, d0) and avg_min_max_distances, all
        taken from the first row of ``sequence``;
      * every later step: remaining energy, alive nodes, cluster heads, energy
        levels, distances to cluster head, membership.

    Values are scaled by fixed divisors (weights and remaining energy /10, alive
    nodes, membership and cluster heads /100, energy levels /5, distances /200,
    pkt_size /4000, d0 /100, avg_min_max_distances /300) and asserted to stay in
    range. A window is dropped when the alpha/beta/gamma of the target row differ
    from those of the window's first row.

    Rows are addressed by position, so ``sequence`` does not need a 0-based index.

    Args:
        sequence: pandas DataFrame holding the trace columns named above; cells
            are expected to be strings (scalars or list literals).
        n_steps: number of consecutive rows per window.

    Returns:
        Tuple ``(x, y)`` of numpy arrays: the flattened feature vectors and the
        parsed target cluster-head lists. Both are empty when no window fits.

    Raises:
        AssertionError: if a scaled value falls outside its expected range.
    """
    x_data = []
    y_data = []
    num_samples = len(sequence)
    if num_samples == 0:
        # Nothing to window, and no first row to read the constants from.
        return np.array(x_data), np.array(y_data)

    def first_float(column):
        # Positional access: works whatever index labels the frame carries.
        return float(sequence[column].iloc[0])

    # Get the eelect, pkt_size, eamp, efs, eda, d0
    eelect = first_float('eelect')
    pkt_size = first_float('pkt_size') / 4000
    eamp = first_float('eamp')
    efs = first_float('efs')
    eda = first_float('eda')
    d0 = first_float('d0') / 100

    # NOTE(review): eval() on file content, see _scaled_lists.
    avg_min_max_distances = eval(sequence['avg_min_max_distances'].iloc[0])
    avg_min_max_distances = [float(x) / 300 for x in avg_min_max_distances]
    # Loop-invariant, so it is checked once here rather than for every window.
    assert all(
        x <= 1 and x >= 0 for x in avg_min_max_distances), f"Incorrect values of avg_min_max_distances: {avg_min_max_distances}"

    # Pull each column out once as a plain array: slicing is then positional
    # and does not repeat a DataFrame lookup for every window.
    columns = {
        name: sequence[name].to_numpy()
        for name in ('alpha', 'beta', 'gamma', 'remaining_energy', 'alive_nodes',
                     'energy_levels', 'dst_to_cluster_head', 'membership',
                     'cluster_heads')
    }

    # A window starting at `start` needs the row at start + n_steps as target, so
    # there are exactly num_samples - n_steps windows (the bar now reaches 100%).
    for start in tqdm(range(num_samples - n_steps), desc="Processing sequence"):
        end_ix = start + n_steps
        window = slice(start, end_ix)

        alpha_val = _scaled_scalars(columns['alpha'][window], 10, "alpha")
        beta_val = _scaled_scalars(columns['beta'][window], 10, "beta")
        gamma_val = _scaled_scalars(columns['gamma'][window], 10, "gamma")
        remaining_energy = _scaled_scalars(
            columns['remaining_energy'][window], 10, "remaining energy")
        alive_nodes = _scaled_scalars(
            columns['alive_nodes'][window], 100, "alive nodes")
        energy_levels = _scaled_lists(
            columns['energy_levels'][window], 5, "energy levels")
        dst_to_cluster_head = _scaled_lists(
            columns['dst_to_cluster_head'][window], 200, "distance to cluster head")
        membership = _scaled_lists(
            columns['membership'][window], 100, "membership")
        chs = _scaled_lists(
            columns['cluster_heads'][window], 100, "cluster heads")

        # Target: raw (unscaled) cluster-head ids of the row after the window.
        seq_y = eval(columns['cluster_heads'][end_ix])

        next_alpha_val = float(columns['alpha'][end_ix]) / 10
        next_beta_val = float(columns['beta'][end_ix]) / 10
        next_gamma_val = float(columns['gamma'][end_ix]) / 10

        # Drop windows whose target row uses different weights than the window start.
        if (next_alpha_val != alpha_val[0]) or (next_beta_val != beta_val[0]) or (next_gamma_val != gamma_val[0]):
            continue

        # Flatten the window step by step; only step 0 carries the weights and
        # the constants.
        seq_x = []
        for step in range(n_steps):
            if step == 0:
                seq_x.extend([alpha_val[0], beta_val[0], gamma_val[0]])
            seq_x.extend([remaining_energy[step], alive_nodes[step]])
            seq_x.extend(chs[step])
            seq_x.extend(energy_levels[step])
            seq_x.extend(dst_to_cluster_head[step])
            seq_x.extend(membership[step])
            if step == 0:
                seq_x.extend([eelect, pkt_size, eamp, efs, eda, d0])
                seq_x.extend(avg_min_max_distances)

        x_data.append(seq_x)
        y_data.append(seq_y)

    return np.array(x_data), np.array(y_data)


# Window length: number of consecutive rows flattened into one sample.
n_steps = 5
# Build the (features, targets) arrays for the validation split.
x_val_split, y_val_split = split_sequence(
    sequence=validation_set, n_steps=n_steps)

Processing sequence: 100%|█████████▉| 7641/7646 [00:15<00:00, 497.81it/s]


In [409]:
# Multi-hot encode the targets: one row per sample, one column per node id.
# The width must be at least the 101 outputs of the network's final layer;
# deriving it only from the largest id in this split would make BCELoss reject
# the targets whenever the highest id happens to be absent from the split.
NUM_OUTPUTS = 101
num_classes = max(int(y_val_split.max()) + 1, NUM_OUTPUTS)
y_val = np.zeros((y_val_split.shape[0], num_classes))
# Set to 1 the columns listed in each sample's label
for i, label in enumerate(y_val_split):
    y_val[i, label] = 1
x_val = x_val_split
print(y_val[0])
# print the indices where y_val[0] is 1
print(np.where(y_val[0] == 1)[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.]
[12 22 26 71 82]


Create the dataset class.

In [410]:
class ClusterHeadDataset(Dataset):
    """In-memory dataset that pairs each feature row with its target row.

    Both arrays are cast to float32 and wrapped as tensors once, at construction.
    """

    def __init__(self, x, y):
        features = x.astype(np.float32)
        targets = y.astype(np.float32)
        self.X = torch.from_numpy(features)
        self.y = torch.from_numpy(targets)
        # Number of samples, taken from the first axis of the feature array.
        self.len = x.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

    # Support batching
    def collate_fn(self, batch):
        """Stack a list of (features, target) pairs into two batched tensors."""
        stacked_features = torch.stack([pair[0] for pair in batch])
        stacked_targets = torch.stack([pair[1] for pair in batch])
        return stacked_features, stacked_targets

Create the network architecture.

In [411]:
class ForecastCCH(nn.Module):
    """Fully connected network that emits one sigmoid score per output unit.

    Architecture: batch norm on the input, three hidden Linear -> ReLU ->
    Dropout(0.2) stages, then a Linear output layer followed by a sigmoid.

    The layer sizes used to be hard-coded; they are now constructor arguments
    whose defaults reproduce the original network exactly (same attribute names
    and state_dict keys), so existing checkpoints still load with ``ForecastCCH()``.

    Args:
        input_dim: number of input features (default 1826).
        hidden_dims: sizes of the three hidden layers (default 3000, 4000, 2000).
        output_dim: number of sigmoid outputs (default 101).

    Raises:
        ValueError: if ``hidden_dims`` does not contain exactly three sizes.
    """

    def __init__(self, input_dim=1826, hidden_dims=(3000, 4000, 2000), output_dim=101):
        super(ForecastCCH, self).__init__()
        if len(hidden_dims) != 3:
            raise ValueError(
                f"hidden_dims must contain exactly 3 sizes, got {len(hidden_dims)}")
        hidden1, hidden2, hidden3 = hidden_dims

        # Input normalisation shares its width with the first linear layer.
        self.batch_norm = nn.BatchNorm1d(input_dim)
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)

        self.fc2 = nn.Linear(hidden1, hidden2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(hidden2, hidden3)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.2)

        self.fc4 = nn.Linear(hidden3, output_dim)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, output_dim) scores in [0, 1]."""
        x = self.batch_norm(x)

        x = self.dropout1(self.relu1(self.fc1(x)))
        x = self.dropout2(self.relu2(self.fc2(x)))
        x = self.dropout3(self.relu3(self.fc3(x)))

        # Sigmoid keeps every output independent, as a multi-label target requires.
        return self.sigmoid(self.fc4(x))

In [417]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = ForecastCCH().to(device)
# If there is a model saved, load it. map_location remaps the stored tensors
# onto the selected device, so a checkpoint written on a GPU machine also
# loads on a CPU-only one instead of failing while deserialising.
if os.path.isfile('model.pt'):
    model.load_state_dict(torch.load('model.pt', map_location=device))
    print("Model loaded")
else:
    # The model keeps its freshly initialised weights in this case.
    print("No model found")

Model loaded


In [418]:
# Binary cross-entropy over the per-output sigmoid scores, and an Adam
# optimiser with a learning rate of 1e-4 over all model parameters.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)


Create the dataset objects.

In [419]:
# Wrap the validation arrays and serve them one sample at a time, in shuffled order.
valid = ClusterHeadDataset(x=x_val, y=y_val)
valid_loader = DataLoader(dataset=valid, shuffle=True, batch_size=1)

In [420]:
def test_predicted():
    model.eval()
    avg_accuracy = []
    losses = []
    threshold = 0.5
    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(valid_loader):
            # print(f"inputs: {inputs}, shape: {inputs.shape}")
            # print(f"labels: {labels}, shape: {labels.shape}")
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            preds = model(inputs.float())
            loss = criterion(preds, labels)
            losses.append(loss.item())
            correct = 0
            total = 0
            # print(f"y shape: {labels.shape}")
            # print(f"preds shape: {preds.shape}")
            # Loop over both the predictions and the labels
            for pred, label in zip(preds, labels):
                # print(f"pred: {pred}")
                # print(f"label: {label}")
                # Get the indices from labels where the value is 1
                y = np.where(label == 1)[0]
                # sort the indices
                y.sort()
                # print(f"y: {y}")
                # Get the topk indices from the predictions
                y_hat = torch.topk(pred, len(y))
                # Convert y_hat to numpy
                y_hat = y_hat.indices.cpu().numpy()
                # sort the indices
                y_hat.sort()
                # print(f"y_hat: {y_hat}")
                # Compute the accuracy
                correct += np.sum(y == y_hat)
                total += len(y)
            avg_accuracy.append(correct/total*100)
    # Mean accuracy
    print(f"Mean accuracy: {np.mean(avg_accuracy):.1f}%")
    # Min accuracy
    print(f"Min accuracy: {np.min(avg_accuracy):.1f}%")
    # Number of samples with min accuracy
    print(f"Number of samples with min accuracy: {np.sum(np.array(avg_accuracy) == np.min(avg_accuracy))}")
    # Max accuracy
    print(f"Max accuracy: {np.max(avg_accuracy):.1f}%")
    # Number of samples with max accuracy
    print(f"Number of samples with max accuracy: {np.sum(np.array(avg_accuracy) == np.max(avg_accuracy))}")

In [421]:

# Run the evaluation over the validation loader and print the accuracy summary.
test_predicted()

Mean accuracy: 30.6%
Min accuracy: 0.0%
Number of samples with min accuracy: 1673
Max accuracy: 100.0%
Number of samples with max accuracy: 418
