## Federated classifier

In [7]:
import time
import pickle
import numpy as np
from tqdm import tqdm
from tensorboardX import SummaryWriter
import torch
import torch.nn as nn
import torch.optim as optim
import copy
import torch.nn.functional as F
import pandas as pd
from torch.autograd import Variable
from sklearn.preprocessing import StandardScaler
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from torch.utils.data import TensorDataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

file_path = '/Users/gautamjajoo/Desktop/FAL/dataset/Edge-IIoTset/DNN-EdgeIIoT-dataset.csv'
from dataset.data import preprocess_dataset, split_dataset


In [8]:
# This function takes a list of model weights (each model weight being a dictionary), averages them and returns the average weights
def average_weights(w):
    """
    Returns the average of the weights.
    """
    w_avg = copy.deepcopy(w[0])
    for key in w_avg.keys():
        for i in range(1, len(w)):
            w_avg[key] += w[i][key]
        w_avg[key] = torch.div(w_avg[key], len(w))
    return w_avg



#This function splits a dataset into a number of independent and identically distributed (IID) subsets. It is done in such a way that each user or client receives a unique set of samples.

def split_iid(dataset, num_users):
    num_items = int(len(dataset)/num_users) # the number of allocated samples for each client
    dict_users, all_idxs = {}, [i for i in range (len(dataset))]
    for i in range (num_users):
        dict_users[i] = set(np.random.choice(all_idxs, num_items, replace=False))
        all_idxs = list (set(all_idxs) - dict_users[i])
    return dict_users 


# This function reads the training and test data from a csv file, 
#performs some preprocessing operations and returns a TensorDataset for both training and test data. 
#It also splits the training data into multiple subsets using the unlabeled_iid() function and returns these user groups
def get_dataset(options):
    
    
    #print("x_shape", x.shape)
    df = preprocess_dataset(file_path)
    num_classes = df['Attack_type'].nunique()
    input_features = df.drop(['Attack_type'], axis=1).shape[1]

    print("Number of classes:", num_classes)
    print("Number of input features:", input_features)
    # X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(df, seed=args.seed, size=args.size)
    X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(df, seed=1, size=0.5)

    # Print the shapes of the resulting datasets
    print("Training set shape:", X_train.shape)
    print("Validation set shape:", X_val.shape)
    print("Test set shape:", X_test.shape)

    # file_labeled_train = \
    #     pd.read_csv('/Users/gautamjajoo/Desktop/FAL/dataset/Edge-IIoTset/DNN-EdgeIIoT-dataset.csv', skiprows=0, sep=',')
    # x_train=file_labeled_train.values[:, 0:30]
    # y_train=file_labeled_train.values[:, 30]

    #y_train=pd.get_dummies(y_train)
    #y_train = y_train.astype(np.float32)
    #print(y_train.dtypes)
    #y_train = y_train.to_numpy()

    # print("y_train_shape", y_train.shape)
    # print("y_train", y_train)

    #y_class=len(set(y_train))
    #print("y_class", y_class)

    
    # file_labeled_test = pd.read_csv('IIoT_edge_base_classifier_pred_test_new.csv', skiprows=0, sep=',')
    # x_test=file_labeled_test.values[:, 0:30]
    # y_test=file_labeled_test.values[:, 30]

    #y_test=pd.get_dummies(y_test)
    #print(y_test.dtypes)
    #y_test = y_test.astype(np.float32)
    #y_test = y_test.to_numpy()

    # print("y_test_shape", y_test.shape)
    # print("y_test", y_test)

    #y_class_test=len(set(y_test))
    #print("y_class_test", y_class_test)
    
    #raise
    
    #train_data
    #x_train=pd.DataFrame(x_test)
    #y_train=pd.DataFrame(y)

    #train_data=pd.concat([x_train, y_train],axis=1)
    #print("train_data", train_data.shape)
    
    #train_data.to_csv('IIoT_edge_base_classifier_pred_dummies.csv',index=False)

    #test_data
    #x_test=pd.DataFrame(x_test)
    #y_test=pd.DataFrame(y_test)

    #test_data=pd.concat([x_val, y_val], axis=1)
    #print("test_data", test_data.shape)
    
    #train_data.to_csv('IIoT_edge_base_classifier_pred_test_dummies.csv',index=False)

    
    #x_train_l,x_test,y_train_l,y_test=train_test_split(x_l, y ,random_state=0, test_size=0.5)
    
    #print(x_train_l.dtype) # This should not be 'object'
    
    #train_data = train_data.astype(float)
    #test_data = test_data.astype(float)
    #y_train_l = y_train_l.astype(float)
    #y_test=y_test.astype(float)

    #sc = MinMaxScaler()
    #x_train_l = sc.fit_transform(x_train_l)
    #x_test_l = sc.transform(x_test_l)
    
    #scaler=StandardScaler()
    #x_train_l=scaler.fit_transform(x_train_l)
    #x_test=scaler.fit_transform(x_test)
    
    #scaler=StandardScaler()
    #x_train=scaler.fit_transform(x_train)
    #x_test=scaler.fit_transform(x_test)

    X_train_tensor = torch.Tensor(X_train.values)
    y_train_tensor = torch.LongTensor(y_train.values)

    train_dataset = TensorDataset(torch.from_numpy(X_train_tensor),torch.from_numpy(y_train_tensor))

    X_test_tensor = torch.Tensor(X_test.values)
    y_test_tensor = torch.LongTensor(y_test.values)

    test_dataset = TensorDataset(torch.from_numpy(X_test_tensor),torch.from_numpy(y_test_tensor))

    
    # train_dataset = TensorDataset(torch.from_numpy(x_train),torch.from_numpy(y_train))
    # test_dataset = TensorDataset(torch.from_numpy(x_test),torch.from_numpy(y_test))

    
    
    #train_dataset = TensorDataset(torch.from_numpy(train_data))
    #test_dataset = TensorDataset(torch.from_numpy(test_data))

    
    #sc = MinMaxScaler()
    #unlabeled_dataset = sc.fit_transform(x)

    
    
    #Feature scaling
    #sc = StandardScaler()
    #unlabeled_dataset = sc.fit_transform(x)

    #converting to torch tensor
    #train_dataset = torch.tensor(train_dataset, dtype=torch.float32)
    #test_dataset = torch.tensor(test_dataset, dtype=torch.float32)

    #print("train_dataset")

    
    user_groups = split_iid(train_dataset, options.num_users)
    print("user_group", user_groups)
    print("Done...")
    return train_dataset, test_dataset, user_groups




#This class is a custom dataset that receives a base dataset and a list of indices. 
#It enables getting only a subset of the data in the base dataset, as specified by the indices list. 
#This is helpful in a federated learning scenario, where each client might only have access to a subset of the total data.
class DatasetSplit(Dataset):
    def __init__(self, dataset, idxs):
        self.dataset = dataset
        self.idxs = [int(i) for i in idxs]
    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, item):
        
        data, target = self.dataset[self.idxs[item]]
        return torch.tensor(data), torch.tensor(target)

        #data = self.dataset[self.idxs[item]]
        #return torch.tensor(data), torch.tensor(data)

In [9]:
#set program arguments
options = lambda: None
options.rounds= 10
options.num_users= 15
options.frac= 1
options.local_ep= 10
options.local_bs= 10
options.lr= 0.01
options.momentum= 0.5
options.model= "mlp"
options.kernel_num= 9
options.kernel_sizes= "3,4,5"
options.num_channels= 1
options.norm= "batch_norm"
options.num_filters= 32
options.max_pool= "True"
options.dataset= "mnist"
options.num_classes= 15
options.gpu= None
options.optimizer= "sgd"
options.iid= 1
options.unequal= 0
options.stopping_rounds= 10
options.verbose= 1
options.seed= 1
options.batch_size = 100
options.client_epochs = 50
options.server_epochs=20



In [10]:
input_size = n_features=75
num_classes = 15
# hidden_size = [100, 100, 15]
hidden_size = [2, 90]


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# class MLP(nn.Module):
#     def __init__(self):
#         super(MLP, self).__init__()
#         self.MLP= nn.Sequential(
#         nn.Linear(input_size, hidden_size[0]),
#         nn.ReLU(),
#         nn.Dropout(0.01),
#         nn.Linear(hidden_size[0], hidden_size[1]),
#         nn.ReLU(),
#         nn.Dropout(0.01),
#         nn.Linear(hidden_size[1], hidden_size[2]))
#         #nn.Softmax())
#
#     def forward(self, x):
#         x=self.MLP(x)
#         return x

class DNN(nn.Module):
    def __init__(self):
        super(DNN, self).__init__()
        self.layers = nn.ModuleList()

        # input layer
        self.layers.append(nn.Linear(input_size, hidden_size[1]))
        self.layers.append(nn.ReLU())

        # hidden layers
        for _ in range(hidden_size[0] - 1):
            self.layers.append(nn.Linear(hidden_size[1], hidden_size[1]))
            self.layers.append(nn.ReLU())

        # output layer
        self.layers.append(nn.Linear(hidden_size[1], num_classes))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)

        # Applying the softmax function to the output layer
        x = nn.functional.softmax(x, dim=1)
        return x
    

In [11]:
class DNNModel(object):
    def __init__(self, options, train_dataset, test_dataset, idxs, logger):
        
        self.options = options
        self.logger = logger
        self.train_loader = DataLoader(DatasetSplit(train_dataset, idxs),
                             batch_size=options.batch_size, shuffle=True)
        self.test_loader= DataLoader(test_dataset, batch_size= options.batch_size, shuffle=False)

        #size=sys.getsizeof(self.train_loader)
        #print("data_user", size)
        self.device = 'cuda' if options.gpu else 'cpu'
        # Default criterion set to NLL loss function
        #self.criterion = nn.NLLLoss()
        self.criterion = nn.CrossEntropyLoss()
        #self.criterion= nn.BCELoss()
        self.client_epochs=options.client_epochs
        self.net=DNN().to(device)
        self.optimizer = optim.Adam(self.net.parameters(), lr=0.01)

        self.history = {}
        self.history['train_loss'] = []
        self.history['test_loss'] = []

    def train(self, model):
        mean_losses_superv = []
        #self.net.train()
        total = 0
        correct = 0
        for epoch in range(self.options.client_epochs):
            h = np.array([])
            
            for x, y in self.train_loader:
                
                self.optimizer.zero_grad()

                x=x.float()
                
                output = self.net(x)
                
                y=y.long()
                
                loss = self.criterion(output, y)
                h = np.append(h, loss.item())
                #raise
                
        # ===================backward====================
                loss.backward()
                self.optimizer.step()
                output = output.argmax(axis=1)
                
                total += y.size(0)
                
                y=y.float()
                output=output.float()
                
                correct += (output == y).sum().item()
                
        #raise
        # ===================log========================
            mean_loss_superv = np.mean(h)
            train_acc=correct/total
            
            mean_losses_superv.append(mean_loss_superv)
            PATH = "state_dict_model_Sup_IIoT_edge.pt"
            # Save
            torch.save(self.net.state_dict(), PATH)
            return sum(mean_losses_superv) / len(mean_losses_superv) , train_acc, self.net.state_dict() 
            #print('Done.....')

   
    def test_inference(self, model, test_dataset):
        nb_classes = 15
        confusion_matrix = np.zeros((nb_classes, nb_classes))
        self.net.eval()
        test_loss = 0
        correct = 0
        total = 0
        output_list=torch.zeros(0,dtype=torch.long)
        target_list=torch.zeros(0,dtype=torch.long)
        with torch.no_grad():
            
            for data, target in self.test_loader:
                
                data, target = data.to(device), target.to(device)
        
                
                output = self.net(data.float())
                
                batch_loss = self.criterion(output, target.long())
                #print("done... test...")
                #raise
                test_loss += batch_loss.item()
                total += target.size(0)
                
                

                target=target.float()
                
                output=output.argmax(axis=1)
                output=output.float()
                
                
                output_list=torch.cat([output_list, output.view(-1).long()])
                target_list=torch.cat([target_list, target.view(-1).long()])
                
        
                correct += (output == target).sum().item()
        
                    
            test_loss/=total
            acc=correct/total
       
    
            F1_score= f1_score(target_list, output_list, average = "macro") #labels=np.unique(output_list))))
            Precision=precision_score(target_list, output_list, average="macro")                                      
            Recall=recall_score(target_list, output_list, average="macro")
            class_report=classification_report(target_list,output_list, digits=4)

            #print(' F1 Score : ' + str(f1_score(target_list, output_list, average = "macro"))) #labels=np.unique(output_list))))
            #print(' Presicion : '+str(precision_score(target_list, output_list, average="macro", labels=np.unique(output_list))))                                       
            #print(' Recall : '+str(recall_score(target_list, output_list, average="macro", labels=np.unique(output_list))))
            #print("report", classification_report(target_list,output_list, digits=4))

            return acc, F1_score, Precision, Recall, class_report, test_loss



In [12]:
logger = SummaryWriter('../logs')
#load data
train_dataset, test_dataset,user_groups=get_dataset(options)


Attack_type
Normal                   1615643
DDoS_UDP                  121568
DDoS_ICMP                 116436
SQL_injection              51203
Password                   50153
Vulnerability_scanner      50110
DDoS_TCP                   50062
DDoS_HTTP                  49911
Uploading                  37634
Backdoor                   24862
Port_Scanning              22564
XSS                        15915
Ransomware                 10925
MITM                        1214
Fingerprinting              1001
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 1909671 entries, 0 to 2219193
Data columns (total 97 columns):
 #   Column                                                                                                Dtype  
---  ------                                                                                                -----  
 0   arp.opcode                                                                                            float64
 1   arp.hw.si

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [None]:

# BUILD MODEL
#global_model=MLPModel(options=options, train_dataset=train_dataset, test_dataset=test_dataset, logger=logger)
DNN_model=DNN()
print(DNN_model)
# Training
train_loss, train_accuracy = [], []
val_acc_list, net_list = [], []
cv_loss, cv_acc = [], []
print_every = 2
val_loss_pre, counter = 0, 0

for rounds in tqdm(range(options.rounds)):
    # in the server
    local_weights, local_losses = [], []
    print(f'\n | Training Round : {rounds+1} |\n')
    
    #global_model.train(auto_encoder_model)
    m = max(int(options.frac * options.num_users), 1)
    idxs_users = np.random.choice(range(options.num_users), m, replace=False)
    print("idxs_users",idxs_users)

    
    
    for idx in idxs_users:
        DNN_client = DNNModel(options=options, train_dataset=train_dataset, test_dataset=test_dataset,
                                    idxs=user_groups[idx], logger=logger)
        loss, train_acc, w = DNN_client.train(model=copy.deepcopy(DNN_model))
        #print(w)
        #print("w", w)
        #print("loss", loss)
        local_weights.append(copy.deepcopy(w))
        #local_losses.append(copy.deepcopy(loss))
        
        test_acc, F1_score, Precision, Recall, class_report, test_loss = DNN_client.test_inference(DNN_model, test_dataset)
        print(f'client_id {idx}')
        print("|---- Test Accuracy_client: {:.2f}%".format(test_acc))
        print("|---- F1_score:", F1_score)
        print("|---- Precision:", Precision)
        print("|---- Recall:", Recall)
        print(class_report)
        print(f'Testing Loss : {np.mean(np.array(test_loss))}')
    #print(local_weights)
    DNN_model.load_state_dict(average_weights(local_weights))
    #loss_avg = sum(local_losses) / len(local_losses)
    #train_loss.append(loss_avg)
    
    #mean_losses_superv = global_model.train(auto_encoder_model)
    #mean_losses_superv_avg = sum(mean_losses_superv) / len(mean_losses_superv)
    
    
   
    # Calculate avg training accuracy over all users at every epoch
    #list_acc, list_loss = [], []
    #auto_encoder_model.eval()
    #for c in range(options.num_users):
        #local_model = AutoEncoder(options=options, dataset=train_dataset,
                                       #idxs=user_groups[idx], logger=logger)
        #acc, loss = auto_encoder.inference(model=auto_encoder_model)
        #list_acc.append(acc)
        #list_loss.append(loss)
    #train_accuracy.append(sum(list_acc)/len(list_acc))

    # print global training loss after every 'i' rounds
    #if (round+1) % print_every == 0:
        #print(f' \nAvg Training Stats after {round+1} global rounds:')
        #print(f'Training Loss : {np.mean(np.array(train_loss))}')
        #print('Train Accuracy: {:.2f}% \n'.format(100*train_accuracy[-1]))
#print("yes")

    test_acc_g, F1_score_g, Precision_g, Recall_g, class_report_g, test_loss_g = DNN_client.test_inference(DNN_model, test_dataset)

    
    #if (rounds+1) % print_every == 0:
    print(f' \nAvg Training Stats after {rounds+1} global rounds:')
    print("|---- Test Accuracy:", test_acc_g)
    print("|---- F1_score:", F1_score_g)
    print("|---- Precision:", Precision_g)
    print("|---- Recall:", Recall_g)
    print(class_report_g)
    #print(f'Training Loss : {np.mean(np.array(mean_losses_superv_avg))}')
    print(f'Testing Loss : {np.mean(np.array(test_loss))}')
    
   
        #print(f' \n Results after {options.rounds} global rounds of training:')
#print("|---- Avg Train Accuracy: {:.2f}%".format(100*train_accuracy[-1]))
        #print(f'Training Loss : {np.mean(np.array(mean_losses_superv_avg))}')
        #print(f'Testing Loss : {np.mean(np.array(test_loss))}')
        
        #print("yes")