<a href="https://colab.research.google.com/github/dks11/Research/blob/main/Research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell command: install the third-party packages used below.
# `--pre` allows pip to select pre-release versions of the listed packages.
!pip install --pre flwr[simulation] torch torchvision matplotlib transformers datasets

In [3]:
from collections import OrderedDict
from typing import List
from datetime import datetime

import flwr as fl
import numpy as np
import matplotlib.pyplot as plt
import sys
import pandas as pd

import torch
import torch.nn as nn
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import CIFAR10
from torch.optim import AdamW
from datasets import load_dataset, load_metric, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification


In [4]:
# ---------------------------------------------------------------------------
# Global run configuration
# ---------------------------------------------------------------------------

# Release any cached CUDA memory left over from previously executed cells.
torch.cuda.empty_cache()

# Timestamp taken when this cell runs.
start_time = datetime.now()

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

CHECKPOINT = "bert-base-uncased"  # transformer model checkpoint

NUM_CLIENTS = 2  # number of clients passed to the simulation

In [5]:
def get_parameters(net) -> List[np.ndarray]:
    """Export every entry of ``net.state_dict()`` as a NumPy array.

    Args:
        net: A module exposing ``state_dict()``.

    Returns:
        One array per state-dict entry, in state-dict order. Each tensor is
        moved to the CPU before conversion.
    """
    arrays = []
    for tensor in net.state_dict().values():
        arrays.append(tensor.cpu().numpy())
    return arrays

def set_parameters(net, parameters: List[np.ndarray]):
    """Load a list of NumPy arrays into ``net`` in state-dict order.

    The arrays are paired positionally with ``net.state_dict().keys()``, so
    they must be in the same order that ``get_parameters`` produces.

    Args:
        net: Module whose weights and buffers are overwritten in place.
        parameters: One array per state-dict entry.

    Raises:
        RuntimeError: Propagated from ``load_state_dict(strict=True)`` when
            keys are missing or a shape does not match.
    """
    # torch.as_tensor keeps each array's dtype. The legacy torch.Tensor(v)
    # constructor converts everything to float32 first, which silently rounds
    # integer entries larger than 2**24 before they are copied into the model.
    state_dict = OrderedDict(
        (key, torch.as_tensor(value))
        for key, value in zip(net.state_dict().keys(), parameters)
    )
    net.load_state_dict(state_dict, strict=True)

In [6]:
class FLClient(fl.client.NumPyClient):
    """Flower NumPy client that trains and evaluates one local model.

    Attributes are set directly from the constructor arguments:
    ``net`` (the model), ``trainloader`` and ``testloader`` (data loaders
    exposing a sized ``.dataset``).
    """

    def __init__(self, net, trainloader, testloader):
        self.net = net
        self.trainloader = trainloader
        self.testloader = testloader

    def get_parameters(self, config):
        """Return the current local weights as a list of NumPy arrays."""
        return get_parameters(self.net)

    def fit(self, parameters, config):
        """Load the given weights, train for one epoch, return the update.

        Returns:
            ``(weights, num_examples, metrics)`` where ``num_examples`` is the
            size of the training dataset and ``metrics`` is empty.
        """
        set_parameters(self.net, parameters)
        train(self.net, self.trainloader, epochs=1)
        # Report the number of examples, not len(loader) (the number of
        # batches): the server weights client updates by this value.
        return get_parameters(self.net), len(self.trainloader.dataset), {}

    def evaluate(self, parameters, config):
        """Load the given weights and evaluate them on the local test data.

        Returns:
            ``(loss, num_examples, {"accuracy": accuracy})`` where
            ``num_examples`` is the size of the test dataset.
        """
        set_parameters(self.net, parameters)
        loss, accuracy = test(self.net, self.testloader)
        return (
            float(loss),
            len(self.testloader.dataset),
            {"accuracy": float(accuracy)},
        )

In [7]:
def clientID(id):
    """Map a client id to the CSV file holding that client's training data.

    Args:
        id: Client id; only the strings "1" through "6" are recognised.

    Returns:
        The matching file path, or ``None`` when the id is not recognised.
    """
    known_files = (
        ("1", r"/content/enron1.csv"),
        ("2", r"/content/enron2.csv"),
        ("3", r"/content/enron3.csv"),
        ("4", r"/content/enron4.csv"),
        ("5", r"/content/enron5.csv"),
        ("6", r"/content/enron6.csv"),
    )
    for key, path in known_files:
        if id == key:
            return path
    return None
            
            
def testID(id):
    if id == "1":
        return r"/content/enron1.csv"
    if id == "2":
        return r"/content/enron2.csv"
    if id == "3":
        return r"/content/enron3.csv"
    if id == "4":
        return r"/content/enron4.csv"
    if id == "5":
        return r"/content/enron5.csv"
    if id == "6":
        return r"/content/enron6.csv"

In [8]:
def _load_tokenized_csv(csv_path, tokenizer):
    """Read one CSV file and return it as a shuffled, tokenized dataset."""
    frame = pd.read_csv(csv_path)
    dataset = Dataset.from_pandas(frame).shuffle(seed=42)

    def tokenize_function(examples):
        return tokenizer(examples["message"], truncation=True)

    dataset = dataset.map(tokenize_function, batched=True)
    # Drop the raw text and expose the target column under the name "labels".
    dataset = dataset.remove_columns("message")
    return dataset.rename_column("spam/ham", "labels")


def load_data(cid):
    """Build the train and test data loaders for one client.

    Each CSV must contain a "message" column (tokenized text) and a
    "spam/ham" column (renamed to "labels"). Any other columns are kept and
    reach the collator unchanged -- presumably there are none; verify against
    the data files.

    Args:
        cid: Client id, resolved to file paths by ``clientID`` and ``testID``.

    Returns:
        ``(trainloader, testloader)``; both use batch size 32 and dynamic
        padding, and only the train loader reshuffles on iteration.

    Raises:
        ValueError: If no data file is registered for ``cid``.
    """
    train_csv = clientID(cid)
    test_csv = testID(cid)
    if train_csv is None or test_csv is None:
        # Fail early with a clear message instead of passing None to pandas.
        raise ValueError(f"No data file registered for client id {cid!r}")

    # Tokenizer and collator are shared by both splits.
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainloader = DataLoader(
        _load_tokenized_csv(train_csv, tokenizer),
        shuffle=True,
        batch_size=32,
        collate_fn=data_collator,
    )
    testloader = DataLoader(
        _load_tokenized_csv(test_csv, tokenizer),
        batch_size=32,
        collate_fn=data_collator,
    )

    return trainloader, testloader

In [9]:
def train(net, trainloader, epochs, lr=5e-5):
    """Fine-tune ``net`` in place with AdamW.

    Args:
        net: Model called as ``net(**batch)``; its output must expose ``.loss``.
        trainloader: Iterable of dict batches whose values are tensors.
        epochs: Number of full passes over ``trainloader``.
        lr: AdamW learning rate. Defaults to 5e-5, a common value for
            fine-tuning pretrained transformers; torch's own default of 1e-3
            is generally too large for that.
    """
    optimizer = AdamW(net.parameters(), lr=lr)
    net.train()
    for _ in range(epochs):
        for batch in trainloader:
            # Move every tensor of the batch to the globally selected device.
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = net(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


def test(net, testloader):
    """Evaluate ``net`` and return its mean loss and accuracy.

    Args:
        net: Model called as ``net(**batch)``; its output must expose
            ``.loss`` and ``.logits``.
        testloader: Iterable of dict batches containing a "labels" tensor.

    Returns:
        ``(loss, accuracy)``: the per-example mean loss and the fraction of
        correct argmax predictions. Both are 0.0 when the loader yields no
        examples.
    """
    total_loss = 0.0
    correct = 0
    seen = 0
    net.eval()
    with torch.no_grad():
        for batch in testloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = net(**batch)
            labels = batch["labels"]
            # Assumes labels has one entry per example along dim 0.
            batch_size = labels.size(0)
            # Assumes outputs.loss is the batch mean (verify for custom
            # models). Weighting by batch size makes the final division a
            # true per-example mean rather than a sum of batch means.
            total_loss += outputs.loss.item() * batch_size
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == labels).sum().item()
            seen += batch_size
    if seen == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0, 0.0
    return total_loss / seen, correct / seen

In [10]:
def client_fn(cid: str) -> FLClient:
    """Build the Flower client for the client id ``cid``.

    A fresh two-label sequence-classification model is loaded from
    ``CHECKPOINT`` and moved to ``DEVICE``, then paired with the data loaders
    that ``load_data`` returns for this id.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        CHECKPOINT, num_labels=2
    )
    model = model.to(DEVICE)

    loaders = load_data(cid)
    train_loader, test_loader = loaders

    return FLClient(model, train_loader, test_loader)

In [None]:
# Federated averaging over all clients for training and half for evaluation.
strategy = fl.server.strategy.FedAvg(
    fraction_fit=1.0,  # Sample 100% of available clients for training
    fraction_evaluate=0.5,  # Sample 50% of available clients for evaluation
)

# Derive the ids from NUM_CLIENTS so the two can never disagree. The ids are
# the 1-based strings that clientID/testID recognise.
client_ids = [str(index) for index in range(1, NUM_CLIENTS + 1)]

# Request one GPU per client when DEVICE is CUDA so the simulated clients are
# given GPU access; on CPU request none.
client_resources = {
    "num_cpus": 1,
    "num_gpus": 1.0 if DEVICE.type == "cuda" else 0.0,
}

fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=NUM_CLIENTS,
    clients_ids=client_ids,
    client_resources=client_resources,
    config=fl.server.ServerConfig(num_rounds=1),
    strategy=strategy,
)
    