<a href="https://colab.research.google.com/github/dks11/Research/blob/main/Research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --pre -U flwr[simulation] torch torchvision matplotlib transformers datasets

In [21]:
from collections import OrderedDict
from typing import List
from datetime import datetime

import flwr as fl
import numpy as np
import matplotlib.pyplot as plt
import sys
import pandas as pd

from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import CIFAR10
from torch.optim import AdamW
from datasets import load_dataset, load_metric, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification


In [22]:
start_time = datetime.now()

#DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE = torch.device("cpu")
CHECKPOINT = "distilbert-base-multilingual-cased"  # transformer model checkpoint


NUM_CLIENTS = 2

In [23]:
def get_parameters(net) -> List[np.ndarray]:
    return [val.cpu().numpy() for _, val in net.state_dict().items()]

def set_parameters(net, parameters: List[np.ndarray]):
    params_dict = zip(net.state_dict().keys(), parameters)
    state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict})
    net.load_state_dict(state_dict, strict=True)

In [24]:
class FLClient(fl.client.NumPyClient):
        def __init__(self, net, trainloader, testloader):
            self.net = net
            self.trainloader = trainloader
            self.testloader = testloader
     
        def get_parameters(self, config):
            return get_parameters(self.net)

        def fit(self, parameters, config):
            set_parameters(self.net, parameters)
            train(self.net, self.trainloader, epochs=1)
            return get_parameters(self.net), len(self.trainloader), {}

        def evaluate(self, parameters, config):
          set_parameters(self.net, parameters)
          loss, accuracy = test(self.net, self.testloader)
          return float(loss), len(self.testloader), {"accuracy": float(accuracy)}

In [25]:
def clientID(id):
    if id == "1":
        return r"/content/urdu1.csv"
    if id == "2":
        return r"/content/urdu2.csv"
    if id == "3":
        return r"/content/urdu3.csv"
    if id == "4":
        return r"/content/urdu4.csv"
    if id == "5":
        return r"/content/urdu5.csv"
    if id == "6":
        return r"/content/urdu6.csv"
            
            
def testID(id):
    if id == "1":
        return r"/content/urdu1.csv"
    if id == "2":
        return r"/content/urdu2.csv"
    if id == "3":
        return r"/content/urdu3.csv"
    if id == "4":
        return r"/content/urdu4.csv"
    if id == "5":
        return r"/content/urdu5.csv"
    if id == "6":
        return r"/content/urdu6.csv"

In [26]:
def load_data(cid):
    csvFile = clientID(cid)
    
    df = pd.read_csv(csvFile)

    raw_datasets = Dataset.from_pandas(df)

    raw_datasets = raw_datasets.shuffle(seed=42)

    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

    def tokenize_function(examples):
        return tokenizer(examples["message"], truncation=True)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    tokenized_datasets = tokenized_datasets.remove_columns("message")
    tokenized_datasets = tokenized_datasets.rename_column("spam/ham", "labels")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainloader = DataLoader(
        tokenized_datasets,
        shuffle=True,
        batch_size=32,
        collate_fn=data_collator,
    )
    csvFile = testID(cid)
    
    df = pd.read_csv(csvFile)

    raw_datasets = Dataset.from_pandas(df)

    raw_datasets = raw_datasets.shuffle(seed=42)
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns("message")
    tokenized_datasets = tokenized_datasets.rename_column("spam/ham", "labels")


    testloader = DataLoader(
        tokenized_datasets, batch_size=32, collate_fn=data_collator
    )

    return trainloader, testloader

In [27]:
def train(net, trainloader, epochs):
    optimizer = AdamW(net.parameters())
    net.train()
    for _ in range(epochs):
        for batch in trainloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = net(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


def test(net, testloader):
    metric = load_metric("accuracy")
    loss = 0
    net.eval()
    for batch in testloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = net(**batch)
        logits = outputs.logits
        loss += outputs.loss.item()
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    loss /= len(testloader.dataset)
    accuracy = metric.compute()["accuracy"]
    return loss, accuracy

In [28]:
def client_fn(cid: str) -> FLClient:
    # Create model
    net = AutoModelForSequenceClassification.from_pretrained(
        CHECKPOINT, num_labels=2
    ).to(DEVICE)

    trainloader, testloader = load_data(cid)

    # Create and return client
    return FLClient(net, trainloader, testloader)

In [29]:
def evaluate(
    server_round: int, parameters: fl.common.NDArrays, config: Dict[str, fl.common.Scalar]
) -> Optional[Tuple[float, Dict[str, fl.common.Scalar]]]:
    net = AutoModelForSequenceClassification.from_pretrained(
        CHECKPOINT, num_labels=2
    ).to(DEVICE)

    trainloader, testloader = load_data("1")
    set_parameters(net, parameters)  # Update model with the latest parameters
    loss, accuracy = test(net, testloader)
    print(f"Server-side evaluation loss {loss} / accuracy {accuracy}")
    return loss, {"accuracy": accuracy}

In [None]:
strategy = fl.server.strategy.FedAvg(
    fraction_fit=0.1,
    fraction_evaluate=0.1,
    evaluate_fn=evaluate,  # Pass the evaluation function
)


fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=NUM_CLIENTS,
    config=fl.server.ServerConfig(num_rounds=2),
    clients_ids = ["1","2"],
    strategy=strategy,
)
    

INFO flower 2022-08-14 15:19:00,334 | app.py:145 | Starting Flower simulation, config: ServerConfig(num_rounds=2, round_timeout=None)
INFO:flower:Starting Flower simulation, config: ServerConfig(num_rounds=2, round_timeout=None)
INFO flower 2022-08-14 15:19:05,797 | app.py:179 | Flower VCE: Ray initialized with resources: {'object_store_memory': 8093771366.0, 'CPU': 4.0, 'node:172.28.0.2': 1.0, 'memory': 16187542734.0}
INFO:flower:Flower VCE: Ray initialized with resources: {'object_store_memory': 8093771366.0, 'CPU': 4.0, 'node:172.28.0.2': 1.0, 'memory': 16187542734.0}
INFO flower 2022-08-14 15:19:05,804 | server.py:86 | Initializing global parameters
INFO:flower:Initializing global parameters
INFO flower 2022-08-14 15:19:05,807 | server.py:270 | Requesting initial parameters from one random client
INFO:flower:Requesting initial parameters from one random client
[2m[36m(launch_and_get_parameters pid=2487)[0m Some weights of the model checkpoint at distilbert-base-multilingual-case

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

INFO flower 2022-08-14 15:21:22,209 | server.py:95 | initial parameters (loss, other metrics): 0.02042479970707343, {'accuracy': 0.8497596153846154}
INFO:flower:initial parameters (loss, other metrics): 0.02042479970707343, {'accuracy': 0.8497596153846154}
INFO flower 2022-08-14 15:21:22,214 | server.py:101 | FL starting
INFO:flower:FL starting
DEBUG flower 2022-08-14 15:21:22,219 | server.py:220 | fit_round 1: strategy sampled 2 clients (out of 2)
DEBUG:flower:fit_round 1: strategy sampled 2 clients (out of 2)


Server-side evaluation loss 0.02042479970707343 / accuracy 0.8497596153846154


[2m[36m(launch_and_fit pid=2487)[0m Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight']
[2m[36m(launch_and_fit pid=2487)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(launch_and_fit pid=2487)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(launch_and_fit pid=2487)[0m Some weights of DistilBertForSequenceClass

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

INFO flower 2022-08-14 15:31:56,257 | server.py:122 | fit progress: (1, 0.016978751957559816, {'accuracy': 0.8497596153846154}, 634.0375792660002)
INFO:flower:fit progress: (1, 0.016978751957559816, {'accuracy': 0.8497596153846154}, 634.0375792660002)
DEBUG flower 2022-08-14 15:31:56,260 | server.py:170 | evaluate_round 1: strategy sampled 2 clients (out of 2)
DEBUG:flower:evaluate_round 1: strategy sampled 2 clients (out of 2)


Server-side evaluation loss 0.016978751957559816 / accuracy 0.8497596153846154


[2m[36m(launch_and_evaluate pid=2487)[0m Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight']
[2m[36m(launch_and_evaluate pid=2487)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(launch_and_evaluate pid=2487)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(launch_and_evaluate pid=2487)[0m Some weights of Distil