In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import flwr as fl
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
import json
import os

# Configuration dictionary
CONFIG = {
    "seed": 42,
    "num_rounds": 5,
    "num_clients": 6,
    "epochs_per_round": 1,
    "batch_size": 8,
    "learning_rate": 2e-5,
    "max_length": 256,
    "model_name": "xlm-roberta-base",
    "test_size": 0.2,
    "spam_ratio": 0.8,  # For non-IID distribution
    "results_file": "federated_learning_results.json",
    "confusion_matrix_file": "confusion_matrix.png",
    "metrics_chart_file": "metrics_over_rounds.png",
    "loss_accuracy_chart_file": "loss_accuracy_over_rounds.png"
}

# Set random seeds
np.random.seed(CONFIG["seed"])
torch.manual_seed(CONFIG["seed"])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CONFIG["seed"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset class
class BanglaSpamDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])  # Ensure text is a string
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=CONFIG["max_length"],
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Load and preprocess data
def load_data():
    # Google Sheet CSV export URL
    sheet_id = "1TL5gYe07k4ZF5HoLHia64xs5J6JMOPxCFQEVAzNseTw"
    tab_name = "data"
    csv_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={tab_name}"

    # Load directly from Google Sheets
    df = pd.read_csv(csv_url)

    # Ensure correct column names
    df.columns = ["label", "text"]

    # Convert all texts to strings to avoid type issues
    df["text"] = df["text"].astype(str)

    texts = df["text"].tolist()
    labels = df["label"].tolist()

    print("Label classes:", set(labels))
    print(f"Total samples: {len(texts)}")
    print(f"Class distribution: {pd.Series(labels).value_counts().to_dict()}")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels,
        test_size=CONFIG["test_size"],
        stratify=labels,
        random_state=CONFIG["seed"]
    )
    return (X_train, y_train), (X_test, y_test)

# Create non-IID partitions
def create_non_iid_clients(X_train, y_train):
    spam_indices = np.where(np.array(y_train) == 1)[0]
    ham_indices = np.where(np.array(y_train) == 0)[0]
    clients_data = []
    client_distributions = []

    # Create spam-dominant clients
    for i in range(CONFIG["num_clients"] // 2):
        spam_size = int(len(spam_indices) * CONFIG["spam_ratio"] / (CONFIG["num_clients"] // 2))
        ham_size = int(len(ham_indices) * (1 - CONFIG["spam_ratio"]) / (CONFIG["num_clients"] // 2))

        spam_samples = np.random.choice(spam_indices, spam_size, replace=False)
        ham_samples = np.random.choice(ham_indices, ham_size, replace=False)

        client_indices = np.concatenate([spam_samples, ham_samples])
        clients_data.append(client_indices)
        
        # Store distribution info
        client_labels = [y_train[idx] for idx in client_indices]
        distribution = {
            "client_id": i,
            "spam_percentage": sum(client_labels) / len(client_labels) * 100,
            "ham_percentage": (1 - sum(client_labels) / len(client_labels)) * 100,
            "total_samples": len(client_indices)
        }
        client_distributions.append(distribution)
        
        spam_indices = np.setdiff1d(spam_indices, spam_samples)
        ham_indices = np.setdiff1d(ham_indices, ham_samples)

    # Create ham-dominant clients
    for i in range(CONFIG["num_clients"] // 2, CONFIG["num_clients"]):
        ham_size = int(len(ham_indices) * CONFIG["spam_ratio"] / (CONFIG["num_clients"] // 2))
        spam_size = int(len(spam_indices) * (1 - CONFIG["spam_ratio"]) / (CONFIG["num_clients"] // 2))

        ham_samples = np.random.choice(ham_indices, ham_size, replace=False)
        spam_samples = np.random.choice(spam_indices, spam_size, replace=False)

        client_indices = np.concatenate([ham_samples, spam_samples])
        clients_data.append(client_indices)
        
        # Store distribution info
        client_labels = [y_train[idx] for idx in client_indices]
        distribution = {
            "client_id": i,
            "spam_percentage": sum(client_labels) / len(client_labels) * 100,
            "ham_percentage": (1 - sum(client_labels) / len(client_labels)) * 100,
            "total_samples": len(client_indices)
        }
        client_distributions.append(distribution)
        
        ham_indices = np.setdiff1d(ham_indices, ham_samples)
        spam_indices = np.setdiff1d(spam_indices, spam_samples)

    # Print client data distributions
    print("\nClient Data Distributions:")
    for dist in client_distributions:
        print(f"Client {dist['client_id']}: {dist['total_samples']} samples - "
              f"Spam: {dist['spam_percentage']:.1f}%, Ham: {dist['ham_percentage']:.1f}%")
    
    return clients_data

# Modified Flower client class
class FlowerClient(fl.client.NumPyClient):
    def __init__(self, train_loader, client_id):
        self.train_loader = train_loader
        self.client_id = client_id
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            CONFIG["model_name"], num_labels=2
        ).to(device)
        self.optimizer = AdamW(
            self.model.parameters(),
            lr=CONFIG["learning_rate"]
        )
        self.training_metrics = []

    def get_parameters(self, config):
        return [val.cpu().numpy() for _, val in self.model.state_dict().items()]

    def fit(self, parameters, config):
        self.set_parameters(parameters)
        self.model.train()

        epoch_metrics = {"client_id": self.client_id, "loss": 0, "samples": 0}
        
        for epoch in range(CONFIG["epochs_per_round"]):
            running_loss = 0.0
            samples = 0
            
            for batch in self.train_loader:
                self.optimizer.zero_grad()
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                
                # Accumulate metrics
                running_loss += loss.item() * labels.size(0)
                samples += labels.size(0)

            # Record metrics for this epoch
            epoch_metrics["loss"] = running_loss / samples
            epoch_metrics["samples"] = samples
            
        self.training_metrics.append(epoch_metrics)
        print(f"Client {self.client_id} - Loss: {epoch_metrics['loss']:.4f}")
        
        return self.get_parameters({}), len(self.train_loader.dataset), {"client_id": self.client_id, "loss": epoch_metrics["loss"]}

    def set_parameters(self, parameters):
        params_dict = zip(self.model.state_dict().keys(), parameters)
        state_dict = {k: torch.tensor(v) for k, v in params_dict}
        self.model.load_state_dict(state_dict, strict=True)

# Evaluation function with detailed metrics
def evaluate_model(model, data_loader, return_predictions=False):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    
    if return_predictions:
        return accuracy, all_preds, all_labels
    return accuracy

# Function to generate and save confusion matrix
def generate_confusion_matrix(y_true, y_pred, save_path=None):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    
    # Create confusion matrix heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Ham (0)', 'Spam (1)'], 
                yticklabels=['Ham (0)', 'Spam (1)'])
    
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    
    if save_path:
        plt.savefig(save_path)
        print(f"Confusion matrix saved to {save_path}")
    plt.close()  # Close the figure to prevent display in non-interactive mode
    
    # Calculate metrics from confusion matrix
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    metrics = {
        "true_negative": int(tn),
        "false_positive": int(fp),
        "false_negative": int(fn),
        "true_positive": int(tp),
        "precision": float(precision),
        "recall": float(recall),
        "f1_score": float(f1),
        "accuracy": float((tp + tn) / (tp + tn + fp + fn))
    }
    
    return metrics

# Server-side evaluation with detailed metrics
def server_evaluate(
    server_round: int,
    parameters: fl.common.NDArrays,
    config: Dict[str, fl.common.Scalar],
    test_loader,
    X_test,
    y_test,
    metrics_history
):
    """Evaluate model parameters on the test dataset."""
    model = XLMRobertaForSequenceClassification.from_pretrained(
        CONFIG["model_name"], num_labels=2
    ).to(device)
    params_dict = zip(model.state_dict().keys(), parameters)
    state_dict = {k: torch.tensor(v) for k, v in params_dict}
    model.load_state_dict(state_dict)

    # Get accuracy and predictions
    accuracy, all_preds, all_labels = evaluate_model(model, test_loader, return_predictions=True)
    
    # Generate classification report
    report = classification_report(all_labels, all_preds, target_names=['Ham', 'Spam'], output_dict=True, zero_division=0)
    
    # Generate confusion matrix and metrics - only save on final round
    cm_metrics = generate_confusion_matrix(
        all_labels, all_preds, 
        save_path=CONFIG["confusion_matrix_file"] if server_round == CONFIG["num_rounds"] else None
    )
        
    # Store metrics for this round
    round_metrics = {
        "round": server_round,
        "accuracy": accuracy,
        "confusion_matrix_metrics": cm_metrics,
        "classification_report": report
    }
    metrics_history.append(round_metrics)
    
    # Save metrics history to file
    with open(CONFIG["results_file"], 'w') as f:
        json.dump(metrics_history, f, indent=2)
    
    # Advanced error analysis on misclassifications - final round only
    if server_round == CONFIG["num_rounds"]:
        misclassified_indices = [i for i, (true, pred) in enumerate(zip(all_labels, all_preds)) if true != pred]
        num_samples = min(5, len(misclassified_indices))
        
        if num_samples > 0:
            print("\nMisclassification Analysis:")
            print(f"Showing {num_samples} of {len(misclassified_indices)} misclassified examples:")
            
            for i in range(num_samples):
                idx = misclassified_indices[i]
                # Make sure text is a string and handle potential type issues
                text = str(X_test[idx])
                text_display = text[:100] + "..." if len(text) > 100 else text
                true_label = "Spam" if all_labels[idx] == 1 else "Ham"
                pred_label = "Spam" if all_preds[idx] == 1 else "Ham"
                print(f"\nExample {i+1}:")
                print(f"Text: {text_display}")
                print(f"True: {true_label}, Predicted: {pred_label}")
    
    # Print round summary
    print(f"\nRound {server_round} - Evaluation Results:")
    print(f"- Accuracy: {accuracy:.4f}")
    print(f"- Precision (Spam): {cm_metrics['precision']:.4f}")
    print(f"- Recall (Spam): {cm_metrics['recall']:.4f}")
    print(f"- F1 Score (Spam): {cm_metrics['f1_score']:.4f}")
    
    return accuracy

# Function to analyze and visualize results
def analyze_results(client_metrics=None, results_file=CONFIG["results_file"]):
    try:
        with open(results_file, 'r') as f:
            results = json.load(f)
            
        # Plot metrics across rounds
        rounds = [r["round"] for r in results]
        accuracy = [r["accuracy"] for r in results]
        precision = [r["confusion_matrix_metrics"]["precision"] for r in results]
        recall = [r["confusion_matrix_metrics"]["recall"] for r in results]
        f1 = [r["confusion_matrix_metrics"]["f1_score"] for r in results]
        
        # Create the metrics chart
        plt.figure(figsize=(12, 8))
        plt.plot(rounds, accuracy, 'o-', label='Accuracy')
        plt.plot(rounds, precision, 's-', label='Precision')
        plt.plot(rounds, recall, '^-', label='Recall')
        plt.plot(rounds, f1, 'D-', label='F1 Score')
        
        plt.xlabel('Round')
        plt.ylabel('Score')
        plt.title('Federated Learning Performance Metrics')
        plt.legend()
        plt.grid(True)
        plt.savefig(CONFIG['metrics_chart_file'])
        plt.close()
        print(f"Performance metrics chart saved to {CONFIG['metrics_chart_file']}")
        
        # Create a loss and accuracy chart if client metrics are available
        if client_metrics:
            plt.figure(figsize=(15, 6))
            
            # Create a subplot for loss
            plt.subplot(1, 2, 1)
            for client_id, metrics in client_metrics.items():
                rounds = list(range(1, len(metrics) + 1))
                loss_values = [m.get('loss', 0) for m in metrics]
                plt.plot(rounds, loss_values, 'o-', label=f'Client {client_id}')
            
            plt.xlabel('Round')
            plt.ylabel('Loss')
            plt.title('Training Loss by Client')
            plt.legend()
            plt.grid(True)
            
            # Create a subplot for accuracy
            plt.subplot(1, 2, 2)
            plt.plot(rounds, accuracy, 'D-', linewidth=2, color='black', label='Global Model')
            
            plt.xlabel('Round')
            plt.ylabel('Accuracy')
            plt.title('Global Model Accuracy')
            plt.legend()
            plt.grid(True)
            
            plt.tight_layout()
            plt.savefig(CONFIG['loss_accuracy_chart_file'])
            plt.close()
            print(f"Loss and accuracy chart saved to {CONFIG['loss_accuracy_chart_file']}")
        
        # Generate the results table
        print("\n" + "="*80)
        print(" " * 20 + "FEDERATED LEARNING RESULTS SUMMARY")
        print("="*80)
        
        # Header
        print(f"{'Round':<10}{'Accuracy':<15}{'Precision':<15}{'Recall':<15}{'F1 Score':<15}")
        print("-"*70)
        
        # Data rows
        for r in results:
            print(f"{r['round']:<10}{r['accuracy']:.4f}{'':10}{r['confusion_matrix_metrics']['precision']:.4f}{'':9}{r['confusion_matrix_metrics']['recall']:.4f}{'':9}{r['confusion_matrix_metrics']['f1_score']:.4f}")
        
        print("-"*70)
        
        # Print final round summary
        final_round = results[-1]
        print("\nFinal Model Performance Summary:")
        print(f"- Accuracy: {final_round['accuracy']:.4f}")
        print(f"- Precision (Spam): {final_round['confusion_matrix_metrics']['precision']:.4f}")
        print(f"- Recall (Spam): {final_round['confusion_matrix_metrics']['recall']:.4f}")
        print(f"- F1 Score (Spam): {final_round['confusion_matrix_metrics']['f1_score']:.4f}")
        
        # Class-specific metrics
        print("\nClass-specific Metrics:")
        for cls in ['Ham', 'Spam']:
            print(f"\n{cls} Class:")
            print(f"- Precision: {final_round['classification_report'][cls]['precision']:.4f}")
            print(f"- Recall: {final_round['classification_report'][cls]['recall']:.4f}") 
            print(f"- F1-Score: {final_round['classification_report'][cls]['f1-score']:.4f}")
            
    except FileNotFoundError:
        print(f"Results file {results_file} not found. Run the federated learning simulation first.")
    except Exception as e:
        print(f"Error analyzing results: {e}")

# Modified implementation to avoid Ray transport errors
def run_federated_learning_simulation():
    # Clear previous results if they exist
    if os.path.exists(CONFIG["results_file"]):
        os.remove(CONFIG["results_file"])
    
    # Load and prepare data
    (X_train, y_train), (X_test, y_test) = load_data()
    tokenizer = XLMRobertaTokenizer.from_pretrained(CONFIG["model_name"])

    # Create test loader
    test_dataset = BanglaSpamDataset(X_test, y_test, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=CONFIG["batch_size"], shuffle=False)

    # Create non-IID clients
    client_indices = create_non_iid_clients(X_train, y_train)
    
    # Initialize clients
    clients = []
    for i, indices in enumerate(client_indices):
        client_texts = [X_train[idx] for idx in indices]
        client_labels = [y_train[idx] for idx in indices]
        dataset = BanglaSpamDataset(client_texts, client_labels, tokenizer)
        loader = DataLoader(dataset, batch_size=CONFIG["batch_size"], shuffle=True)
        clients.append(FlowerClient(loader, i))
    
    # Initialize server model
    server_model = XLMRobertaForSequenceClassification.from_pretrained(
        CONFIG["model_name"], num_labels=2
    ).to(device)
    
    # Get initial parameters
    server_parameters = [val.cpu().numpy() for _, val in server_model.state_dict().items()]
    
    # Track metrics
    metrics_history = []
    client_metrics = defaultdict(list)
    
    # Simulation loop
    print(f"\nStarting Federated Learning simulation with {CONFIG['num_clients']} clients")
    print(f"Running for {CONFIG['num_rounds']} rounds with {CONFIG['epochs_per_round']} epochs per round")
    print(f"Device: {device}")
    
    for server_round in range(1, CONFIG["num_rounds"] + 1):
        print(f"\nRound {server_round}/{CONFIG['num_rounds']}")
        
        # Client-side training
        client_results = []
        for client_idx, client in enumerate(clients):
            print(f"Training client {client_idx}...")
            parameters, num_examples, client_metrics_dict = client.fit(server_parameters, {})
            client_results.append((parameters, num_examples))
            
            # Store client metrics
            client_metrics[client_idx].append(client_metrics_dict)
        
        # Aggregate parameters (FedAvg)
        if client_results:
            weights = [num_examples for _, num_examples in client_results]
            total_examples = sum(weights)
            weighted_parameters = [
                [layer * weight / total_examples for layer in parameters]
                for parameters, weight in zip(
                    [parameters for parameters, _ in client_results], weights
                )
            ]
            
            # Sum up weighted parameters
            server_parameters = [
                np.sum(
                    [params[i] for params in weighted_parameters], axis=0
                )
                for i in range(len(weighted_parameters[0]))
            ]
        
        # Server-side evaluation
        accuracy = server_evaluate(
            server_round=server_round,
            parameters=server_parameters,
            config={},
            test_loader=test_loader,
            X_test=X_test,
            y_test=y_test,
            metrics_history=metrics_history
        )
        
    # Final analysis
    print("\nSimulation complete! Analyzing results...")
    analyze_results(client_metrics=client_metrics)

# Main execution
if __name__ == "__main__":
    run_federated_learning_simulation()

Label classes: {0, 1}
Total samples: 5667
Class distribution: {0: 4299, 1: 1368}

Client Data Distributions:
Client 0: 520 samples - Spam: 56.0%, Ham: 44.0%
Client 1: 427 samples - Spam: 50.1%, Ham: 49.9%
Client 2: 356 samples - Spam: 44.1%, Ham: 55.9%
Client 3: 774 samples - Spam: 3.6%, Ham: 96.4%
Client 4: 573 samples - Spam: 4.5%, Ham: 95.5%
Client 5: 426 samples - Spam: 5.9%, Ham: 94.1%


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You sho


Starting Federated Learning simulation with 6 clients
Running for 5 rounds with 1 epochs per round
Device: cpu

Round 1/5
Training client 0...
Client 0 - Loss: 0.6549
Training client 1...
Client 1 - Loss: 0.6554
Training client 2...
Client 2 - Loss: 0.6754
Training client 3...
Client 3 - Loss: 0.2440
Training client 4...
Client 4 - Loss: 0.2737
Training client 5...
Client 5 - Loss: 0.3478


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Round 1 - Evaluation Results:
- Accuracy: 0.7584
- Precision (Spam): 0.0000
- Recall (Spam): 0.0000
- F1 Score (Spam): 0.0000

Round 2/5
Training client 0...
Client 0 - Loss: 0.7250
Training client 1...
Client 1 - Loss: 0.6689
Training client 2...
Client 2 - Loss: 0.7141
Training client 3...
Client 3 - Loss: 0.1701
Training client 4...
Client 4 - Loss: 0.2253
Training client 5...
Client 5 - Loss: 0.2462


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Round 2 - Evaluation Results:
- Accuracy: 0.8034
- Precision (Spam): 0.7339
- Recall (Spam): 0.2920
- F1 Score (Spam): 0.4178

Round 3/5
Training client 0...
Client 0 - Loss: 0.7236
Training client 1...
Client 1 - Loss: 0.6659
Training client 2...
Client 2 - Loss: 0.6879
Training client 3...
Client 3 - Loss: 0.1804
Training client 4...
Client 4 - Loss: 0.1871
Training client 5...
Client 5 - Loss: 0.2417


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Round 3 - Evaluation Results:
- Accuracy: 0.7804
- Precision (Spam): 0.6263
- Recall (Spam): 0.2263
- F1 Score (Spam): 0.3324

Round 4/5
Training client 0...
Client 0 - Loss: 0.6535
Training client 1...
Client 1 - Loss: 0.6458
Training client 2...
Client 2 - Loss: 0.6986
Training client 3...
Client 3 - Loss: 0.1670
Training client 4...
Client 4 - Loss: 0.1848
Training client 5...
Client 5 - Loss: 0.2412


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Round 4 - Evaluation Results:
- Accuracy: 0.8236
- Precision (Spam): 0.9625
- Recall (Spam): 0.2810
- F1 Score (Spam): 0.4350

Round 5/5
Training client 0...
Client 0 - Loss: 0.6652
Training client 1...
Client 1 - Loss: 0.6886
Training client 2...
Client 2 - Loss: 0.6844
Training client 3...
Client 3 - Loss: 0.1660
Training client 4...
Client 4 - Loss: 0.1532
Training client 5...
Client 5 - Loss: 0.2364


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Confusion matrix saved to confusion_matrix.png

Misclassification Analysis:
Showing 5 of 250 misclassified examples:

Example 1:
Text: nan
True: Spam, Predicted: Ham

Example 2:
Text: nan
True: Spam, Predicted: Ham

Example 3:
Text: 
বিষয়ঃ একমাত্র সফ্টওয়্যার যা আমরা প্রতিশ্রুতি দিতে পারি তা 100% বৈধ। খুব সস্তা দামে নাম-ব্র্যান্ড ...
True: Spam, Predicted: Ham

Example 4:
Text: বিষয়: অংশীদারি
মি. এডওয়ার্ড মোকো
18 ইন্ডিপেন্ডেন্স ক্লোজ, জোহানেসবার্গ, দক্ষিণ আফ্রিকা।

প্রিয় স্...
True: Spam, Predicted: Ham

Example 5:
Text: ### ব্যবস্থাপনা:
True: Spam, Predicted: Ham

Round 5 - Evaluation Results:
- Accuracy: 0.7795
- Precision (Spam): 1.0000
- Recall (Spam): 0.0876
- F1 Score (Spam): 0.1611

Simulation complete! Analyzing results...
Performance metrics chart saved to metrics_over_rounds.png
Loss and accuracy chart saved to loss_accuracy_over_rounds.png

                    FEDERATED LEARNING RESULTS SUMMARY
Round     Accuracy       Precision      Recall         F1 Score       
------

In [9]:
import matplotlib.pyplot as plt
import numpy as np

def plot_loss_accuracy(metrics_history, client_metrics, config=CONFIG):
    # Gather rounds
    rounds = [r['round'] for r in metrics_history]
    # Global accuracy per round (already collected)
    accuracy = [r['accuracy'] for r in metrics_history]
    # Average client loss per round
    avg_client_loss = []
    num_clients = max(client_metrics.keys()) + 1

    # Gather average loss for each round across all clients
    for rnd in range(len(rounds)):
        losses = []
        for client_id in client_metrics:
            if len(client_metrics[client_id]) > rnd:
                losses.append(client_metrics[client_id][rnd]['loss'])
        avg_loss = np.mean(losses) if losses else 0
        avg_client_loss.append(avg_loss)

    # Plot
    plt.figure(figsize=(10,6))
    plt.plot(rounds, accuracy, 'o-', label="Global Test Accuracy")
    plt.plot(rounds, avg_client_loss, 's-', label="Average Client Training Loss")
    plt.xlabel('Round')
    plt.title("Federated Learning: Test Accuracy & Avg Client Loss per Round")
    plt.ylabel('Score / Loss')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(config['loss_accuracy_chart_file'])
    plt.show()
    print(f"Loss and accuracy plot saved to {config['loss_accuracy_chart_file']}")