In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import tensorflow as tf

# Load the CSV data
data = pd.read_csv("preprocessed_data.csv")

# Data cleanup and processing
# Drop rows with missing values
data = data.dropna()

# Initialize label encoder to encode the 'class' column
label_encoder = LabelEncoder()
data["class_encoded"] = label_encoder.fit_transform(data["label"])

# Initialize TF-IDF vectorizer for tweet text
vectorizer = TfidfVectorizer(max_features=100) 
X = vectorizer.fit_transform(data["post_text"]).toarray()
y = data["class_encoded"]

# Ensure 'y' is one-hot encoded
onehot_encoder = OneHotEncoder(sparse_output=False)
y = onehot_encoder.fit_transform(data["class_encoded"].values.reshape(-1, 1))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

import joblib




In [4]:
import numpy as np

# Define the number of clients (e.g., 10 clients)
num_clients = 5

# Create data shards (subsets) for each client
def create_clients(X, y, num_clients):
    # Shuffle the data
    data_size = len(X)
    shuffled_indices = np.random.permutation(data_size)

    # Create partitions
    shard_size = data_size // num_clients
    clients_data = {}

    for i in range(num_clients):
        start = i * shard_size
        end = start + shard_size if i != num_clients - 1 else data_size
        clients_data[f"client_{i + 1}"] = (X[shuffled_indices[start:end]], y[shuffled_indices[start:end]])

    return clients_data

# Create the clients' data
clients_data = create_clients(X_train, y_train, num_clients)

# Display an example of client data
print(clients_data["client_1"][0].shape)  # Features
print(clients_data["client_1"][1].shape)  # Labels


(5390, 100)
(5390, 3)


In [5]:
# Function to calculate the scaling factor for model weights
def weight_scaling_factor(client_data, global_data):
    client_size = len(client_data)
    global_size = len(global_data)
    return client_size / global_size

# Function to scale model weights based on the scaling factor
def scale_model_weights(model_weights, scaling_factor):
    return [weight * scaling_factor for weight in model_weights]

# Function to sum scaled model weights to create the new global model
def sum_scaled_weights(scaled_weights):
    new_weights = [np.zeros_like(w) for w in scaled_weights[0]]
    for sw in scaled_weights:
        for i in range(len(sw)):
            new_weights[i] += sw[i]
    return new_weights


In [11]:
# Define a simple model function
def build_mlp_model(input_shape, num_classes):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(input_shape,)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

# Parameters for federated learning
num_global_rounds = 50 # Number of global communication rounds
local_epochs = 1  # Number of local training epochs
batch_size = 32  # Batch size for training
input_shape = X_train.shape[1]  # Input size (number of features)
num_classes = y_train.shape[1]  # Number of output classes

# Initialize the global model
global_model = build_mlp_model(input_shape, num_classes)

# Compile the global model
global_model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

# Training loop for federated learning
global_weights = global_model.get_weights()  # Initial global weights

# List to store test accuracy after each communication round
test_accuracies = []

for round in range(num_global_rounds):
    # List to store scaled weights from each client
    scaled_local_weights = []

    for client_name, (client_X, client_y) in clients_data.items():
        # Build and compile a new model for the client
        client_model = build_mlp_model(input_shape, num_classes)
        
        # Compile the client model
        client_model.compile(
            optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
            loss="categorical_crossentropy",
            metrics=["accuracy"]
        )

        # Set the global weights as the initial weights for this client model
        client_model.set_weights(global_weights)

        # Train the client model on its local data
        client_model.fit(client_X, client_y, batch_size=batch_size, epochs=local_epochs, verbose=0)

        # Get the new weights after training
        new_weights = client_model.get_weights()

        # Scale the new weights based on the client data proportion
        scaling_factor = weight_scaling_factor(client_X, X_train)
        scaled_weights = scale_model_weights(new_weights, scaling_factor)

        # Append the scaled weights to the list
        scaled_local_weights.append(scaled_weights)

    # Aggregate the scaled weights to update the global model
    global_weights = sum_scaled_weights(scaled_local_weights)
    global_model.set_weights(global_weights)

    # Test the global model after each communication round
    test_loss, test_accuracy = global_model.evaluate(X_test, y_test, verbose=0)
    test_accuracies.append(test_accuracy)
    print(f"Round {round + 1}: Test Accuracy = {test_accuracy * 100:.2f}%")

# Display the test accuracies for all communication rounds
print("Test Accuracies for each communication round:", test_accuracies)


Round 1: Test Accuracy = 42.44%
Round 2: Test Accuracy = 50.62%
Round 3: Test Accuracy = 54.09%
Round 4: Test Accuracy = 57.40%
Round 5: Test Accuracy = 58.93%
Round 6: Test Accuracy = 59.53%
Round 7: Test Accuracy = 59.97%
Round 8: Test Accuracy = 60.43%
Round 9: Test Accuracy = 60.83%
Round 10: Test Accuracy = 60.83%
Round 11: Test Accuracy = 61.14%
Round 12: Test Accuracy = 61.27%
Round 13: Test Accuracy = 61.30%
Round 14: Test Accuracy = 61.90%
Round 15: Test Accuracy = 61.77%
Round 16: Test Accuracy = 61.94%
Round 17: Test Accuracy = 62.04%
Round 18: Test Accuracy = 62.17%
Round 19: Test Accuracy = 62.17%
Round 20: Test Accuracy = 62.47%
Round 21: Test Accuracy = 62.74%
Round 22: Test Accuracy = 62.74%
Round 23: Test Accuracy = 62.90%
Round 24: Test Accuracy = 62.84%
Round 25: Test Accuracy = 62.77%
Round 26: Test Accuracy = 62.74%
Round 27: Test Accuracy = 62.74%
Round 28: Test Accuracy = 62.97%
Round 29: Test Accuracy = 63.17%
Round 30: Test Accuracy = 63.17%
Round 31: Test Accu