In [8]:
# https://www.kaggle.com/code/kaanboke/beginner-friendly-end-to-end-ml-project-enjoy

import warnings

import flwr as fl

import logging

# Handle to the logger named 'flwr'; it is not referenced again in the cells visible here.
logger = logging.getLogger('flwr')

# Number of federated clients; load_data() derives this client's share of the dataset from it.
NUM_CLIENT = 5
# Index of this client among the NUM_CLIENT participants — presumably zero-based and
# meant to select the client's data shard; verify against load_data().
CLIENT_INDEX = 0

In [18]:
import numpy as np
from datasets import load_dataset
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, PowerTransformer

def load_data():
    """Load the stroke CSV and return this client's preprocessed train/test data.

    Pipeline:
      1. Read the CSV, drop the ``id`` column and separate the ``stroke`` label.
      2. Cut the rows into ``NUM_CLIENT`` disjoint, label-stratified shards and
         keep shard number ``CLIENT_INDEX`` (previously every client drew the
         same rows because the index was never used).
      3. Split the shard 90/10 (stratified) into train and test rows.
      4. Median-impute the numerical columns and one-hot encode the categorical
         ones, then apply a standardizing Yeo-Johnson transform to the numerical
         columns. All transformers are fitted on the training rows only.

    Columns listed in neither ``numerical`` nor ``categorical`` are discarded
    (ColumnTransformer's default ``remainder='drop'``).

    Returns:
        tuple: ``(X_train_transformed, y_train, X_test_transformed, y_test)``.
        The feature matrices are dense arrays with the numerical columns first;
        the labels are the corresponding slices of the ``stroke`` column.

    Raises:
        ValueError: if ``CLIENT_INDEX`` is not in ``range(NUM_CLIENT)``.
    """
    data_files = ["./stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"]
    dataset = load_dataset("csv", data_files=data_files)
    df = dataset['train'].to_pandas().drop('id', axis=1)
    y_all = df['stroke']
    X_all = df.drop('stroke', axis=1)

    categorical = ['hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
    numerical = ['avg_glucose_level', 'bmi', 'age']

    # Report the label balance of the whole file, so the message really does
    # describe the original data rather than this client's shard.
    unique, counts = np.unique(y_all, return_counts=True)
    class_distribution = dict(zip(unique.tolist(), counts.tolist()))
    print("Class distribution in original data:", class_distribution)

    # Select this client's shard. In a real deployment the data would already be
    # partitioned per client; here the sharding is simulated from the single CSV.
    if not 0 <= CLIENT_INDEX < NUM_CLIENT:
        raise ValueError(
            f"CLIENT_INDEX must be in [0, {NUM_CLIENT - 1}], got {CLIENT_INDEX}"
        )
    if NUM_CLIENT == 1:
        X, y = X_all, y_all
    else:
        # The held-out folds of a shuffled StratifiedKFold are disjoint, cover all
        # rows, and each keeps roughly the global class ratio.
        splitter = StratifiedKFold(n_splits=NUM_CLIENT, shuffle=True, random_state=42)
        shard_rows = [rows for _, rows in splitter.split(X_all, y_all)][CLIENT_INDEX]
        X, y = X_all.iloc[shard_rows], y_all.iloc[shard_rows]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42, stratify=y
    )

    # Fix the one-hot vocabulary from the full file so that every client produces
    # the same column layout even when a rare category is absent from its shard;
    # the federated model needs identically shaped coefficients on all clients.
    category_levels = [sorted(X_all[col].dropna().unique()) for col in categorical]

    transformer = ColumnTransformer(
        transformers=[
            ('imp', SimpleImputer(strategy='median'), numerical),
            # handle_unknown='ignore' encodes a value outside the vocabulary
            # (e.g. a missing entry) as all zeros instead of raising.
            ('ohe', OneHotEncoder(categories=category_levels, handle_unknown='ignore'), categorical),
        ],
        # Always return a dense array: the slice assignments below require it.
        sparse_threshold=0,
    )

    # Fit the column transformer on the training rows only, then reuse it for test.
    X_train_transformed = transformer.fit_transform(X_train)
    X_test_transformed = transformer.transform(X_test)

    # The imputed numerical columns come first in the output (the 'imp' step is
    # listed first), so the leading len(numerical) columns are the ones to rescale.
    n_num = len(numerical)
    power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    X_train_transformed[:, :n_num] = power_transformer.fit_transform(X_train_transformed[:, :n_num])
    X_test_transformed[:, :n_num] = power_transformer.transform(X_test_transformed[:, :n_num])

    return X_train_transformed, y_train, X_test_transformed, y_test

In [19]:
import numpy as np
from sklearn.linear_model import LogisticRegression

from flwr.common import NDArrays


def get_model_parameters(model: LogisticRegression) -> NDArrays:
    """Collect the learned arrays of a LogisticRegression model into a list.

    The list always starts with ``model.coef_``; ``model.intercept_`` follows
    only when the model was configured with ``fit_intercept``.
    """
    collected = [model.coef_]
    if model.fit_intercept:
        collected.append(model.intercept_)
    return collected


def set_model_params(model: LogisticRegression, params: NDArrays) -> LogisticRegression:
    """Write the given arrays onto a LogisticRegression model and return it.

    ``params[0]`` becomes ``coef_``; ``params[1]`` becomes ``intercept_`` and is
    only read when the model uses ``fit_intercept``.
    """
    model.coef_ = params[0]
    if not model.fit_intercept:
        return model
    model.intercept_ = params[1]
    return model


def set_initial_params(model: LogisticRegression, n_classes: int = 2, n_features: int = 20):
    """Give an unfitted LogisticRegression zero-valued parameters.

    The learned attributes do not exist until ``model.fit`` has been called, yet
    the server may ask a client for its parameters at launch, so they are
    created here with the shapes scikit-learn itself would produce.

    Args:
        model: Estimator to initialise in place.
        n_classes: Number of classes in the dataset.
        n_features: Number of feature columns after preprocessing.

    Returns:
        None. ``model.classes_`` and ``model.coef_`` are set in place, and
        ``model.intercept_`` as well when ``model.fit_intercept`` is true.
    """
    # scikit-learn stores a single coefficient row and a single intercept for a
    # binary problem, and one row per class otherwise. Allocating n_classes rows
    # for two classes would not match the arrays a later fit() returns.
    n_rows = 1 if n_classes == 2 else n_classes

    model.classes_ = np.arange(n_classes)
    model.coef_ = np.zeros((n_rows, n_features))
    if model.fit_intercept:
        model.intercept_ = np.zeros((n_rows,))

In [26]:
# Local model shared by the client callbacks defined in this notebook.
# The solver must honour warm_start: liblinear ignores it and would retrain from
# scratch on every fit(), discarding the parameters received from the server.
model = LogisticRegression(
    penalty="l2",
    solver="lbfgs",  # supports the L2 penalty and warm starts
    max_iter=1,  # local epoch
    warm_start=True,  # continue from the coefficients currently set on the model
)

# Create zero-valued parameters so they can be read before the first fit().
set_initial_params(model)

Class distribution in original data: {0: 4861, 1: 249}
Model trained on local data




0.9510763209393346

In [21]:
from sklearn.metrics import log_loss

class SklearnClient(fl.client.NumPyClient):
    """Flower NumPy client backed by the notebook-level ``model``.

    All three callbacks operate on the module-level ``model`` and on the
    module-level ``X_train`` / ``y_train`` / ``X_test`` / ``y_test`` data.
    """

    def get_parameters(self, config):  # type: ignore
        """Return the current parameters of the local model."""
        return get_model_parameters(model)

    def fit(self, parameters, config):  # type: ignore
        """Load the received parameters, fit on the local training data, and
        return the updated parameters, the training-set size and empty metrics."""
        set_model_params(model, parameters)
        # Every warning raised while fitting is silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model.fit(X_train, y_train)
        updated_params = get_model_parameters(model)
        n_train = len(X_train)
        return updated_params, n_train, {}

    def evaluate(self, parameters, config):  # type: ignore
        """Load the received parameters and score them on the local test data.

        Returns the log-loss, the test-set size and an ``accuracy`` metric.
        """
        set_model_params(model, parameters)
        probabilities = model.predict_proba(X_test)
        loss = log_loss(y_test, probabilities)
        metrics = {"accuracy": model.score(X_test, y_test)}
        return loss, len(X_test), metrics

In [24]:
# Wrap the NumPy-style client and connect it to the Flower server at this address.
# The log recorded below shows the call answering get_parameters, train and
# evaluate messages from that server.
flower_client = SklearnClient().to_client()
fl.client.start_client(server_address="20.198.223.216:8000", client=flower_client)

[92mINFO [0m:      
[92mINFO [0m:      Received: get_parameters message 095a14be-0984-4678-bae5-8bc7fafed098
[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINFO [0m:      Received: train message e1962ba8-1f53-42ec-b628-057cc77e8973
[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINFO [0m:      Received: evaluate message c21f498b-2bb2-428e-ad9c-aa019b7394de
[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINFO [0m:      Received: train message e428ce01-e985-4524-a041-441c1185686e
[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINFO [0m:      Received: evaluate message 46492b1a-3287-4b0b-8d80-a120949d91d5
[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINFO [0m:      Received: train message e034ee35-8ab6-42bd-9189-520de1d5ca4f
[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINFO [0m:      Received: evaluate message 878f8354-20ee-44e8-8c1a-b2134009ea75
[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINF