In [None]:
!pip install numpy pandas scikit-learn

In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [2]:

# Charger et préparer les données
data = load_breast_cancer()
X, y = data.data, data.target
X = StandardScaler().fit_transform(X)



In [7]:

# Split entre les clients (par exemple 3 hôpitaux)
num_clients = 3
client_data = np.array_split(X, num_clients)
client_labels = np.array_split(y, num_clients)

# Fonction pour entraîner un modèle local
def train_local_model(X_local, y_local):
    model = LogisticRegression(solver='lbfgs', max_iter=1000)
    model.fit(X_local, y_local)
    return model.coef_.flatten(), model.intercept_

# Confidentialité différentielle : bruit gaussien
def add_noise(coef, intercept, epsilon=1.0):
    noise_coef = np.random.normal(0, 1/epsilon, size=coef.shape)
    print(f"Noise Coef : {noise_coef[:5]}")
    print(f"True Coef : {coef[:5]}")
    print(f"Combined Coef : {coef[:5] + noise_coef[:5]}")
    print("-"*10)
    noise_intercept = np.random.normal(0, 1/epsilon, size=intercept.shape)
    return coef + noise_coef, intercept + noise_intercept




In [8]:
# ========== Entraînement AVEC bruit (DP) ==========
coefs, intercepts = [], []
for i in range(num_clients):
    coef, intercept = train_local_model(client_data[i], client_labels[i])
    noisy_coef, noisy_intercept = add_noise(coef, intercept, epsilon=5.0)
    coefs.append(noisy_coef)
    intercepts.append(noisy_intercept)

global_coef = np.mean(coefs, axis=0)
global_intercept = np.mean(intercepts, axis=0)

Noise Coef : [-0.13714571  0.25421288 -0.14915752  0.12131373  0.02349403]
True Coef : [-0.3531813  -1.13580171 -0.39771313 -0.35209543 -0.07371329]
Combined Coef : [-0.49032702 -0.88158883 -0.54687065 -0.2307817  -0.05021926]
----------
Noise Coef : [-0.25518202 -0.24501462 -0.21433774 -0.01431957 -0.06599607]
True Coef : [-0.47764494 -0.22021727 -0.45665274 -0.53151359 -0.42448044]
Combined Coef : [-0.73282696 -0.46523189 -0.67099048 -0.54583316 -0.49047652]
----------
Noise Coef : [-0.12783895 -0.1269319   0.17132163  0.07768419 -0.23646028]
True Coef : [-0.62753621 -0.39513745 -0.58749913 -0.57637334 -0.14887469]
Combined Coef : [-0.75537516 -0.52206934 -0.41617751 -0.49868915 -0.38533497]
----------


In [9]:

# ========== Entraînement SANS bruit (référence) ==========
true_coefs, true_intercepts = [], []
for i in range(num_clients):
    coef, intercept = train_local_model(client_data[i], client_labels[i])
    true_coefs.append(coef)
    true_intercepts.append(intercept)

true_global_coef = np.mean(true_coefs, axis=0)
true_global_intercept = np.mean(true_intercepts, axis=0)

# ========== Évaluation ==========
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:

# Modèle AVEC DP
final_model = LogisticRegression()
final_model.coef_ = global_coef.reshape(1, -1)
final_model.intercept_ = global_intercept
final_model.classes_ = np.array([0, 1])
y_pred = final_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"🔐 Accuracy du modèle global avec DP : {acc:.4f}")



🔐 Accuracy du modèle global avec DP : 0.9825


In [11]:
# Modèle SANS DP (vrai modèle agrégé)
true_model = LogisticRegression()
true_model.coef_ = true_global_coef.reshape(1, -1)
true_model.intercept_ = true_global_intercept
true_model.classes_ = np.array([0, 1])
y_pred_true = true_model.predict(X_test)
acc_true = accuracy_score(y_test, y_pred_true)
print(f"📊 Accuracy du modèle global SANS DP : {acc_true:.4f}")


📊 Accuracy du modèle global SANS DP : 0.9737


In [12]:

# ========== Comparaison des coefficients ==========
print("\n--- Comparaison des coefficients ---")
print("➡️ Moyenne des coefficients réels (sans DP):")
print(true_global_coef)
print("\n🔒 Moyenne des coefficients avec DP (bruit ajouté):")
print(global_coef)




--- Comparaison des coefficients ---
➡️ Moyenne des coefficients réels (sans DP):
[-0.48612081 -0.58371881 -0.48062167 -0.48666079 -0.21568948  0.08509568
 -0.57557527 -0.60684666  0.0322042   0.27416076 -0.73869177  0.00558824
 -0.58934347 -0.56161302  0.00698184  0.48567459 -0.02366443 -0.21991231
  0.26316175  0.47641164 -0.72174403 -0.87384852 -0.67874896 -0.67356547
 -0.52775687 -0.13082811 -0.56660293 -0.64182127 -0.60071142 -0.24330265]

🔒 Moyenne des coefficients avec DP (bruit ajouté):
[-0.65950971 -0.62296336 -0.54467954 -0.42510134 -0.30867692  0.04743952
 -0.53651933 -0.38155152 -0.02593978  0.30777614 -0.53330273  0.04559644
 -0.57062199 -0.5426478   0.0185569   0.50778574  0.0148504  -0.13538709
  0.17841273  0.4806916  -0.93429592 -0.89825142 -0.84482828 -0.63967962
 -0.50221853  0.11509009 -0.48968553 -0.67367919 -0.76950242 -0.2900192 ]


In [13]:
# Différence absolue moyenne
coef_diff = np.abs(global_coef - true_global_coef)
print("\n📉 Écart absolu moyen entre les coefficients :")
print(np.mean(coef_diff))


📉 Écart absolu moyen entre les coefficients :
0.07955011271417572
