## Subdivision of the dataset into N institutions 

In [54]:
# Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [55]:
# Load data
# Diabeters
df_train = pd.read_csv('diabetes_binary_5050split_health_indicators_BRFSS2015.csv')
df_train = df_train.rename(columns={'Diabetes_binary': 'Labels'})
# Breast cancer
X_breast = pd.read_csv('X_breast.csv')
y_breast = pd.read_csv('y_breast.csv')
y_breast['Diagnosis'] = y_breast['Diagnosis'].map({'M': 1, 'B': 0})
# add labels to X_breast with the same name as in df_train
df_train_breast = pd.DataFrame(X_breast)
df_train_breast['Labels'] = y_breast['Diagnosis']

print(f"Diabetes dataset: {df_train.shape}")
print(f"Breast cancer dataset: {df_train_breast.shape}")

Diabetes dataset: (70692, 22)
Breast cancer dataset: (569, 32)


In [56]:
# find min and max values for each feature
XX = df_train.drop('Labels', axis=1)
min_values_diabetes = XX.min().values
max_values_diabetes = XX.max().values
print(f"Min values diabetes: {min_values_diabetes}")
print(f"Max values diabetes: {max_values_diabetes}")

XXX = df_train_breast.drop('Labels', axis=1)
XXX = XXX.drop(columns=["Unnamed: 0"])
min_values_breast = XXX.min().values
max_values_breast = XXX.max().values
print(f"Min values: {min_values_breast}")
print(f"Max values: {max_values_breast}")

Min values diabetes: [ 0.  0.  0. 12.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  1.  1.  1.]
Max values diabetes: [ 1.  1.  1. 98.  1.  1.  1.  1.  1.  1.  1.  1.  1.  5. 30. 30.  1.  1.
 13.  6.  8.]
Min values: [6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.602e-01 7.570e-01 6.802e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]
Max values: [2.811e+01 3.928e+01 1.885e+02 2.501e+03 1.634e-01 3.454e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 3.113e-02 1.354e-01 3.960e-01 5.279e-02 7.895e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.910e-01
 6.638e-01 2.075e-01]


### Random Subdivision 


In [57]:
# N institutions (5% out for testing)
N = 5

def random_split(df, N, file_prefix='df_diabetes'):
    """
    Splits a DataFrame into N parts and saves each part as a CSV file.

    Parameters:
    df (pd.DataFrame): The DataFrame to split.
    N (int): Number of parts to split the DataFrame into.
    file_prefix (str): Prefix for the output file names.
    """
    # Shuffle the DataFrame
    df_shuffled = df.sample(frac=1, random_state=1).reset_index(drop=True)

    # # Leave out 5% for testing
    # df_train, df_test = train_test_split(df_shuffled, test_size=0.15, random_state=1)
    # df_test.to_csv(file_prefix + '_random_test.csv', index=False)
    # print(f'Saved: {file_prefix}_random_test.csv of shape {df_test.shape}')

    # Split the DataFrame into N parts
    df_splits = np.array_split(df_shuffled, N)

    # Save each part as a CSV file
    test_splits = []
    for i, split in enumerate(df_splits, start=1):
        # Leave out 5% for testing
        df_train, df_test = train_test_split(split, test_size=0.15, random_state=1)
        test_splits.append(df_test)
        df_test.to_csv(file_prefix + f'_random_test_{i}.csv', index=False)
        print(f'Saved: {file_prefix}_random_test_{i}.csv of shape {df_test.shape}')
        # Save the training split
        df_train.to_csv(f'{file_prefix}_random_{i}.csv', index=False)
        print(f'Saved: {f'{file_prefix}_random_{i}.csv'} of shape {df_train.shape}')
    
    # concatenate the test split of each part
    df_test = pd.concat(test_splits)
    df_test.to_csv(file_prefix + '_random_test.csv', index=False)
    print(f'Saved: {file_prefix}_random_test.csv of shape {df_test.shape}\n')

random_split(df_train, N, file_prefix='df_diabetes')
random_split(df_train_breast, N, file_prefix='df_breast')


  return bound(*args, **kwds)


Saved: df_diabetes_random_test_1.csv of shape (2121, 22)
Saved: df_diabetes_random_1.csv of shape (12018, 22)
Saved: df_diabetes_random_test_2.csv of shape (2121, 22)
Saved: df_diabetes_random_2.csv of shape (12018, 22)
Saved: df_diabetes_random_test_3.csv of shape (2121, 22)
Saved: df_diabetes_random_3.csv of shape (12017, 22)
Saved: df_diabetes_random_test_4.csv of shape (2121, 22)
Saved: df_diabetes_random_4.csv of shape (12017, 22)
Saved: df_diabetes_random_test_5.csv of shape (2121, 22)
Saved: df_diabetes_random_5.csv of shape (12017, 22)
Saved: df_diabetes_random_test.csv of shape (10605, 22)

Saved: df_breast_random_test_1.csv of shape (18, 32)
Saved: df_breast_random_1.csv of shape (96, 32)
Saved: df_breast_random_test_2.csv of shape (18, 32)
Saved: df_breast_random_2.csv of shape (96, 32)
Saved: df_breast_random_test_3.csv of shape (18, 32)
Saved: df_breast_random_3.csv of shape (96, 32)
Saved: df_breast_random_test_4.csv of shape (18, 32)
Saved: df_breast_random_4.csv of shap

### Cluster based Subdivision

In [59]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import copy

# Function to calculate Euclidean distances between centroids
def centroid_distances(centroids0, centroids1):
    N = len(centroids0)
    print(f"N: {N}")
    distances = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            distances[i, j] = np.linalg.norm(centroids0[i] - centroids1[j])
    return distances

# Function to calculate centroids
def calculate_centroids(df, labels):
    N = len(np.unique(labels))
    centroids = []
    for i in range(N):
        centroids.append(df[labels == i].mean().to_numpy())
    return centroids

def cluster_by_class_split(df, N, file_prefix='df_diabetes'):
    """
    In this code, distances will be a matrix where the element at [i, j] represents
    the distance between the i-th cluster of class 0 and the j-th cluster of class 1.
    The final matrix will be a N x N matrix, not simmetrical in general.
    The following result means that for the first cluster of class 0, the second cluster 
    of class 1 is the closest one. For the second cluster of class 0, the third cluster of
    class 1 is the closest one. And so on.
    array([[22.52661847, 16.58598092, 30.50548191],
       [ 4.33080647, 32.17891945, 25.41195157],
       [27.11059815, 19.7759446 ,  8.12520036]])
    """

    # # Leave out 5% for testing
    # df_train, df_test = train_test_split(df_train, test_size=0.15, random_state=1)
    # df_test.to_csv(file_prefix + '_2cluster_test.csv', index=False)
    # print(f'Saved: {file_prefix}_2cluster_test.csv of shape {df_test.shape}')

    # Splitting the dataset by class
    df_train_0 = df[df['Labels'] == 0].drop('Labels', axis=1)
    df_train_1 = df[df['Labels'] == 1].drop('Labels', axis=1)
    # KMeans clustering
    kmeans_0 = KMeans(n_clusters=N, random_state=1).fit(df_train_0)
    kmeans_1 = KMeans(n_clusters=N, random_state=1).fit(df_train_1)
    # Calculating centroids
    centroids_0 = calculate_centroids(df_train_0, kmeans_0.labels_)
    centroids_1 = calculate_centroids(df_train_1, kmeans_1.labels_)
    # Calculating distances
    distance_matrix = centroid_distances(centroids_0, centroids_1)  

    # Pairing clusters
    pairs = pair_clusters(distance_matrix)

    # create the N clusters
    i = 1
    test_splits = []
    for c0,c1 in pairs:
        df_0 = df[df['Labels'] == 0][kmeans_0.labels_ == c0]
        df_1 = df[df['Labels'] == 1][kmeans_1.labels_ == c1]
        # merge the clusters
        df_merge = pd.concat([df_0, df_1])
        # randomize the order of the rows
        df_merge = df_merge.sample(frac=1).reset_index(drop=True)
        # Leave out 15% for testing
        df_train, df_test = train_test_split(df_merge, test_size=0.15, random_state=1)
        test_splits.append(df_test)
        df_test.to_csv(file_prefix + f'_2cluster_test_{i}.csv', index=False)
        print(f'Saved: {file_prefix}_2cluster_test_{i}.csv of shape {df_test.shape}')
        # save training split
        df_train.to_csv(f'{file_prefix}_2cluster_{i}.csv', index=False)
        print(f'Saved: {f'{file_prefix}_2cluster_{i}.csv'} of shape {df_train.shape} pairs: {c0} and {c1}')
        i += 1
    
    # concatenate the test split of each part
    df_test = pd.concat(test_splits)
    df_test.to_csv(file_prefix + '_2cluster_test.csv', index=False)
    print(f'Saved: {file_prefix}_2cluster_test.csv of shape {df_test.shape}\n')

def pair_clusters(dist_matrix):
    distances_copy = copy.deepcopy(dist_matrix)
    pairs = []
    # cycle
    while distances_copy.size > 0:
        # Find the minimum value and its column index
        min_value = np.min(distances_copy)
        min_col_index = np.argmin(np.min(distances_copy, axis=0))
        min_row_index = np.argmin(distances_copy[:, min_col_index])

        # identify the real position 
        ind = np.where(dist_matrix == min_value) #print("Minimum value:", min_value)#print("Column index of minimum value:", ind[1])#print("Row index of minimum value:", ind[0])

        # record pairing 
        pairs.append((ind[1].item(0), ind[0].item(0)))  # (cluster_{min_col_index}_0, cluster_{min_row_index}_1)

        # remove the paired clusters from further consideration
        distances_copy = np.delete(distances_copy, min_row_index, axis=0)  # remove row
        distances_copy = np.delete(distances_copy, min_col_index, axis=1)  # remove column

    return pairs

cluster_by_class_split(df_train, N, file_prefix='df_diabetes')
cluster_by_class_split(df_train_breast, N, file_prefix='df_breast')

N: 5
Saved: df_diabetes_2cluster_test_1.csv of shape (1143, 22)
Saved: df_diabetes_2cluster_1.csv of shape (6474, 22) pairs: 0 and 4
Saved: df_diabetes_2cluster_test_2.csv of shape (1658, 22)
Saved: df_diabetes_2cluster_2.csv of shape (9390, 22) pairs: 1 and 3
Saved: df_diabetes_2cluster_test_3.csv of shape (4134, 22)
Saved: df_diabetes_2cluster_3.csv of shape (23420, 22) pairs: 4 and 2
Saved: df_diabetes_2cluster_test_4.csv of shape (808, 22)
Saved: df_diabetes_2cluster_4.csv of shape (4576, 22) pairs: 2 and 1
Saved: df_diabetes_2cluster_test_5.csv of shape (2864, 22)
Saved: df_diabetes_2cluster_5.csv of shape (16225, 22) pairs: 3 and 0
Saved: df_diabetes_2cluster_test.csv of shape (10607, 22)

N: 5
Saved: df_breast_2cluster_test_1.csv of shape (10, 32)
Saved: df_breast_2cluster_1.csv of shape (52, 32) pairs: 2 and 1
Saved: df_breast_2cluster_test_2.csv of shape (17, 32)
Saved: df_breast_2cluster_2.csv of shape (92, 32) pairs: 4 and 2
Saved: df_breast_2cluster_test_3.csv of shape (24,

In [63]:
# N institutions - clusters _ OLD VERSION
N = 5

def cluster_split(df, N, file_prefix='df_diabetes'):
    """
    Splits a DataFrame into N clusters and saves each cluster as a CSV file.

    Parameters:
    df (pd.DataFrame): The DataFrame to cluster.
    N (int): Number of clusters to form.
    file_prefix (str): Prefix for the output file names.
    """

    # # Leave out 5% for testing
    # df_train, df_test = train_test_split(df, test_size=0.15, random_state=1)
    # df_test.to_csv(file_prefix + '_cluster_test.csv', index=False)
    # print(f'Saved: {file_prefix}_cluster_test.csv of shape {df_test.shape}')

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=N, random_state=1)
    clusters = kmeans.fit_predict(df)

    # Split the DataFrame based on clusters
    test_splits = []   
    for i in range(N):
        cluster_df = df[clusters == i]
        # Leave out 5% for testing
        df_train, df_test = train_test_split(cluster_df, test_size=0.15, random_state=1)
        df_test.to_csv(file_prefix + f'_cluster_test_{i+1}.csv', index=False)
        test_splits.append(df_test)
        print(f'Saved: {file_prefix}_cluster_test_{i+1}.csv of shape {df_test.shape}')
        # split train
        df_train.to_csv(f'{file_prefix}_cluster_{i+1}.csv', index=False)
        print(f'Saved: {f'{file_prefix}_cluster_{i+1}.csv'} of shape {df_train.shape}')
    
    # concatenate the test split of each part
    df_test = pd.concat(test_splits)
    df_test.to_csv(file_prefix + '_cluster_test.csv', index=False)
    print(f'Saved: {file_prefix}_cluster_test.csv of shape {df_test.shape}\n')


cluster_split(df_train, N, file_prefix='df_diabetes')
cluster_split(df_train_breast, N, file_prefix='df_breast')


Saved: df_diabetes_cluster_test_1.csv of shape (695, 22)
Saved: df_diabetes_cluster_1.csv of shape (3933, 22)
Saved: df_diabetes_cluster_test_2.csv of shape (1295, 22)
Saved: df_diabetes_cluster_2.csv of shape (7336, 22)
Saved: df_diabetes_cluster_test_3.csv of shape (4357, 22)
Saved: df_diabetes_cluster_3.csv of shape (24689, 22)
Saved: df_diabetes_cluster_test_4.csv of shape (973, 22)
Saved: df_diabetes_cluster_4.csv of shape (5513, 22)
Saved: df_diabetes_cluster_test_5.csv of shape (3286, 22)
Saved: df_diabetes_cluster_5.csv of shape (18615, 22)
Saved: df_diabetes_cluster_test.csv of shape (10606, 22)

Saved: df_breast_cluster_test_1.csv of shape (11, 32)
Saved: df_breast_cluster_1.csv of shape (60, 32)
Saved: df_breast_cluster_test_2.csv of shape (36, 32)
Saved: df_breast_cluster_2.csv of shape (201, 32)
Saved: df_breast_cluster_test_3.csv of shape (11, 32)
Saved: df_breast_cluster_3.csv of shape (59, 32)
Saved: df_breast_cluster_test_4.csv of shape (27, 32)
Saved: df_breast_cluste

#### Double-Check with 3 clients

In [10]:
# read the data
print("Diabetes dataset")
df1 = pd.read_csv('df_diabetes_2cluster_1.csv')
df2 = pd.read_csv('df_diabetes_2cluster_2.csv')
df3 = pd.read_csv('df_diabetes_2cluster_3.csv')
print(f"Total shape 2cluster: {df1.shape[0] + df2.shape[0] + df3.shape[0]},{df1.shape[1]}")

df1 = pd.read_csv('df_diabetes_random_1.csv')
df2 = pd.read_csv('df_diabetes_random_2.csv')
df3 = pd.read_csv('df_diabetes_random_3.csv')
print(f"Total shape random: {df1.shape[0] + df2.shape[0] + df3.shape[0]},{df1.shape[1]}")

df1 = pd.read_csv('df_diabetes_cluster_1.csv')
df2 = pd.read_csv('df_diabetes_cluster_2.csv')
df3 = pd.read_csv('df_diabetes_cluster_3.csv')
print(f"Total shape cluster: {df1.shape[0] + df2.shape[0] + df3.shape[0]},{df1.shape[1]}")

# print the shape of the data
df1 = pd.read_csv('df_diabetes_random_test.csv')
df2 = pd.read_csv('df_diabetes_2cluster_test.csv')
df3 = pd.read_csv('df_diabetes_cluster_test.csv')
print(f"Test shape random: {df1.shape}")
print(f"Test shape 2cluster: {df2.shape}")
print(f"Test shape cluster: {df3.shape}")

# breast dataset
print("\nBreast cancer dataset")
df1 = pd.read_csv('df_breast_random_1.csv')
df2 = pd.read_csv('df_breast_random_2.csv')
df3 = pd.read_csv('df_breast_random_3.csv')
print(f"Total shape random: {df1.shape[0] + df2.shape[0] + df3.shape[0]},{df1.shape[1]}")

df1 = pd.read_csv('df_breast_2cluster_1.csv')
df2 = pd.read_csv('df_breast_2cluster_2.csv')
df3 = pd.read_csv('df_breast_2cluster_3.csv')
print(f"Total shape 2cluster: {df1.shape[0] + df2.shape[0] + df3.shape[0]},{df1.shape[1]}")

df1 = pd.read_csv('df_breast_cluster_1.csv')
df2 = pd.read_csv('df_breast_cluster_2.csv')
df3 = pd.read_csv('df_breast_cluster_3.csv')
print(f"Total shape cluster: {df1.shape[0] + df2.shape[0] + df3.shape[0]},{df1.shape[1]}")

# print the shape of the data
df1 = pd.read_csv('df_breast_random_test.csv')
df2 = pd.read_csv('df_breast_2cluster_test.csv')
df3 = pd.read_csv('df_breast_cluster_test.csv')
print(f"Test shape random: {df1.shape}")
print(f"Test shape 2cluster: {df2.shape}")
print(f"Test shape cluster: {df3.shape}")


Diabetes dataset
Total shape 2cluster: 53046,22
Total shape random: 36054,22
Total shape cluster: 53419,22
Test shape random: (10604, 22)
Test shape 2cluster: (10604, 22)
Test shape cluster: (10604, 22)

Breast cancer dataset
Total shape random: 291,32
Total shape 2cluster: 274,32
Total shape cluster: 236,32
Test shape random: (86, 32)
Test shape 2cluster: (86, 32)
Test shape cluster: (86, 32)


In [47]:
df1.head()

Unnamed: 0.1,Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Labels
0,556,10.16,19.59,64.73,311.7,0.1003,0.07504,0.005025,0.01116,0.1791,...,22.88,67.88,347.3,0.1265,0.12,0.01005,0.02232,0.2262,0.06742,0
1,273,9.742,15.67,61.5,289.9,0.09037,0.04689,0.01103,0.01407,0.2081,...,20.88,68.09,355.2,0.1467,0.0937,0.04043,0.05159,0.2841,0.08175,0
2,256,19.55,28.77,133.6,1207.0,0.0926,0.2063,0.1784,0.1144,0.1893,...,36.27,178.6,1926.0,0.1281,0.5329,0.4251,0.1941,0.2818,0.1005,1
3,168,17.47,24.68,116.1,984.6,0.1049,0.1603,0.2159,0.1043,0.1538,...,32.33,155.3,1660.0,0.1376,0.383,0.489,0.1721,0.216,0.093,1
4,340,14.42,16.54,94.15,641.2,0.09751,0.1139,0.08007,0.04223,0.1912,...,21.51,111.4,862.1,0.1294,0.3371,0.3755,0.1414,0.3053,0.08764,0


In [48]:
df2.head()

Unnamed: 0.1,Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Labels
0,421,14.69,13.98,98.22,656.1,0.1031,0.1836,0.145,0.063,0.2086,...,18.34,114.1,809.2,0.1312,0.3635,0.3219,0.1108,0.2827,0.09208,0
1,47,13.17,18.66,85.98,534.6,0.1158,0.1231,0.1226,0.0734,0.2128,...,27.95,102.8,759.4,0.1786,0.4166,0.5006,0.2088,0.39,0.1179,1
2,292,12.95,16.02,83.14,513.7,0.1005,0.07943,0.06155,0.0337,0.173,...,19.93,88.81,585.4,0.1483,0.2068,0.2241,0.1056,0.338,0.09584,0
3,186,18.31,18.58,118.6,1041.0,0.08588,0.08468,0.08169,0.05814,0.1621,...,26.36,139.2,1410.0,0.1234,0.2445,0.3538,0.1571,0.3206,0.06938,1
4,414,15.13,29.81,96.71,719.5,0.0832,0.04605,0.04686,0.02739,0.1852,...,36.91,110.1,931.4,0.1148,0.09866,0.1547,0.06575,0.3233,0.06165,1


## Data Poisoning - Attacker datasets

### Random

In [65]:
N_attackers = 2

# find min and max values for each feature
min_values_diabetes = df_train.min().values
max_values_diabetes = df_train.max().values
min_values_breast = df_train_breast.min().values
max_values_breast = df_train_breast.max().values

# example for both datasets 
df_diabetes = pd.read_csv('df_diabetes_random_1.csv')
df_breast = pd.read_csv('df_breast_random_1.csv')


def create_attackers_random(df, N_attackers, min, max, file_prefix='df_diabetes_random'):
    """
    Create N_attackers attackers with random values between min and max for each feature.

    Parameters:
    df (pd.DataFrame): DataFrame to use as a template for the attackers.
    N_attackers (int): Number of attackers to create.
    min (np.array): Minimum values for each feature.
    max (np.array): Maximum values for each feature.
    file_prefix (str): Prefix for the output file names.

    """

    for i in range(N_attackers):
        attacker_df = pd.DataFrame(np.random.uniform(min, max, size=(df.shape[0], df.shape[1])), columns=df.columns)
        # make 'Labels' column binary
        attacker_df['Labels'] = attacker_df['Labels'].apply(lambda x: 1 if x >= 0.5 else 0)
        if 'diabetes' in file_prefix:
            # make all features equal to the closer integer
            attacker_df = attacker_df.round().astype(np.float64)
        attacker_df.to_csv(f'{file_prefix}_DP_random_{i+1}.csv', index=False)
        print(f'Saved: {file_prefix}_DP_random_{i+1}.csv of shape {attacker_df.shape}')

create_attackers_random(df_diabetes, N_attackers, min_values_diabetes, max_values_diabetes, file_prefix='df_diabetes_random')
create_attackers_random(df_breast, N_attackers, min_values_breast, max_values_breast, file_prefix='df_breast_random')

Saved: df_diabetes_random_DP_random_1.csv of shape (12018, 22)
Saved: df_diabetes_random_DP_random_2.csv of shape (12018, 22)
Saved: df_breast_random_DP_random_1.csv of shape (96, 32)
Saved: df_breast_random_DP_random_2.csv of shape (96, 32)


In [66]:
x = pd.read_csv('df_breast_random_DP_random_1.csv')
x.head()

Unnamed: 0.1,Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Labels
0,292.022814,15.477182,14.044969,157.175606,2458.454388,0.088194,0.201552,0.253932,0.079714,0.262957,...,48.092576,217.285484,1603.351036,0.074188,0.320373,0.053283,0.180954,0.529223,0.170518,1
1,370.034783,12.189366,35.032095,94.733449,2377.870772,0.058269,0.125686,0.130214,0.048556,0.153821,...,37.15484,89.120014,1385.306166,0.07685,0.445436,1.003206,0.015939,0.181158,0.055457,0
2,21.218466,9.908933,11.195172,169.602242,2055.899527,0.115983,0.077067,0.113702,0.044631,0.208883,...,41.065055,199.243027,3379.189986,0.213524,0.377584,0.238369,0.283481,0.526294,0.166011,1
3,191.941414,10.545814,19.629725,144.81028,1161.66061,0.10585,0.238691,0.07185,0.145038,0.256212,...,18.987386,163.222101,1398.921644,0.212947,0.557781,0.861042,0.26952,0.376316,0.145224,1
4,422.23541,14.192241,25.558526,118.927871,305.334812,0.139598,0.284454,0.106462,0.07059,0.177647,...,14.523506,115.292423,4233.603632,0.094338,0.050824,0.70257,0.061688,0.577931,0.196594,1


In [67]:
df = pd.read_csv('df_breast_random_1.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Labels
0,555,10.29,27.61,65.67,321.4,0.0903,0.07658,0.05999,0.02738,0.1593,...,34.91,69.57,357.6,0.1384,0.171,0.2,0.09127,0.2226,0.08283,0
1,160,11.75,20.18,76.1,419.8,0.1089,0.1141,0.06843,0.03738,0.1993,...,26.21,88.91,543.9,0.1358,0.1892,0.1956,0.07909,0.3168,0.07987,0
2,358,8.878,15.49,56.74,241.0,0.08293,0.07698,0.04721,0.02381,0.193,...,17.7,65.27,302.0,0.1015,0.1248,0.09441,0.04762,0.2434,0.07431,0
3,488,11.68,16.17,75.49,420.5,0.1128,0.09263,0.04279,0.03132,0.1853,...,21.59,86.57,549.8,0.1526,0.1477,0.149,0.09815,0.2804,0.08024,0
4,304,11.46,18.16,73.59,403.1,0.08853,0.07694,0.03344,0.01502,0.1411,...,21.61,82.69,489.8,0.1144,0.1789,0.1226,0.05509,0.2208,0.07638,0


### Label-Flipping

In [68]:
# flip the labels
def flip_client(path):
    client = pd.read_csv(path + '.csv')
    client['Labels'] = 1 - client['Labels']
    # split the path
    path = path.split('_')
    new_path = path[0] + '_' + path[1] + '_' + path[2] + '_DP_flip_' + path[3]
    client.to_csv(new_path + '.csv', index=False)
    print(f"Saved: {new_path}.csv of shape {client.shape}")

flip_client('df_breast_random_1')
flip_client('df_breast_random_2')
flip_client('df_diabetes_random_1')
flip_client('df_diabetes_random_2')
flip_client('df_breast_2cluster_1')
flip_client('df_breast_2cluster_2')
flip_client('df_diabetes_2cluster_1')
flip_client('df_diabetes_2cluster_2')
flip_client('df_breast_cluster_1')
flip_client('df_breast_cluster_2')
flip_client('df_diabetes_cluster_1')
flip_client('df_diabetes_cluster_2')

Saved: df_breast_random_DP_flip_1.csv of shape (96, 32)
Saved: df_breast_random_DP_flip_2.csv of shape (96, 32)
Saved: df_diabetes_random_DP_flip_1.csv of shape (12018, 22)
Saved: df_diabetes_random_DP_flip_2.csv of shape (12018, 22)
Saved: df_breast_2cluster_DP_flip_1.csv of shape (52, 32)
Saved: df_breast_2cluster_DP_flip_2.csv of shape (92, 32)
Saved: df_diabetes_2cluster_DP_flip_1.csv of shape (6474, 22)
Saved: df_diabetes_2cluster_DP_flip_2.csv of shape (9390, 22)
Saved: df_breast_cluster_DP_flip_1.csv of shape (60, 32)
Saved: df_breast_cluster_DP_flip_2.csv of shape (201, 32)
Saved: df_diabetes_cluster_DP_flip_1.csv of shape (3933, 22)
Saved: df_diabetes_cluster_DP_flip_2.csv of shape (7336, 22)


In [69]:
df = pd.read_csv('df_breast_random_DP_flip_1.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Labels
0,555,10.29,27.61,65.67,321.4,0.0903,0.07658,0.05999,0.02738,0.1593,...,34.91,69.57,357.6,0.1384,0.171,0.2,0.09127,0.2226,0.08283,1
1,160,11.75,20.18,76.1,419.8,0.1089,0.1141,0.06843,0.03738,0.1993,...,26.21,88.91,543.9,0.1358,0.1892,0.1956,0.07909,0.3168,0.07987,1
2,358,8.878,15.49,56.74,241.0,0.08293,0.07698,0.04721,0.02381,0.193,...,17.7,65.27,302.0,0.1015,0.1248,0.09441,0.04762,0.2434,0.07431,1
3,488,11.68,16.17,75.49,420.5,0.1128,0.09263,0.04279,0.03132,0.1853,...,21.59,86.57,549.8,0.1526,0.1477,0.149,0.09815,0.2804,0.08024,1
4,304,11.46,18.16,73.59,403.1,0.08853,0.07694,0.03344,0.01502,0.1411,...,21.61,82.69,489.8,0.1144,0.1789,0.1226,0.05509,0.2208,0.07638,1


In [70]:
df = pd.read_csv('df_breast_random_1.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Labels
0,555,10.29,27.61,65.67,321.4,0.0903,0.07658,0.05999,0.02738,0.1593,...,34.91,69.57,357.6,0.1384,0.171,0.2,0.09127,0.2226,0.08283,0
1,160,11.75,20.18,76.1,419.8,0.1089,0.1141,0.06843,0.03738,0.1993,...,26.21,88.91,543.9,0.1358,0.1892,0.1956,0.07909,0.3168,0.07987,0
2,358,8.878,15.49,56.74,241.0,0.08293,0.07698,0.04721,0.02381,0.193,...,17.7,65.27,302.0,0.1015,0.1248,0.09441,0.04762,0.2434,0.07431,0
3,488,11.68,16.17,75.49,420.5,0.1128,0.09263,0.04279,0.03132,0.1853,...,21.59,86.57,549.8,0.1526,0.1477,0.149,0.09815,0.2804,0.08024,0
4,304,11.46,18.16,73.59,403.1,0.08853,0.07694,0.03344,0.01502,0.1411,...,21.61,82.69,489.8,0.1144,0.1789,0.1226,0.05509,0.2208,0.07638,0


### Inverted Loss

In [71]:
# flip the labels
def inverted_client(path):
    client = pd.read_csv(path + '.csv')
    # split the path
    path = path.split('_')
    new_path = path[0] + '_' + path[1] + '_' + path[2] + '_DP_inverted_loss_' + path[3]
    client.to_csv(new_path + '.csv', index=False)
    print(f"Saved: {new_path}.csv of shape {client.shape}")

inverted_client('df_breast_random_1')
inverted_client('df_breast_random_2')
inverted_client('df_diabetes_random_1')
inverted_client('df_diabetes_random_2')
inverted_client('df_breast_2cluster_1')
inverted_client('df_breast_2cluster_2')
inverted_client('df_diabetes_2cluster_1')
inverted_client('df_diabetes_2cluster_2')
inverted_client('df_breast_cluster_1')
inverted_client('df_breast_cluster_2')
inverted_client('df_diabetes_cluster_1')
inverted_client('df_diabetes_cluster_2')


Saved: df_breast_random_DP_inverted_loss_1.csv of shape (96, 32)
Saved: df_breast_random_DP_inverted_loss_2.csv of shape (96, 32)
Saved: df_diabetes_random_DP_inverted_loss_1.csv of shape (12018, 22)
Saved: df_diabetes_random_DP_inverted_loss_2.csv of shape (12018, 22)
Saved: df_breast_2cluster_DP_inverted_loss_1.csv of shape (52, 32)
Saved: df_breast_2cluster_DP_inverted_loss_2.csv of shape (92, 32)
Saved: df_diabetes_2cluster_DP_inverted_loss_1.csv of shape (6474, 22)
Saved: df_diabetes_2cluster_DP_inverted_loss_2.csv of shape (9390, 22)
Saved: df_breast_cluster_DP_inverted_loss_1.csv of shape (60, 32)
Saved: df_breast_cluster_DP_inverted_loss_2.csv of shape (201, 32)
Saved: df_diabetes_cluster_DP_inverted_loss_1.csv of shape (3933, 22)
Saved: df_diabetes_cluster_DP_inverted_loss_2.csv of shape (7336, 22)
