## Subdivision of the dataset into N institutions 

In [3]:
# Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans

In [4]:
# Read the full dataset from its CSV file and report its (rows, columns) shape
DATASET_CSV = 'diabetes_binary_5050split_health_indicators_BRFSS2015.csv'
df_train = pd.read_csv(DATASET_CSV)
print(df_train.shape)

(70692, 22)


### Random Subdivision 


In [14]:
# Number of institutions, i.e. how many parts the dataset is divided into
N = 3

def split_and_save_dataset(df, N, file_prefix='df_split_random', random_state=1):
    """
    Shuffles a DataFrame, splits it into N near-equal parts and saves each part as a CSV file.

    Part i (numbered from 1) is written to '<file_prefix><i>.csv' without the
    index column. When the number of rows is not divisible by N, the first
    parts receive one extra row each.

    Parameters:
    df (pd.DataFrame): The DataFrame to split.
    N (int): Number of parts to split the DataFrame into.
    file_prefix (str): Prefix for the output file names.
    random_state (int): Seed used for the shuffle. The default (1) reproduces
        the previously hard-coded seed.

    Raises:
    ValueError: Raised by np.array_split when N is not larger than 0.
    """
    # Shuffle the DataFrame (reproducibly, thanks to the fixed seed)
    df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Split the row positions instead of the DataFrame itself: calling
    # np.array_split directly on a DataFrame relies on DataFrame.swapaxes,
    # which is deprecated in recent pandas releases. Chunk sizes and row
    # order are identical to splitting the frame directly.
    position_chunks = np.array_split(np.arange(len(df_shuffled)), N)

    # Save each part as a CSV file
    for i, positions in enumerate(position_chunks, start=1):
        split = df_shuffled.iloc[positions]
        filename = f'{file_prefix}{i}.csv'
        split.to_csv(filename, index=False)
        print(f'Saved: {filename} of shape {split.shape}')


# Shuffle df_train and write N random parts to df_split_random<i>.csv
split_and_save_dataset(df_train, N)


Saved: df_split_random1.csv of shape (23564, 22)
Saved: df_split_random2.csv of shape (23564, 22)
Saved: df_split_random3.csv of shape (23564, 22)


### Cluster-based Subdivision

In [15]:
# Number of institutions, used here as the number of KMeans clusters
N = 3

def split_and_save_clusters(df, N, file_prefix='df_split_cluster'):
    """
    Partitions a DataFrame with KMeans and writes every cluster to its own CSV file.

    All columns of the DataFrame are passed to KMeans as-is. The rows assigned
    to cluster label k are saved, without the index column, to
    '<file_prefix><k+1>.csv'.

    Parameters:
    df (pd.DataFrame): The DataFrame to cluster.
    N (int): Number of clusters to form (one output file per cluster).
    file_prefix (str): Prefix for the output file names.
    """
    # Fit KMeans with a fixed seed and obtain one cluster label per row
    model = KMeans(n_clusters=N, random_state=1)
    labels = model.fit_predict(df)

    # Write the members of each cluster label to a 1-based numbered file
    for number, label in enumerate(range(N), start=1):
        members = df.loc[labels == label]
        filename = f'{file_prefix}{number}.csv'
        members.to_csv(filename, index=False)
        print(f'Saved: {filename} of shape {members.shape}')


# Cluster df_train into N groups and write each group to df_split_cluster<i>.csv
split_and_save_clusters(df_train, N)


Saved: df_split_cluster1.csv of shape (14628, 22)
Saved: df_split_cluster2.csv of shape (11543, 22)
Saved: df_split_cluster3.csv of shape (44521, 22)


#### Double-Check 

In [18]:
# Reload every saved partition from disk
cluster_files = ('df_split_cluster1.csv', 'df_split_cluster2.csv', 'df_split_cluster3.csv')
random_files = ('df_split_random1.csv', 'df_split_random2.csv', 'df_split_random3.csv')

df1, df2, df3 = (pd.read_csv(path) for path in cluster_files)
df4, df5, df6 = (pd.read_csv(path) for path in random_files)

# Sum the row counts over the three parts of each scheme; the column count
# is taken from the first part of each scheme
cluster_rows = sum(part.shape[0] for part in (df1, df2, df3))
random_rows = sum(part.shape[0] for part in (df4, df5, df6))
print(f"Total shape cluster: {cluster_rows},{df1.shape[1]}")
print(f"Total shape random: {random_rows}, {df4.shape[1]}")

Total shape cluster: 70692,22
Total shape random: 70692, 22


In [19]:
# Preview the first rows of the first cluster-based part (df_split_cluster1.csv)
df1.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,58.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,3.0,3.0,0.0,1.0,10.0,4.0,6.0
1,0.0,0.0,0.0,1.0,38.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,2.0,2.0,0.0,0.0,6.0,5.0,7.0
2,0.0,0.0,1.0,1.0,40.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,7.0,5.0,8.0
3,0.0,0.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,20.0,0.0,0.0,0.0,9.0,4.0,6.0
4,0.0,0.0,0.0,1.0,36.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,7.0,6.0,6.0


In [20]:
# Preview the first rows of the first random part (df_split_random1.csv)
df4.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,30.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,5.0,2.0
1,1.0,1.0,1.0,1.0,33.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,4.0,5.0,7.0
2,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,9.0,5.0,6.0
3,0.0,0.0,0.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,13.0,6.0,8.0
4,1.0,0.0,1.0,1.0,29.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,11.0,6.0,3.0
