# Install and Import

In [None]:
!pip install ydata-synthetic
!pip install pandas-profiling

import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import sklearn.cluster as cluster
from numpy import array, random, sum, unique
from pandas import DataFrame, read_csv

from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.regular import WGAN_GP
from pandas_profiling import ProfileReport

from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load the source dataset from a CSV file in the working directory
raw_csv_name = "diabetes_binary_health_indicators_BRFSS2015.csv"
df = pd.read_csv(raw_csv_name)

In [None]:
# Remove exact duplicate rows; rebinding keeps the cell free of in-place mutation
df = df.drop_duplicates()

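# Data Synthesis
Creating synthetic data for the minority class (diabetic patients) first; the majority class (non-diabetic patients) is synthesized the same way further below.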

In [None]:
# WGAN with Gradient Penalty (WGAN_GP, imported from ydata_synthetic.synthesizers.regular)
# is chosen as the GAN architecture. `model` holds the class itself; it is
# instantiated in a later cell once the model parameters are defined.
model = WGAN_GP

In [None]:
# Split the column names into numerical and categorical groups.
# The three numerical columns are listed by hand; every remaining column of df
# (including the Diabetes_binary label itself) is treated as categorical.
num_cols = ['BMI', 'MentHlth', 'PhysHlth']
cat_cols = df.drop(columns=num_cols).columns.tolist()

In [None]:
# Keep only the minority-class rows (Diabetes_binary == 1) as an independent copy
minority_mask = df['Diabetes_binary'].eq(1)
train_data = df.loc[minority_mask].copy()

In [None]:
# Hyperparameters of the minority-class GAN
noise_dim, dim, batch_size = 32, 128, 128  # dim is passed on as layers_dim

# Training-loop settings
log_step = 100  # passed on as sample_interval
epochs = 51     # 50 + 1

# Optimizer settings (beta_1 and beta_2 are passed on together as `betas`)
learning_rate = 5e-4
beta_1, beta_2 = 0.5, 0.9

models_dir = './cache'

In [None]:
# Bundle the hyperparameters: training-loop settings go into TrainParameters,
# network/optimizer settings into ModelParameters.
train_args = TrainParameters(sample_interval=log_step, epochs=epochs)

gan_args = ModelParameters(
    batch_size=batch_size,
    lr=learning_rate,
    betas=(beta_1, beta_2),
    noise_dim=noise_dim,
    layers_dim=dim,
)

In [None]:
# Initializing the GAN model: instantiate the chosen synthesizer class with the
# model parameters built above.
# n_critic = 10 is forwarded to WGAN_GP — presumably the number of critic updates
# per generator update; confirm against the ydata-synthetic documentation.
synthesizer = model(gan_args, n_critic = 10)

In [None]:
# Fit the GAN on the minority-class rows; numerical and categorical column
# names are handed over separately.
synthesizer.train(
    data=train_data,
    train_arguments=train_args,
    num_cols=num_cols,
    cat_cols=cat_cols,
)

  2%|▏         | 1/51 [01:27<1:12:58, 87.56s/it]

Epoch: 0 | disc_loss: -0.2939794957637787 | gen_loss: 0.18317648768424988


  4%|▍         | 2/51 [02:23<56:24, 69.07s/it]  

Epoch: 1 | disc_loss: -0.32239609956741333 | gen_loss: 0.2314671277999878


  6%|▌         | 3/51 [03:19<50:26, 63.04s/it]

Epoch: 2 | disc_loss: -0.32907941937446594 | gen_loss: 0.23169314861297607


  8%|▊         | 4/51 [04:19<48:16, 61.64s/it]

Epoch: 3 | disc_loss: -0.3277326822280884 | gen_loss: 0.24061506986618042


 10%|▉         | 5/51 [05:15<45:41, 59.61s/it]

Epoch: 4 | disc_loss: -0.3498820960521698 | gen_loss: 0.2604486644268036


 12%|█▏        | 6/51 [06:11<43:50, 58.45s/it]

Epoch: 5 | disc_loss: -0.3468162715435028 | gen_loss: 0.260185182094574


 14%|█▎        | 7/51 [07:07<42:17, 57.66s/it]

Epoch: 6 | disc_loss: -0.3442271649837494 | gen_loss: 0.25032275915145874


 16%|█▌        | 8/51 [08:05<41:21, 57.71s/it]

Epoch: 7 | disc_loss: -0.3476319909095764 | gen_loss: 0.2544635832309723


 18%|█▊        | 9/51 [09:04<40:48, 58.31s/it]

Epoch: 8 | disc_loss: -0.3525775671005249 | gen_loss: 0.2557224631309509


 20%|█▉        | 10/51 [10:02<39:39, 58.05s/it]

Epoch: 9 | disc_loss: -0.3535037934780121 | gen_loss: 0.26192164421081543


 22%|██▏       | 11/51 [10:58<38:23, 57.58s/it]

Epoch: 10 | disc_loss: -0.34947893023490906 | gen_loss: 0.27200764417648315


 24%|██▎       | 12/51 [11:55<37:14, 57.29s/it]

Epoch: 11 | disc_loss: -0.3471490740776062 | gen_loss: 0.26042449474334717


 25%|██▌       | 13/51 [12:56<37:03, 58.53s/it]

Epoch: 12 | disc_loss: -0.3522064983844757 | gen_loss: 0.2659962773323059


 27%|██▋       | 14/51 [13:52<35:34, 57.70s/it]

Epoch: 13 | disc_loss: -0.34559914469718933 | gen_loss: 0.267021119594574


 29%|██▉       | 15/51 [14:47<34:12, 57.01s/it]

Epoch: 14 | disc_loss: -0.35485216975212097 | gen_loss: 0.26373547315597534


 31%|███▏      | 16/51 [15:43<32:57, 56.49s/it]

Epoch: 15 | disc_loss: -0.3536396622657776 | gen_loss: 0.2561739385128021


 33%|███▎      | 17/51 [16:38<31:53, 56.28s/it]

Epoch: 16 | disc_loss: -0.3507246971130371 | gen_loss: 0.26967501640319824


 35%|███▌      | 18/51 [17:38<31:29, 57.27s/it]

Epoch: 17 | disc_loss: -0.35391882061958313 | gen_loss: 0.26922300457954407


 37%|███▋      | 19/51 [18:34<30:18, 56.82s/it]

Epoch: 18 | disc_loss: -0.34177204966545105 | gen_loss: 0.2618061900138855


 39%|███▉      | 20/51 [19:30<29:12, 56.53s/it]

Epoch: 19 | disc_loss: -0.3348488211631775 | gen_loss: 0.26809078454971313


 41%|████      | 21/51 [20:25<28:08, 56.28s/it]

Epoch: 20 | disc_loss: -0.35508501529693604 | gen_loss: 0.26326289772987366


 43%|████▎     | 22/51 [21:26<27:47, 57.48s/it]

Epoch: 21 | disc_loss: -0.3495475649833679 | gen_loss: 0.26383933424949646


 45%|████▌     | 23/51 [22:23<26:44, 57.31s/it]

Epoch: 22 | disc_loss: -0.351346880197525 | gen_loss: 0.2655578851699829


 47%|████▋     | 24/51 [23:17<25:26, 56.54s/it]

Epoch: 23 | disc_loss: -0.3644232153892517 | gen_loss: 0.26991984248161316


 49%|████▉     | 25/51 [24:12<24:15, 55.99s/it]

Epoch: 24 | disc_loss: -0.3559643030166626 | gen_loss: 0.27150243520736694


 51%|█████     | 26/51 [25:08<23:17, 55.91s/it]

Epoch: 25 | disc_loss: -0.34346404671669006 | gen_loss: 0.26403170824050903


 53%|█████▎    | 27/51 [26:06<22:41, 56.74s/it]

Epoch: 26 | disc_loss: -0.3394685685634613 | gen_loss: 0.2584614157676697


 55%|█████▍    | 28/51 [27:02<21:37, 56.40s/it]

Epoch: 27 | disc_loss: -0.35091498494148254 | gen_loss: 0.2786010801792145


 57%|█████▋    | 29/51 [27:58<20:38, 56.29s/it]

Epoch: 28 | disc_loss: -0.34885191917419434 | gen_loss: 0.27414470911026


 59%|█████▉    | 30/51 [28:55<19:48, 56.61s/it]

Epoch: 29 | disc_loss: -0.36406463384628296 | gen_loss: 0.2761073112487793


 61%|██████    | 31/51 [29:55<19:10, 57.53s/it]

Epoch: 30 | disc_loss: -0.3416077196598053 | gen_loss: 0.2764893174171448


 63%|██████▎   | 32/51 [30:52<18:08, 57.30s/it]

Epoch: 31 | disc_loss: -0.33100584149360657 | gen_loss: 0.28130683302879333


 65%|██████▍   | 33/51 [31:49<17:09, 57.17s/it]

Epoch: 32 | disc_loss: -0.35902824997901917 | gen_loss: 0.2706173062324524


 67%|██████▋   | 34/51 [32:45<16:05, 56.81s/it]

Epoch: 33 | disc_loss: -0.3601337671279907 | gen_loss: 0.2802868187427521


 69%|██████▊   | 35/51 [33:40<15:02, 56.40s/it]

Epoch: 34 | disc_loss: -0.36300820112228394 | gen_loss: 0.29520684480667114


 71%|███████   | 36/51 [34:40<14:19, 57.29s/it]

Epoch: 35 | disc_loss: -0.3559294641017914 | gen_loss: 0.2876632809638977


 73%|███████▎  | 37/51 [35:36<13:17, 56.98s/it]

Epoch: 36 | disc_loss: -0.3571127653121948 | gen_loss: 0.28394806385040283


 75%|███████▍  | 38/51 [36:32<12:17, 56.75s/it]

Epoch: 37 | disc_loss: -0.3524557948112488 | gen_loss: 0.2839139997959137


 76%|███████▋  | 39/51 [37:28<11:19, 56.62s/it]

Epoch: 38 | disc_loss: -0.35605311393737793 | gen_loss: 0.28404974937438965


 78%|███████▊  | 40/51 [38:28<10:34, 57.69s/it]

Epoch: 39 | disc_loss: -0.3538661599159241 | gen_loss: 0.29032471776008606


 80%|████████  | 41/51 [39:25<09:34, 57.46s/it]

Epoch: 40 | disc_loss: -0.3629951477050781 | gen_loss: 0.29291799664497375


 82%|████████▏ | 42/51 [40:22<08:34, 57.20s/it]

Epoch: 41 | disc_loss: -0.3552713394165039 | gen_loss: 0.3094062805175781


 84%|████████▍ | 43/51 [41:19<07:36, 57.03s/it]

Epoch: 42 | disc_loss: -0.35660094022750854 | gen_loss: 0.30351585149765015


 86%|████████▋ | 44/51 [42:15<06:38, 56.93s/it]

Epoch: 43 | disc_loss: -0.349928617477417 | gen_loss: 0.2903481125831604


 88%|████████▊ | 45/51 [43:14<05:45, 57.59s/it]

Epoch: 44 | disc_loss: -0.3452370762825012 | gen_loss: 0.2977404296398163


 90%|█████████ | 46/51 [44:10<04:44, 56.97s/it]

Epoch: 45 | disc_loss: -0.35558685660362244 | gen_loss: 0.29706668853759766


 92%|█████████▏| 47/51 [45:06<03:46, 56.60s/it]

Epoch: 46 | disc_loss: -0.36296549439430237 | gen_loss: 0.29458940029144287


 94%|█████████▍| 48/51 [46:03<02:49, 56.66s/it]

Epoch: 47 | disc_loss: -0.3607647716999054 | gen_loss: 0.2951633334159851


 96%|█████████▌| 49/51 [47:02<01:54, 57.37s/it]

Epoch: 48 | disc_loss: -0.35236552357673645 | gen_loss: 0.28844285011291504


 98%|█████████▊| 50/51 [47:58<00:57, 57.11s/it]

Epoch: 49 | disc_loss: -0.3626677691936493 | gen_loss: 0.30052614212036133


100%|██████████| 51/51 [48:53<00:00, 57.52s/it]

Epoch: 50 | disc_loss: -0.3473054766654968 | gen_loss: 0.2944948673248291





In [None]:
# Generating synthetic data for diabetic patients, requesting 100k examples.
# NOTE: in the recorded run the progress bar shows 782 batches and the resulting
# frame has 100096 rows, i.e. slightly more than the 100000 requested.
minority_synth_data = synthesizer.sample(100000)

Synthetic data generation: 100%|██████████| 782/782 [00:27<00:00, 28.51it/s]


In [None]:
# Sanity check: count missing values per column in the synthetic minority-class data
minority_synth_data.isna().sum()

Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

In [None]:
# Display the synthetic minority-class frame (the rendering below is truncated
# to the first and last rows/columns)
minority_synth_data

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.0,0.0,1.0,1.0,21.123623,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,4.0,16.791885,20.956236,1.0,1.0,8.0,3.0,7.0
1,1.0,0.0,1.0,1.0,30.929613,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,9.087981,21.697443,0.0,1.0,4.0,6.0,1.0
2,1.0,0.0,0.0,1.0,23.508165,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,6.530963,26.588053,1.0,1.0,12.0,2.0,8.0
3,1.0,0.0,1.0,0.0,22.780989,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,3.0,20.596498,23.256498,0.0,1.0,10.0,2.0,5.0
4,1.0,0.0,1.0,1.0,30.145325,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,2.0,2.845590,28.019899,1.0,1.0,2.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100091,1.0,0.0,1.0,1.0,23.540915,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,2.0,18.756605,21.597122,1.0,0.0,1.0,3.0,3.0
100092,1.0,1.0,1.0,0.0,26.230598,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,4.0,5.333356,25.525236,1.0,0.0,3.0,5.0,2.0
100093,1.0,0.0,1.0,0.0,23.895718,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,4.0,5.483332,15.592176,1.0,0.0,1.0,3.0,5.0
100094,1.0,0.0,0.0,0.0,29.675364,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,3.978933,27.775213,1.0,1.0,2.0,6.0,4.0


In [None]:
# Majority class (Diabetes_binary == 0): the same pipeline as for the minority
# class, but with layers_dim/batch_size of 64, 10+1 epochs, a sample interval of
# 50 and n_critic = 20.
# NOTE: this cell rebinds train_data, gan_args, train_args and synthesizer, so
# the minority-class objects are replaced once it has run.
majority_mask = df['Diabetes_binary'].eq(0)
train_data = df.loc[majority_mask].copy()

# GAN architecture: WGAN with Gradient Penalty
model = WGAN_GP

# Hyperparameters of the majority-class GAN
noise_dim, dim, batch_size = 32, 64, 64

log_step = 50
epochs = 11  # 10 + 1
learning_rate = 5e-4
beta_1, beta_2 = 0.5, 0.9
models_dir = './cache'

# Parameter bundles for the model and for the training loop
gan_args = ModelParameters(
    batch_size=batch_size,
    lr=learning_rate,
    betas=(beta_1, beta_2),
    noise_dim=noise_dim,
    layers_dim=dim,
)
train_args = TrainParameters(sample_interval=log_step, epochs=epochs)

# Build the synthesizer and fit it on the majority-class rows
synthesizer = model(gan_args, n_critic=20)
synthesizer.train(
    data=train_data,
    train_arguments=train_args,
    num_cols=num_cols,
    cat_cols=cat_cols,
)

# Request 100k synthetic examples of non-diabetic patients
majority_synth_data = synthesizer.sample(100000)

  9%|▉         | 1/11 [08:02<1:20:24, 482.41s/it]

Epoch: 0 | disc_loss: -0.49565279483795166 | gen_loss: 0.26824840903282166


 18%|█▊        | 2/11 [15:26<1:08:56, 459.61s/it]

Epoch: 1 | disc_loss: -0.49455899000167847 | gen_loss: 0.3078387677669525


 27%|██▋       | 3/11 [22:53<1:00:31, 453.94s/it]

Epoch: 2 | disc_loss: -0.49541521072387695 | gen_loss: 0.44673609733581543


 36%|███▋      | 4/11 [30:21<52:41, 451.65s/it]  

Epoch: 3 | disc_loss: -0.5128812193870544 | gen_loss: 0.48721468448638916


 45%|████▌     | 5/11 [37:48<44:59, 449.86s/it]

Epoch: 4 | disc_loss: -0.4965490698814392 | gen_loss: 0.5170586705207825


 55%|█████▍    | 6/11 [45:09<37:14, 446.89s/it]

Epoch: 5 | disc_loss: -0.498857319355011 | gen_loss: 0.5327871441841125


 64%|██████▎   | 7/11 [52:34<29:45, 446.39s/it]

Epoch: 6 | disc_loss: -0.5031057596206665 | gen_loss: 0.5495213270187378


 73%|███████▎  | 8/11 [1:00:00<22:19, 446.39s/it]

Epoch: 7 | disc_loss: -0.49492087960243225 | gen_loss: 0.6051995754241943


 82%|████████▏ | 9/11 [1:07:27<14:53, 446.58s/it]

Epoch: 8 | disc_loss: -0.5110796093940735 | gen_loss: 0.6718908548355103


 91%|█████████ | 10/11 [1:14:52<07:25, 445.93s/it]

Epoch: 9 | disc_loss: -0.49111613631248474 | gen_loss: 0.6360602974891663


100%|██████████| 11/11 [1:22:20<00:00, 449.11s/it]


Epoch: 10 | disc_loss: -0.47223466634750366 | gen_loss: 0.6474522352218628


Synthetic data generation: 100%|██████████| 1563/1563 [00:42<00:00, 36.50it/s]


In [None]:
# (rows, columns) of the synthetic majority-class frame
majority_synth_data.shape

(100032, 22)

In [None]:
# (rows, columns) of the synthetic minority-class frame
minority_synth_data.shape

(100096, 22)

In [None]:
# The sampled frames overshoot the requested 100k rows (recorded shapes: 100032
# rows for the majority class and 100096 for the minority class). Trim both to
# the same length, capped at the intended 100k, so the combined dataset is
# exactly balanced between the two classes.
n_per_class = min(len(majority_synth_data), len(minority_synth_data), 100000)

# Stack majority rows first, then minority rows, with a fresh 0..N-1 index
synth_df = pd.concat(
    [
        majority_synth_data.iloc[:n_per_class],
        minority_synth_data.iloc[:n_per_class],
    ],
    ignore_index=True,
)

In [None]:
# Write the combined synthetic dataset to a CSV file in the working directory,
# leaving out the index column
synth_df.to_csv(r'./diabetes_synthetic_data.csv', index = False)