### This assignment uses the PIMA healthcare (diabetes) dataset. The problem revolves around predicting the presence of diabetes in patients. First, the data is cleaned, encoded, and scaled by a data pre-processing class. PyTorch is then used to build the VAE model, which is trained and finally used to generate a synthetic dataset.

In [1]:
!pip install rdt

Collecting rdt
  Downloading rdt-0.6.2-py2.py3-none-any.whl (35 kB)
Collecting psutil<6,>=5.7
  Downloading psutil-5.9.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[K     |████████████████████████████████| 280 kB 25.7 MB/s 
Collecting scikit-learn<1,>=0.24
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.3 MB/s 
[?25hCollecting pyyaml<6,>=5.4.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 60.8 MB/s 
[?25hCollecting scipy<2,>=1.5.4
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 1.2 MB/s 
[?25hCollecting numpy<2,>=1.20.0
  Downloading numpy-1.21.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 32

In [4]:
# Mount Google Drive at /content/drive so the dataset CSV stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import pandas as pd
from rdt.transformers import OneHotEncodingTransformer
import numpy as np
from sklearn import preprocessing
class DataProcessing(): # making a class for preprocessing of the data. 
  """One-hot encodes discrete columns and standard-scales continuous columns.

  The encoded matrix is laid out as: all one-hot blocks (in the order of
  ``disc_name``) followed by one scaled column per entry of ``cont_name``.
  ``transformBack`` inverts that layout.

  Attributes set by ``transformData``:
    categories: per discrete column, the original category value behind each
      one-hot column (index k of the list <-> k-th one-hot column).
    transformations: per continuous column, the ``(mean_, scale_)`` pair of
      the fitted ``StandardScaler``.
  """

  def __init__(self, df,disc_name,cont_name):
    """Keep only the requested columns of ``df``.

    Args:
      df: source DataFrame.
      disc_name: names of the discrete (categorical) columns.
      cont_name: names of the continuous columns.
    """
    super(DataProcessing, self).__init__()
    self.data=df[list(disc_name)+list(cont_name)]
    self.disc_colmns=disc_name
    self.cont_colmns=cont_name

  def show(self): 
    """Print the selected (untransformed) data."""
    print(self.data)

  def _encoded_column_names(self):
    """Column names of the encoded matrix, e.g. Outcomes1, Outcomes2, Glucose, ..."""
    names=[]
    for col,cats in zip(self.disc_colmns,self.categories):
      names.extend('%s%d' % (col,k+1) for k in range(len(cats)))
    names.extend(self.cont_colmns)
    return names

  def transformData(self): # one hotencoding of the data
    """Fit the encoders/scalers and return the encoded data as a 2-D array.

    Returns:
      numpy array of shape (n_rows, total one-hot width + n_continuous).

    Raises:
      ValueError: if neither discrete nor continuous columns were given.
    """
    blocks=[]
    self.categories=[]
    for col in self.disc_colmns:
      ohe = OneHotEncodingTransformer()
      fit_data = pd.DataFrame(self.data, columns=[col])
      ohe.fit(fit_data, col)
      num_categories = len(ohe.dummies)
      print(num_categories)
      encoded = ohe.transform(fit_data).to_numpy()
      # Remember which original value each one-hot column stands for, so the
      # encoding can be inverted later. If the encoder emitted more columns
      # than categories (presumably a missing-value indicator), pad with NaN.
      cats = list(ohe.dummies)
      cats += [np.nan] * (encoded.shape[1] - len(cats))
      self.categories.append(cats)
      blocks.append(encoded)
    self.transformations=[]
    for col in self.cont_colmns: # std scaling of the data
      scaler = preprocessing.StandardScaler()
      fit_data = pd.DataFrame(self.data, columns=[col]).to_numpy()
      scaler.fit(fit_data)
      blocks.append(scaler.transform(fit_data))
      self.transformations.append((scaler.mean_,scaler.scale_))
    if not blocks:
      raise ValueError('DataProcessing needs at least one discrete or continuous column')
    new_data=np.concatenate(blocks, axis=1)
    print('Before Transformation: ')
    print(self.data.head(5))
    print('After Transformation: ')
    df=pd.DataFrame(new_data,columns=self._encoded_column_names())
    print(df.head(5))
    return new_data

  def transformBack(self,data):
    """Map encoded rows back to the original column space.

    Args:
      data: array-like of shape (n_rows, encoded_width) (a single 1-D row is
        also accepted) in the layout produced by ``transformData``.

    Returns:
      DataFrame with one column per original discrete/continuous column.
      Discrete columns hold the category whose one-hot score is largest;
      continuous columns are un-scaled and rounded to the nearest integer.
    """
    data=np.asarray(data)
    if data.ndim==1:
      data=data.reshape(1,-1)
    df_prev=pd.DataFrame(data,columns=self._encoded_column_names())
    restored={}
    offset=0
    for col,cats in zip(self.disc_colmns,self.categories):
      width=len(cats)
      winners=np.argmax(data[:,offset:offset+width],axis=1)
      # Translate the winning one-hot position into the original label; the
      # position itself is NOT the label (column 0 may encode label 1).
      restored[col]=[cats[k] for k in winners]
      offset+=width
    print(len(self.cont_colmns))
    for i,col in enumerate(self.cont_colmns):
      m_,v_=self.transformations[i]
      # Continuous features start right after the one-hot columns (offset),
      # not at column 1.
      # NOTE(review): rounding is applied to every continuous column, which
      # also truncates genuinely fractional features - confirm this is wanted.
      restored[col]=np.rint(np.ravel(v_)[0]*data[:,offset+i]+np.ravel(m_)[0])
    print('Synthesized Data (Normed): ')
    print(df_prev.head(5))
    print('Synthesized Data (De-Normed): ')
    df=pd.DataFrame(restored,columns=list(self.disc_colmns)+list(self.cont_colmns))
    print(df.head(5))
    return df
def GHData(csv_path='/content/drive/MyDrive/DL/diabetes.csv'):
  """Load the diabetes CSV and run it through DataProcessing.

  Args:
    csv_path: location of the CSV file. Defaults to the Google Drive path
      used by this notebook, so existing ``GHData()`` calls are unaffected.

  Returns:
    (transform_data, dp): the encoded/scaled array returned by
    ``DataProcessing.transformData`` and the fitted ``DataProcessing``
    object, which is needed later to map synthetic rows back.
  """
  df=pd.read_csv(csv_path)
  dp=DataProcessing(df,['Outcomes'],['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigree','Age'])
  transform_data=dp.transformData()
  print('Dimension of Dataset: ', transform_data.shape)
  return transform_data,dp

In [12]:
# Run the preprocessing once: `vv` is the transformed array, `dp` the DataProcessing object that produced it.
vv,dp=GHData()

2
Before Transformation: 
   Outcomes  Pregnancies  Glucose  ...   BMI  DiabetesPedigree  Age
0         1            6      148  ...  33.6             0.627   50
1         0            1       85  ...  26.6             0.351   31
2         1            8      183  ...  23.3             0.672   32
3         0            1       89  ...  28.1             0.167   21
4         1            0      137  ...  43.1             2.288   33

[5 rows x 9 columns]
After Transformation: 
   Outcomes1  Outcomes2  Pregnancies  ...       BMI  DiabetesPedigree       Age
0        1.0        0.0     0.639947  ...  0.204013          0.468492  1.425995
1        0.0        1.0    -0.844885  ... -0.684422         -0.365061 -0.190672
2        1.0        0.0     1.233880  ... -1.103255          0.604397 -0.105584
3        0.0        1.0    -0.844885  ... -0.494043         -0.920763 -1.041549
4        1.0        0.0    -1.141852  ...  1.409746          5.484909 -0.020496

[5 rows x 10 columns]
Dimension of Datas

In [13]:
import numpy as np
import torch as T
import matplotlib.pyplot as plt

# Device that the dataset tensor, the model and the sampled latent vectors are moved to.
device = T.device("cpu") 

#####################  Dataset ################################################

class PIMADataset(T.utils.data.Dataset):
  """Dataset wrapper that serves rows of an already-encoded array as float32 tensors."""

  def __init__(self, data, n_rows=None):
    """Copy ``data`` into a float32 tensor on the module-level ``device``.

    Args:
      data: 2-D array-like of encoded rows.
      n_rows: if given, keep only the first ``n_rows`` rows (handy for quick
        experiments); ``None`` keeps everything. Previously this argument
        was accepted but ignored.
    """
    if n_rows is not None:
      data = data[:n_rows]
    self.x_data = T.tensor(data, dtype=T.float32).to(device) 

  def __len__(self):
    """Number of rows held by the dataset."""
    return len(self.x_data)

  def __getitem__(self, idx):
    """Return the row (or rows, for slice/tensor indices) at ``idx``."""
    return self.x_data[idx]

################### Variational Autoencoders ###################################

class VAE(T.nn.Module):
  """Variational autoencoder for rows made of a one-hot block plus continuous features.

  Encoder: input_dim -> enc1 -> enc2 -> enc3 -> (mean, logvar), each of size
  latent_dim. Decoder: latent_dim -> dec1 -> dec2 -> two heads, a softmax
  head of width n_disc and a linear head of width n_cont.
  """

  def __init__(self):
    super(VAE, self).__init__()

    self.n_disc = 2     # width of the one-hot block at the start of each row
    self.n_cont = 8     # number of scaled continuous features after it
    self.input_dim = self.n_disc + self.n_cont
    self.latent_dim = 256
    self.dec1=64
    self.dec2=64
    self.enc1=256
    self.enc2=128
    self.enc3=64

    # Layer creation order is kept unchanged so that seeded initialisation
    # produces the same weights as before.
    self.fc1 = T.nn.Linear(self.input_dim, self.enc1)
    self.fc11 = T.nn.Linear(self.enc1,self.enc2)
    self.fc12 = T.nn.Linear(self.enc2, self.enc3)
    self.fc2a = T.nn.Linear(self.enc3, self.latent_dim) 
    self.fc2b = T.nn.Linear(self.enc3, self.latent_dim) 
    self.fc3 = T.nn.Linear(self.latent_dim, self.dec1)
    self.fc31 = T.nn.Linear(self.dec1, self.dec2)
    # Both heads consume the output of fc31, whose width is dec2 (not dec1).
    self.fc41 = T.nn.Linear(self.dec2, self.n_disc)
    self.fc5 = T.nn.Linear(self.dec2, self.n_cont)

  def encode(self, x):
    """Return ``(mean, logvar)`` of the approximate posterior for ``x``.

    Both outputs are unconstrained linear activations. Squashing them with a
    sigmoid would restrict the mean to (0, 1) and the variance to (1, e), so
    the posterior could never match the N(0, I) prior used for sampling.
    """
    z = T.relu(self.fc1(x))    
    z = T.relu(self.fc11(z))    
    z = T.relu(self.fc12(z))    
    mean = self.fc2a(z)
    logvar = self.fc2b(z)
    return (mean, logvar)

  def decode(self, z): 
    """Decode latent vectors.

    Returns:
      (z11, z2): class probabilities for the one-hot block (softmax over the
      last dimension) and raw values for the continuous features.
    """
    z = T.relu(self.fc3(z)) 
    z = T.relu(self.fc31(z))  
    z11 = T.softmax(self.fc41(z),-1)  
    z2 = self.fc5(z) 
    return (z11,z2)


  def forward(self, x):
    """Encode, sample with the reparameterisation trick, and decode.

    Returns:
      (z11, z2, mean, logvar): the two decoder outputs plus the posterior
      parameters needed by the KL term of the loss.
    """
    (mean, logvar) = self.encode(x) 
    stdev = T.exp(0.5 * logvar)
    noise = T.randn_like(stdev)
    inpt = mean + (noise * stdev)   
    z11,z2 = self.decode(inpt)     
    return (z11,z2, mean, logvar)

# -----------------------------------------------------------

def cus_loss_func(z1,z2, x, mean, logvar, beta=1.0):
  """VAE loss: categorical cross-entropy + MSE + beta * KL divergence.

  Args:
    z1: decoder class probabilities (already softmaxed), shape (batch, n_disc).
    z2: decoder continuous outputs, shape (batch, n_cont).
    x: input rows laid out as [one-hot block | continuous features].
    mean, logvar: posterior parameters, shape (batch, latent_dim).
    beta: weight of the KL term.

  Returns:
    Scalar tensor. Every term is averaged over the batch, so the value does
    not scale with the batch size.
  """
  n_disc = z1.shape[1]
  n_cont = z2.shape[1]
  target = x[:, 0:n_disc]
  # z1 holds probabilities, so take the log directly. Passing it to
  # F.cross_entropy would apply a second softmax (and float targets there
  # need torch >= 1.10). The clamp guards against log(0).
  bce = -(target * T.log(z1.clamp_min(1e-8))).sum(dim=1).mean()
  bmse = T.nn.functional.mse_loss(z2, x[:, n_disc:n_disc + n_cont])
  # KL(q(z|x) || N(0, I)): summed over latent dimensions, averaged over the
  # batch to match the 'mean' reduction of the two reconstruction terms.
  kld = (-0.5 * T.sum(1 + logvar - T.pow(mean, 2) - T.exp(logvar), dim=1)).mean()
  return bce + bmse+(beta * kld)  # beta weights KLD component

# -----------------------------------------------------------

def train(vae, ds, bs, me, le, lr, beta):
  """Optimise ``vae`` on ``ds`` with Adam.

  Args:
    vae: model whose forward pass returns (z1, z2, mean, logvar).
    ds: dataset handed to a shuffling DataLoader.
    bs: batch size.
    me: number of epochs to run.
    le: log interval - the summed epoch loss is printed every ``le`` epochs.
    lr: Adam learning rate.
    beta: KL weight forwarded to ``cus_loss_func``.
  """
  vae.train()
  loader = T.utils.data.DataLoader(ds, batch_size=bs, shuffle=True)
  optimizer = T.optim.Adam(vae.parameters(), lr=lr)
  print("\nStarting training")
  for epoch in range(me):
    running_loss = 0.0  # sum of the batch losses seen in this epoch
    for batch in loader:
      optimizer.zero_grad()
      z1, z2, mean, logvar = vae(batch)
      loss_val = cus_loss_func(z1, z2, batch, mean, logvar, beta)
      loss_val.backward()
      running_loss += loss_val.item()
      optimizer.step()
    if epoch % le == 0:
      print("epoch = %4d   loss = %0.4f" % (epoch, running_loss))
  print("Done ")

################################# Main Function ############################### 

def main():
  """Train the VAE on the PIMA data and print one synthetic, de-normalised row."""
  # Set seed to reproduce results
  T.manual_seed(1)
  np.random.seed(1)
  np.set_printoptions(linewidth=36)

  # Create dataset 
  print("Creating PIMA Dataset ")
  data,dp=GHData()
  data_ds = PIMADataset(data) 

  # Create Model 
  vae = VAE().to(device)

  # Train Model
  bat_size = 64
  max_epochs = 500
  log_interval = 2
  lrn_rate = 0.0001
  beta = 1.0 
  n_synthetic = 1  # number of synthetic rows to draw after training

  print("\nbat_size = %3d " % bat_size)
  # The loss has three terms: cross-entropy, MSE and the beta-weighted KLD.
  print("loss = custom CE + MSE plus (beta * KLD) ")
  print("loss beta = %0.2f " % beta)
  print("optimizer = Adam")
  print("max_epochs = %3d " % max_epochs)
  # Four decimals are needed: %0.3f rendered 0.0001 as 0.000.
  print("lrn_rate = %0.4f " % lrn_rate)

  train(vae, data_ds, bat_size, max_epochs, 
    log_interval, lrn_rate, beta)

##########################Evaluation#####################################

  # create Synthetic Data 
  vae.eval()
  for i in range(n_synthetic):
    rinpt = T.randn(1, vae.latent_dim).to(device)  # Gaussian
    with T.no_grad():
      z1,z2= vae.decode(rinpt)
    # Convert explicitly to NumPy on the host; relying on implicit tensor ->
    # array conversion breaks as soon as `device` is not the CPU.
    si = np.concatenate((z1.cpu().numpy(),z2.cpu().numpy()),1)
    print(si.shape)
    dp.transformBack(si)

if __name__ == "__main__":
  main()


Creating PIMA Dataset 
2
Before Transformation: 
   Outcomes  Pregnancies  Glucose  ...   BMI  DiabetesPedigree  Age
0         1            6      148  ...  33.6             0.627   50
1         0            1       85  ...  26.6             0.351   31
2         1            8      183  ...  23.3             0.672   32
3         0            1       89  ...  28.1             0.167   21
4         1            0      137  ...  43.1             2.288   33

[5 rows x 9 columns]
After Transformation: 
   Outcomes1  Outcomes2  Pregnancies  ...       BMI  DiabetesPedigree       Age
0        1.0        0.0     0.639947  ...  0.204013          0.468492  1.425995
1        0.0        1.0    -0.844885  ... -0.684422         -0.365061 -0.190672
2        1.0        0.0     1.233880  ... -1.103255          0.604397 -0.105584
3        0.0        1.0    -0.844885  ... -0.494043         -0.920763 -1.041549
4        1.0        0.0    -1.141852  ...  1.409746          5.484909 -0.020496

[5 rows x 10 colu