In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset

# Seed PyTorch's global random number generator so that torch-based randomness
# in this notebook (e.g. torch.randn noise, nn.Linear weight initialisation)
# is repeatable when the notebook is run from a fresh kernel.
torch.manual_seed(0)


<torch._C.Generator at 0x1105ce2b0>

In [2]:
class Generator(nn.Module):
    """Fully connected generator network.

    Three ``nn.Linear`` layers with ReLU activations in between and a Tanh
    on the output, so every generated value lies in [-1, 1].

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Width of both hidden layers.
        output_size: Number of features in each generated sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are instantiated in forward order; the resulting Sequential
        # is stored under ``net`` so state_dict keys stay ``net.<idx>.*``.
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        generated = self.net(x)
        return generated

class Discriminator(nn.Module):
    """Fully connected discriminator network.

    Three ``nn.Linear`` layers with ReLU activations in between and a
    Sigmoid on the output, so every score lies in [0, 1].

    Args:
        input_size: Number of features in each sample being scored.
        hidden_size: Width of both hidden layers.
        output_size: Number of output units per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are instantiated in forward order; the resulting Sequential
        # is stored under ``net`` so state_dict keys stay ``net.<idx>.*``.
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the scores."""
        scores = self.net(x)
        return scores


In [25]:
# --- Hyperparameters -------------------------------------------------------
input_size = 100        # length of each noise vector fed to the generator
hidden_size = 128       # width of the hidden layers in both networks
output_size = 10        # features per generated sample (one per data column)
batch_size = 64
num_epochs = 100
learning_rate = 0.0002

# --- Networks --------------------------------------------------------------
# The generator is built first, then the discriminator; the discriminator
# consumes generator outputs, hence its input width is ``output_size``.
generator = Generator(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
discriminator = Discriminator(
    input_size=output_size,
    hidden_size=hidden_size,
    output_size=1,
)

# --- Loss and optimizers ---------------------------------------------------
# Binary cross-entropy on the discriminator's sigmoid output; each network
# gets its own Adam optimizer with the same learning rate.
criterion = nn.BCELoss()
optimizer_g = optim.Adam(params=generator.parameters(), lr=learning_rate)
optimizer_d = optim.Adam(params=discriminator.parameters(), lr=learning_rate)


In [5]:
# Simulated patient data (used in this notebook as the "real" distribution the GAN learns from)
def generate_patient_data(num_samples, seed=0):
    """Simulate a table of patient measurements.

    All random draws come from a private ``numpy.random.RandomState`` seeded
    with ``seed``, so calling this function does not reset or advance
    NumPy's global random state. With the default ``seed=0`` the values are
    the same as those produced by seeding the global generator with 0 and
    drawing in the same order.

    Args:
        num_samples: Number of rows (patients) to generate.
        seed: Seed for the private random generator. Pass ``None`` to let
            NumPy pick a fresh, non-reproducible seed.

    Returns:
        pandas.DataFrame with ``num_samples`` rows and the columns
        'Height(cm)', 'Weight(kg)', 'FatFreeMass(kg)', 'BodyFatPercentage(%)',
        'BMI', 'SystolicBP(mmHg)', 'DiastolicBP(mmHg)', 'Age(years)',
        'Cholesterol(mg/dL)' and 'Glucose(mg/dL)'.
    """
    # Private generator: avoids the hidden side effect of reseeding np.random.
    # NOTE: the order of the draws below determines the generated values.
    rng = np.random.RandomState(seed)

    # Heights in cm (normal distribution)
    heights = rng.normal(loc=170, scale=10, size=num_samples)

    # Weights in kg (normal distribution)
    weights = rng.normal(loc=70, scale=15, size=num_samples)

    # Fat-free mass: a uniformly drawn 75%-85% share of body weight
    fat_free_mass = weights * rng.uniform(0.75, 0.85, num_samples)

    # Body fat percentage (derived from weight and fat-free mass)
    body_fat_percentage = 100 * (weights - fat_free_mass) / weights

    # Body mass index (BMI): weight in kg divided by squared height in metres
    bmi = weights / (heights / 100) ** 2

    # Blood pressure (systolic/diastolic) in mmHg
    systolic_bp = rng.normal(loc=120, scale=15, size=num_samples)
    diastolic_bp = rng.normal(loc=80, scale=10, size=num_samples)

    # Age in whole years, uniform over 18..89 (randint's upper bound is exclusive)
    age = rng.randint(18, 90, size=num_samples)

    # Cholesterol level (normal distribution)
    cholesterol = rng.normal(loc=200, scale=40, size=num_samples)

    # Glucose level (normal distribution)
    glucose = rng.normal(loc=100, scale=15, size=num_samples)

    # Pack all into a DataFrame
    patient_data = pd.DataFrame({
        'Height(cm)': heights,
        'Weight(kg)': weights,
        'FatFreeMass(kg)': fat_free_mass,
        'BodyFatPercentage(%)': body_fat_percentage,
        'BMI': bmi,
        'SystolicBP(mmHg)': systolic_bp,
        'DiastolicBP(mmHg)': diastolic_bp,
        'Age(years)': age,
        'Cholesterol(mg/dL)': cholesterol,
        'Glucose(mg/dL)': glucose
    })

    return patient_data

In [10]:
def create_data_loader(df, batch_size):
    """Wrap a DataFrame's values in a shuffling PyTorch ``DataLoader``.

    Args:
        df: DataFrame whose values can be cast to float32.
        batch_size: Number of rows per batch.

    Returns:
        DataLoader over a single-tensor ``TensorDataset``; each batch it
        yields is a one-element sequence holding a float32 tensor of rows.
    """
    # DataFrame -> float32 ndarray -> tensor, in one step.
    features = torch.tensor(df.values.astype(np.float32))
    return DataLoader(TensorDataset(features), batch_size=batch_size, shuffle=True)


In [9]:
from sklearn.preprocessing import MinMaxScaler

def scale_dataframe(df, feature_range=(-1, 1)):
    """Min-max scale every column of ``df`` into ``feature_range``.

    Args:
        df: DataFrame of numeric columns to scale.
        feature_range: ``(min, max)`` target range passed to ``MinMaxScaler``.

    Returns:
        Tuple ``(scaled_df, scaler)``: ``scaled_df`` has the same column
        labels and the same index as ``df``; ``scaler`` is the fitted
        ``MinMaxScaler``, which can later undo the scaling via
        ``scaler.inverse_transform``.
    """
    scaler = MinMaxScaler(feature_range=feature_range)
    scaled_values = scaler.fit_transform(df.values)
    # Carry over the index as well as the columns so scaled rows stay aligned
    # with the source frame even when its index is not a default RangeIndex.
    scaled_df = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)
    return scaled_df, scaler

In [26]:
# Simulate 10,000 patients; this frame plays the role of the "real" data.
synthetic_patient_data = generate_patient_data(num_samples=10000)

# Rescale every column to [-1, 1] and keep the fitted scaler so generated
# samples can be mapped back to the original units later.
scaled_patient_data, scaler = scale_dataframe(
    df=synthetic_patient_data,
    feature_range=(-1, 1),
)

# Batch size used by the loader below.
batch_size = 64
real_data_loader = create_data_loader(df=scaled_patient_data, batch_size=batch_size)

In [27]:
# Adversarial training. For every mini-batch the discriminator is updated once
# (on the real batch and on a detached fake batch of the same size), and then
# the generator is updated once using the same fake batch.
for epoch in range(num_epochs):
    for real_data_batch in real_data_loader:
        # real_data_batch is a tuple, we need to extract the tensor
        real_data = real_data_batch[0]  # Extract the tensor from the tuple
        
        # Train Discriminator
        optimizer_d.zero_grad()

        # Real samples get target 1. Labels are sized from the batch itself
        # rather than from batch_size, so a smaller batch is handled too.
        real_labels = torch.ones(real_data.size(0), 1)  # Adjust labels to match batch size
        outputs = discriminator(real_data)
        d_loss_real = criterion(outputs, real_labels)
        d_loss_real.backward()

        # Generate fake data (one noise vector per real sample); target is 0
        noise = torch.randn(real_data.size(0), input_size)
        fake_data = generator(noise)
        fake_labels = torch.zeros(real_data.size(0), 1)
        # detach() stops this backward pass from reaching the generator
        outputs = discriminator(fake_data.detach())
        d_loss_fake = criterion(outputs, fake_labels)
        d_loss_fake.backward()

        # Gradients of the real and fake losses have accumulated; one step applies both
        optimizer_d.step()

        # Train Generator
        optimizer_g.zero_grad()

        # NOTE: despite its name, fake_labels is all ones here: the generator
        # is trained so that the discriminator scores its output as real.
        fake_labels = torch.ones(real_data.size(0), 1)
        # No detach() this time, so gradients flow back into the generator.
        # This also leaves gradients on the discriminator's parameters; they
        # are cleared by optimizer_d.zero_grad() at the start of the next batch.
        outputs = discriminator(fake_data)
        g_loss = criterion(outputs, fake_labels)
        g_loss.backward()

        optimizer_g.step()

    # Log every 10th epoch. The losses printed are those of the last batch of
    # the epoch only, not an average over the epoch.
    if epoch % 10 == 0:
        print(f'Epoch [{epoch}/{num_epochs}], d_loss: {d_loss_real.item() + d_loss_fake.item()}, g_loss: {g_loss.item()}')


Epoch [0/100], d_loss: 1.2954261898994446, g_loss: 0.7078133225440979
Epoch [10/100], d_loss: 1.1561580300331116, g_loss: 0.8265936374664307
Epoch [20/100], d_loss: 1.3605216145515442, g_loss: 1.0222952365875244
Epoch [30/100], d_loss: 1.0828190445899963, g_loss: 0.885942280292511
Epoch [40/100], d_loss: 1.686695396900177, g_loss: 0.7025881409645081
Epoch [50/100], d_loss: 1.0966114401817322, g_loss: 1.1281818151474
Epoch [60/100], d_loss: 1.1407724618911743, g_loss: 1.026016116142273
Epoch [70/100], d_loss: 1.0989508032798767, g_loss: 0.9543361663818359
Epoch [80/100], d_loss: 1.6116958260536194, g_loss: 0.8197343349456787
Epoch [90/100], d_loss: 1.3687865138053894, g_loss: 0.867071807384491


In [28]:
def generate_synthetic_data(num_samples):
    """Sample ``num_samples`` rows from the module-level ``generator``.

    Noise vectors of length ``input_size`` (module-level) are drawn with
    ``torch.randn`` and passed through ``generator``.

    Args:
        num_samples: Number of samples to generate.

    Returns:
        numpy.ndarray with one row per sample, holding the generator's raw
        output (i.e. still in the scaled space the generator was trained on).
    """
    noise = torch.randn(num_samples, input_size)
    # Sampling is inference only: disable autograd so no computation graph is
    # built for the forward pass. The resulting tensor does not require grad
    # and can be converted to NumPy directly.
    with torch.no_grad():
        synthetic_data = generator(noise)
    return synthetic_data.numpy()

In [29]:
def inverse_transform_data(scaled_data, scaler):
    """Undo a scaler's transformation.

    Args:
        scaled_data: Data in the scaler's transformed space.
        scaler: Object exposing ``inverse_transform``.

    Returns:
        Whatever ``scaler.inverse_transform(scaled_data)`` returns.
    """
    restored = scaler.inverse_transform(scaled_data)
    return restored

In [30]:
# Draw samples from the trained generator (values are in the scaled space).
num_samples = 100
synthetic_data_scaled = generate_synthetic_data(num_samples=num_samples)

# Map the samples back to the original measurement units.
synthetic_data_original_scale = inverse_transform_data(
    scaled_data=synthetic_data_scaled,
    scaler=scaler,
)

# Label the columns like the reference data so the two frames are comparable.
synthetic_data_df = pd.DataFrame(
    data=synthetic_data_original_scale,
    columns=synthetic_patient_data.columns,
)


In [32]:
from scipy.stats import ks_2samp

def ks_test(real_data, synthetic_data):
    """Run a two-sample Kolmogorov-Smirnov test on every column.

    Args:
        real_data: DataFrame whose columns define which tests are run.
        synthetic_data: DataFrame containing the same column labels.

    Returns:
        DataFrame indexed by column name with the columns 'KS Statistic'
        and 'p-value'.
    """
    def _test_column(column):
        # Compare the two samples of a single column.
        stat, pval = ks_2samp(real_data[column], synthetic_data[column])
        return {'KS Statistic': stat, 'p-value': pval}

    per_column = {column: _test_column(column) for column in real_data.columns}
    # The dict is keyed by column name, so transpose to get one row per column.
    return pd.DataFrame(per_column).T

# Per-column KS test: simulated reference data vs. generator output.
ks_test_results = ks_test(
    real_data=synthetic_patient_data,
    synthetic_data=synthetic_data_df,
)
print(ks_test_results)


                      KS Statistic       p-value
Height(cm)                  0.6981  9.683222e-49
Weight(kg)                  0.6482  4.517264e-41
FatFreeMass(kg)             0.4877  3.299480e-22
BodyFatPercentage(%)        0.4197  2.602653e-16
BMI                         0.6476  5.507354e-41
SystolicBP(mmHg)            0.2133  1.976716e-04
DiastolicBP(mmHg)           0.4227  1.506139e-16
Age(years)                  0.2250  6.963363e-05
Cholesterol(mg/dL)          0.5776  8.199350e-32
Glucose(mg/dL)              0.4788  2.254870e-21


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_distribution_comparison(real_data, synthetic_data, column):
    """Overlay kernel-density estimates of one column from both datasets.

    Args:
        real_data: DataFrame containing ``column``; drawn in blue.
        synthetic_data: DataFrame containing ``column``; drawn in red.
        column: Label of the column to compare.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    # (frame, legend label, line colour) for each curve, in drawing order.
    curves = (
        (real_data, 'Real Data', 'blue'),
        (synthetic_data, 'Synthetic Data', 'red'),
    )
    for frame, label, color in curves:
        sns.kdeplot(frame[column], label=label, color=color, ax=ax)

    ax.set_title(f'Distribution Comparison: {column}')
    ax.legend()
    plt.show()

# One density comparison figure per column of the reference data.
for column in synthetic_patient_data.columns:
    plot_distribution_comparison(
        real_data=synthetic_patient_data,
        synthetic_data=synthetic_data_df,
        column=column,
    )



In [36]:
def compare_correlation_matrices(real_data, synthetic_data):
    """Return the element-wise absolute difference of two correlation matrices.

    Args:
        real_data: DataFrame whose ``.corr()`` matrix is the reference.
        synthetic_data: DataFrame whose ``.corr()`` matrix is compared to it.

    Returns:
        DataFrame of ``|corr(real_data) - corr(synthetic_data)|``.
    """
    gap = real_data.corr() - synthetic_data.corr()
    return np.abs(gap)

# Absolute gap between the correlation structure of the reference data and
# that of the generator output (0 means identical correlation).
correlation_diff = compare_correlation_matrices(
    real_data=synthetic_patient_data,
    synthetic_data=synthetic_data_df,
)
print(correlation_diff)


                      Height(cm)  Weight(kg)  FatFreeMass(kg)  \
Height(cm)              0.000000    0.422021         0.061648   
Weight(kg)              0.422021    0.000000         0.172881   
FatFreeMass(kg)         0.061648    0.172881         0.000000   
BodyFatPercentage(%)    0.086516    0.769369         0.631625   
BMI                     0.066238    0.040370         0.139766   
SystolicBP(mmHg)        0.619515    0.469480         0.062338   
DiastolicBP(mmHg)       0.572654    0.328451         0.071826   
Age(years)              0.508482    0.077814         0.414998   
Cholesterol(mg/dL)      0.498977    0.003942         0.199201   
Glucose(mg/dL)          0.118964    0.152242         0.278169   

                      BodyFatPercentage(%)       BMI  SystolicBP(mmHg)  \
Height(cm)                        0.086516  0.066238          0.619515   
Weight(kg)                        0.769369  0.040370          0.469480   
FatFreeMass(kg)                   0.631625  0.139766          

In [None]:
from sklearn.decomposition import PCA

def pca_analysis(real_data, synthetic_data, n_components=2):
    """Scatter both datasets in the principal-component space of the real data.

    The PCA is fitted on ``real_data`` only; ``synthetic_data`` is projected
    with the same fitted components.

    Args:
        real_data: Data used to fit the PCA; plotted in blue.
        synthetic_data: Data projected with the fitted PCA; plotted in red.
        n_components: Number of components to fit. Only the first two are
            plotted, so at least two are required.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    pca = PCA(n_components=n_components)

    # Fit on the real data first, then reuse the fitted basis for the other set.
    real_projection = pca.fit_transform(real_data)
    synthetic_projection = pca.transform(synthetic_data)

    _, ax = plt.subplots(figsize=(10, 6))
    clouds = (
        (real_projection, 'Real Data', 'blue'),
        (synthetic_projection, 'Synthetic Data', 'red'),
    )
    for points, label, color in clouds:
        ax.scatter(points[:, 0], points[:, 1], label=label, alpha=0.5, color=color)

    ax.set_title('PCA Analysis')
    ax.legend()
    plt.show()

# Project both datasets onto the first two principal components of the reference data.
pca_analysis(
    real_data=synthetic_patient_data,
    synthetic_data=synthetic_data_df,
)


In [None]:
def compare_mean_variance(real_data, synthetic_data):
    """Tabulate per-column mean and variance of both datasets and their gaps.

    Args:
        real_data: DataFrame whose columns define which comparisons are made.
        synthetic_data: DataFrame containing the same column labels.

    Returns:
        DataFrame indexed by column name with the columns 'Real Mean',
        'Synthetic Mean', 'Mean Difference', 'Real Variance',
        'Synthetic Variance' and 'Variance Difference'. Differences are
        absolute values; variances come from ``Series.var()``.
    """
    def _summarise(column):
        # Statistics for a single column of each frame.
        real_col = real_data[column]
        synthetic_col = synthetic_data[column]
        real_mean, real_var = real_col.mean(), real_col.var()
        synthetic_mean, synthetic_var = synthetic_col.mean(), synthetic_col.var()
        return {
            'Real Mean': real_mean,
            'Synthetic Mean': synthetic_mean,
            'Mean Difference': np.abs(real_mean - synthetic_mean),
            'Real Variance': real_var,
            'Synthetic Variance': synthetic_var,
            'Variance Difference': np.abs(real_var - synthetic_var),
        }

    summary = {column: _summarise(column) for column in real_data.columns}
    # The dict is keyed by column name, so transpose to get one row per column.
    return pd.DataFrame(summary).T

# Per-column mean/variance of the reference data vs. the generator output.
mean_variance_comparison = compare_mean_variance(
    real_data=synthetic_patient_data,
    synthetic_data=synthetic_data_df,
)
print(mean_variance_comparison)
