# Setup 

In [1]:
from pathlib import Path
import os

# Directory (relative to the working directory) that holds the project's source modules
SRCDIR = "dpgai"

# Root of the thesis working tree; data and results live in sub-folders of it
DIR = Path("../thesis")
DATADIR = DIR.joinpath("data")
RESULTDIR = DIR.joinpath("results")

!pwd

for p in [SRCDIR, DIR, DATADIR, RESULTDIR]:
    if not os.path.exists(p):
        print(f"{p} does not exist")

/home/ubuntu/thesis-dp-synth


In [2]:
# Record the version of the `python3` executable found on the shell PATH.
# NOTE(review): this runs in a subshell, so it is not guaranteed to be the kernel's own interpreter.
!python3 --version

Python 3.10.11


In [3]:
# Record the GPU, driver and CUDA versions and current GPU utilisation for this run
!nvidia-smi

Tue May 30 22:29:55 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:1E.0 Off |                    0 |
| N/A   42C    P8    16W /  70W |      2MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# %%capture
from __future__ import print_function
import argparse
import os
from datetime import datetime
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from IPython.display import HTML
import seaborn as sns
from tqdm import tqdm

In [5]:
# Print the execution time of every cell after it runs (see the "time: ..." lines in the outputs)
%load_ext autotime

# Uncomment to install mlflow (with its extras) if it is missing from the environment:
# !pip install mlflow[extras]

# Render matplotlib figures inline, using the high-resolution 'retina' format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set sensible defaults
# (seaborn's default theme first, then tick-style axes and the 'paper' context on top of it;
# the order matters because sns.set() would otherwise reset the style and context)
sns.set()
sns.set_style("ticks")
sns.set_context('paper')

time: 7.71 ms (started: 2023-05-30 22:29:57 +00:00)


In [6]:
import mlflow
import dotenv
# Load variables from a local `.env` file (if one is found) into the process environment
dotenv.load_dotenv()

# Address of the MLflow tracking server; os.getenv returns None when the variable is absent
MLFLOW_URI = os.getenv("MLFLOW_URI")
if not MLFLOW_URI:
    # Do not fail here, but make the situation visible: without an explicit URI
    # MLflow falls back to its default tracking location, so runs would not be
    # logged where the experiment expects them.
    print("MLFLOW_URI is not set; MLflow will use its default tracking URI")
mlflow.set_tracking_uri(MLFLOW_URI)

time: 331 ms (started: 2023-05-30 22:29:57 +00:00)


In [7]:
# Pick the compute device: the first CUDA GPU when one is usable, the CPU otherwise
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

cuda:0
time: 169 ms (started: 2023-05-30 22:29:58 +00:00)


In [8]:
# Set random seed for reproducibility.
# SEED is the fixed "meta" seed of the whole experiment: it is handed to
# run_synthesisers as `metaseed` in the synthesis loop. How individual runs
# derive their own seeds from it is decided there and is not visible here.
SEED = 999
# SEED = random.randint(1, 10000) # use if you want new results
print("Random (meta) seed: ", SEED)

Random (meta) seed:  999
time: 30.5 ms (started: 2023-05-30 22:29:58 +00:00)


# Synthesis loop

Synthesise data with multiple synthesisers across multiple datasets, and persist the results.

In [9]:
# Re-import edited modules automatically before each cell executes (mode 2 = all modules),
# so changes to the local source code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

time: 23.8 ms (started: 2023-05-30 22:29:58 +00:00)


In [10]:
import sys

# Put SRCDIR on the module search path so the modules inside it can be imported
# by their top-level names. SRCDIR is a relative path, so it is resolved against
# the working directory at import time.
# The membership test keeps the cell idempotent: re-running it no longer appends
# a duplicate entry to sys.path each time.
if str(SRCDIR) not in sys.path:
    sys.path.append(str(SRCDIR))
print(sys.path)

['/home/ubuntu/thesis-dp-synth', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/ubuntu/.cache/pypoetry/virtualenvs/thesis-dp-synth-YuPmdaUF-py3.10/lib/python3.10/site-packages', 'dpgai']
time: 19.2 ms (started: 2023-05-30 22:29:58 +00:00)


In [11]:
# NOTE(review): the wildcard import supplies the four synthesiser classes used below
# (TabDM_Synthesiser, DPautoGAN_Synthesiser, WGAN_Synthesiser, PATEGAN_Synthesiser);
# importing them by name would make that dependency explicit.
from dpgai.models import *

# Training length shared by all synthesisers, and the number of diffusion steps
# used by the TabDM variants
EPOCHS = 5
DIFFUSION_STEPS = 3


def _tabdm_entry(predict_noise):
    """Build the registry entry for one TabDM variant.

    The two TabDM configurations are identical except for `predict_noise`, so
    they are generated from this single template. Fresh dict objects are created
    on every call, so the entries never share mutable state.
    """
    return (
        TabDM_Synthesiser,
        {
            "batch_size": 1024,
            "lr": 0.005,
            "dims": (128, 128),
            "mlflow_logging": True,
            # the epoch target counts every diffusion step of every epoch
            "epoch_target": EPOCHS * DIFFUSION_STEPS,
            "diffusion_steps": DIFFUSION_STEPS,
            "predict_noise": predict_noise,
        },
        {"n_epochs": EPOCHS, "verbose": True},
        {"use_raw_data": True},
    )


# Registry of synthesisers to run, keyed by display name. Every value is a 4-tuple:
#   (synthesiser class, hyper-parameter dict, training-option dict, data-option dict)
# NOTE(review): presumably the dicts are constructor kwargs, fit kwargs and
# data-preparation options respectively - confirm against run_synthesisers.
synthesisers = {
    "TableDiffusionDenoiser_Synthesiser": _tabdm_entry(predict_noise=False),
    "TableDiffusion_Synthesiser": _tabdm_entry(predict_noise=True),
    "DPautoGAN_Synthesiser": (
        DPautoGAN_Synthesiser,
        {
            "batch_size": 512,
            "latent_dim": 64,
            "gen_dims": (128, 128),
            "dis_dims": (128, 128),
            "gen_lr": 0.0001,
            "dis_lr": 0.0007,
            "ae_lr": 0.02,
            "ae_compress_dim": 16,
            "ae_eps_frac": 0.4,
            "epoch_target": EPOCHS,
            "mlflow_logging": True,
        },
        {"n_epochs": EPOCHS},
        {"use_raw_data": False},
    ),
    "DPWGAN_Synthesiser": (
        WGAN_Synthesiser,
        {
            "batch_size": 512,
            "gen_lr": 0.005,
            "dis_lr": 0.001,
            "latent_dim": 64,
            "n_critic": 2,
            "epoch_target": EPOCHS,
            "mlflow_logging": True,
        },
        {"n_epochs": EPOCHS},
        {"use_raw_data": False},
    ),
    "PATEGAN_Synthesiser": (
        PATEGAN_Synthesiser,
        {
            "batch_size": 1024,
            "gen_dims": (128, 128),
            "dis_dims": (128, 128),
            "gen_lr": 0.1,
            "dis_lr": 0.1,
            "latent_dim": 64,
            "num_teachers": 30,
            "teacher_iters": 8,
            "student_iters": 5,
            "epoch_target": EPOCHS,
            "mlflow_logging": True,
        },
        {"n_epochs": EPOCHS, "noise_multiplier": 0.0048},
        {"use_raw_data": False},
    ),
}

time: 234 ms (started: 2023-05-30 22:29:58 +00:00)


In [12]:
# Display the assembled registry so the exact configuration is recorded with the notebook output
synthesisers

{'TableDiffusionDenoiser_Synthesiser': (models.dp_tab_dm.TabDM_Synthesiser,
  {'batch_size': 1024,
   'lr': 0.005,
   'dims': (128, 128),
   'mlflow_logging': True,
   'epoch_target': 15,
   'diffusion_steps': 3,
   'predict_noise': False},
  {'n_epochs': 5, 'verbose': True},
  {'use_raw_data': True}),
 'TableDiffusion_Synthesiser': (models.dp_tab_dm.TabDM_Synthesiser,
  {'batch_size': 1024,
   'lr': 0.005,
   'dims': (128, 128),
   'mlflow_logging': True,
   'epoch_target': 15,
   'diffusion_steps': 3,
   'predict_noise': True},
  {'n_epochs': 5, 'verbose': True},
  {'use_raw_data': True}),
 'DPautoGAN_Synthesiser': (models.dp_auto_gan.DPautoGAN_Synthesiser,
  {'batch_size': 512,
   'latent_dim': 64,
   'gen_dims': (128, 128),
   'dis_dims': (128, 128),
   'gen_lr': 0.0001,
   'dis_lr': 0.0007,
   'ae_lr': 0.02,
   'ae_compress_dim': 16,
   'ae_eps_frac': 0.4,
   'epoch_target': 5,
   'mlflow_logging': True},
  {'n_epochs': 5},
  {'use_raw_data': False}),
 'DPWGAN_Synthesiser': (model

time: 25.6 ms (started: 2023-05-30 22:29:58 +00:00)


In [None]:
from dpgai.config import datasets
from dpgai.utilities import run_synthesisers

# Timestamp-based identifier, so every execution of this cell gets its own experiment name
exp_hash = datetime.now().strftime("%y%m%d_%H%M%S")
EXP_NAME = f"exp_{exp_hash}"

# Make directories for experiment EXP_NAME
EXP_PATH = RESULTDIR / EXP_NAME
FAKE_DSET_PATH = EXP_PATH / "fake_datasets"
# Creates EXP_PATH as well (parents=True); exist_ok=True makes the call safe to
# repeat and avoids the check-then-create race of exists() + makedirs().
FAKE_DSET_PATH.mkdir(parents=True, exist_ok=True)

# Register a dedicated MLflow experiment and keep its id for the runs below
exp_id = mlflow.create_experiment(EXP_NAME)

print(f"\n\nRunning experiment: {EXP_NAME}\n\n")

try:
    run_synthesisers(
        datasets=datasets,
        synthesisers=synthesisers,
        exp_name=EXP_NAME,
        exp_id=exp_id,
        datadir=DATADIR,
        repodir="./",
        epsilon_values=[0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0],
        repeats=10,
        metaseed=SEED,
        generate_fakes=True,
        # NOTE(review): this directory is not created here - presumably
        # run_synthesisers creates it; confirm.
        fake_sample_path=EXP_PATH / "samples",
        fake_data_path=FAKE_DSET_PATH,
        with_benchmark=True,
        ctgan_epochs=30,
        # Follow the device selected earlier in the notebook instead of
        # hard-coding True, so the cell also works on a CPU-only machine.
        cuda=DEVICE.type == "cuda",
    )
finally:
    # Always close any MLflow run that is still active, even when the
    # synthesis loop is interrupted or raises part-way through.
    mlflow.end_run()