## Download the CheXchoNet dataset

| Dataset  | Images | Size |
| -------- | ------ | ---- |
| [CheXchoNet](https://physionet.org/content/chexchonet/1.0.0/)  | 71,589 | 2.7 GB |

In [25]:
# Prompt for the PhysioNet account name used to authenticate the download
PHYSIONET_USERNAME = input("Physionet Username:")

In [26]:
# Download the files
# wget flags: -r recurses through the directory listing, -N skips files that
# are not newer than the local copy, -c resumes partially downloaded files,
# -np never ascends to the parent directory. --ask-password prompts for the
# password of the account entered above instead of putting it on the command line.
!wget -r -N -c -np --user {PHYSIONET_USERNAME} --ask-password https://physionet.org/files/chexchonet/1.0.0/

In [27]:
# Now move the images to the correct path
import os
DOWNLOAD_ROOT = "physionet.org/files/chexchonet/1.0.0/"
DOWNLOADED_IMAGE_PATH = os.path.join(DOWNLOAD_ROOT, "images")
OUTPUT_IMAGE_PATH = "./cxrs/"
if os.path.isdir(DOWNLOADED_IMAGE_PATH):
  # Use OUTPUT_IMAGE_PATH (normalised to drop the trailing slash) so that
  # changing the constant actually changes where the images end up.
  os.rename(DOWNLOADED_IMAGE_PATH, os.path.normpath(OUTPUT_IMAGE_PATH))
elif not os.path.isdir(OUTPUT_IMAGE_PATH):
  # Neither the download nor a previously moved copy exists.
  raise FileNotFoundError(
    f"No images found at {DOWNLOADED_IMAGE_PATH!r} or {OUTPUT_IMAGE_PATH!r}; run the download cell first"
  )
# Otherwise the images were already moved by an earlier run of this cell.

# Now move the csv to a local chexchonet folder
import pandas as pd
OUTPUT_METADATA_PATH = "./chexchonet/"
# exist_ok avoids the separate existence check and makes the cell re-runnable
os.makedirs(OUTPUT_METADATA_PATH, exist_ok=True)
metadata_df = pd.read_csv(os.path.join(DOWNLOAD_ROOT, "metadata.csv"))
metadata_df.to_csv(os.path.join(OUTPUT_METADATA_PATH, "metadata.csv"), index=False)

## Separate into Training, Validation, and Testing Splits

In [18]:
import pandas as pd

In [22]:
import os
import random

# Folder that holds metadata.csv and receives the per-split CSV files
OUTPUT_METADATA_PATH = "./chexchonet/"
# Set to an integer to make the patient shuffle reproducible; None leaves it unseeded
RANDOM_SEED = None

# Make sure the metadata folder is there before anything is written to it
if not os.path.exists(OUTPUT_METADATA_PATH):
  os.makedirs(OUTPUT_METADATA_PATH)

# Only touch the global `random` state when a seed was requested
if RANDOM_SEED is not None:
  random.seed(RANDOM_SEED)

In [23]:
# Read the local metadata copy and mirror the image filename column
# `cxr_filename` under the additional name `file_path`.
chexchonet_df = pd.read_csv(
  os.path.join(OUTPUT_METADATA_PATH, "metadata.csv")
).assign(file_path=lambda frame: frame['cxr_filename'])

def split_list(data, train_split=0.7, test_split=0.2, valid_split=0.1):
    """Randomly partition `data` into train, test and validation sets.

    The items are shuffled with the global `random` module (seed it beforehand
    for reproducibility). The first `train_split` fraction becomes the training
    set, the next `test_split` fraction the test set, and every remaining item
    the validation set. `valid_split` therefore only takes part in the
    validation of the fractions; fractions summing to less than 1.0 are
    accepted and the leftover items also land in the validation set.

    Args:
        data: Iterable of hashable items (e.g. patient ids). It is copied, so
            the caller's object is never reordered.
        train_split: Fraction of items assigned to the training set.
        test_split: Fraction of items assigned to the test set.
        valid_split: Nominal fraction of items assigned to the validation set.

    Returns:
        tuple: `(train, test, valid)` as three disjoint sets.

    Raises:
        ValueError: If a fraction is negative or the fractions sum to more
            than 1.0.
    """
    fractions = (train_split, test_split, valid_split)
    if any(fraction < 0 for fraction in fractions):
        raise ValueError("The splits must be non-negative")
    # Small tolerance so float rounding in the sum cannot reject valid fractions
    if sum(fractions) > 1.0 + 1e-9:
        raise ValueError("The splits must sum up to 1.0")

    # Shuffle a private copy: leaves the caller's data untouched and also
    # accepts immutable inputs such as tuples.
    shuffled = list(data)
    random.shuffle(shuffled)

    # Calculate the split indices
    train_end = int(train_split * len(shuffled))
    test_end = train_end + int(test_split * len(shuffled))

    return (
        set(shuffled[:train_end]),
        set(shuffled[train_end:test_end]),
        set(shuffled[test_end:]),
    )

# Split the unique patient ids into the three datasets, so that all images of
# one patient end up in the same split
train, test, valid = split_list(chexchonet_df['patient_id'].unique())
def map_set(v):
  """Return the split label ("train", "test" or "valid") for patient id `v`.

  Membership is checked against the module-level `train` and `test` sets;
  any id found in neither is labelled "valid".
  """
  if v in train:
    return "train"
  return "test" if v in test else "valid"

# Tag every row with the split of its patient; the diffusion and inference
# columns receive the same assignment
chexchonet_df['diffusion_set'] = chexchonet_df['patient_id'].map(map_set)
chexchonet_df['inference_set'] = chexchonet_df['patient_id'].map(map_set)

In [24]:
# Now output one metadata CSV per split
chexchonet_df[chexchonet_df['diffusion_set'] == "train"].to_csv(os.path.join(OUTPUT_METADATA_PATH, 'diffusion_metadata_train.csv'), index=False)
chexchonet_df[chexchonet_df['diffusion_set'] == "test"].to_csv(os.path.join(OUTPUT_METADATA_PATH, 'diffusion_metadata_test.csv'), index=False)
# The held-out rows are labelled "valid" in `diffusion_set` (no row is ever
# labelled "eval"), so filter on "valid"; the output file keeps its `_eval` name.
chexchonet_df[chexchonet_df['diffusion_set'] == "valid"].to_csv(os.path.join(OUTPUT_METADATA_PATH, 'diffusion_metadata_eval.csv'), index=False)

## Train the DDPM Model

In [None]:
from run import load_file
# YAML training configuration handed to run.load_file
CONFIG_FILE_PATH = "src/train/training_configs/class_diffusion_large_224.yaml"
# NOTE(review): load_file presumably parses the config into the argument
# object that run() consumes in the next cell — confirm in run.py
args = load_file(CONFIG_FILE_PATH)

In [None]:
from run import run
# Start the run with the arguments loaded from CONFIG_FILE_PATH above
run(args)

## Generate Synthetic Data

In [None]:
from gen_images import gen
import numpy as np

# Define paths and sizes passed to gen() below
# NOTE(review): MODEL_PATH points into a mounted Google Drive folder and names
# one specific training run — adjust it to the checkpoint you trained
MODEL_PATH = "drive/MyDrive/cxr/models/class_diffusion_large_224_8_2024-07-20/"
OUTPUT_DATA_PATH = "cxr_synthetic"
NUM_BATCHES = 10 # Total number of batches to run
BATCH_SIZE = 16 # Modify this based on available GPU RAM

# Define custom sample function
def sample_context(bs):
  """Draw `bs` random conditioning vectors.

  Each vector is a list holding, in this order:
    - age   (norm)
    - sex_m (one-hot)
    - sex_f (one-hot)
    - ivsd  (norm)
    - lvpwd (norm)
    - lvidd (norm)

  The sex flag is drawn uniformly from {0, 1}; age is drawn from a normal
  distribution with mean -0.5 and the three measurements from normal
  distributions with mean 0.5, all with unit scale.
  """
  # Draw all sex flags first so the random stream is consumed in a fixed order
  sex_flags = [np.random.choice([0,1]) for _ in range(bs)]

  contexts = []
  for is_male in sex_flags:
    age = np.random.normal(loc=-.5, scale=1.0)
    is_female = 1 if is_male == 0 else 0
    ivsd = np.random.normal(loc=.5, scale=1.0)
    lvpwd = np.random.normal(loc=.5, scale=1.0)
    lvidd = np.random.normal(loc=.5, scale=1.0)
    contexts.append([age, is_male, is_female, ivsd, lvpwd, lvidd])
  return contexts

# Run generation with the paths and batch settings defined above, drawing the
# conditioning vectors from sample_context.
# NOTE(review): gen presumably writes the images under OUTPUT_DATA_PATH and
# returns a table describing them — confirm in gen_images.py
df = gen(
    MODEL_PATH,
    OUTPUT_DATA_PATH,
    NUM_BATCHES,
    BATCH_SIZE,
    sample_fn=sample_context
)

In [None]:
# Preview the first rows of the object returned by gen
df.head()

## Evaluate the Synthetic Data

### Inception Score

In [None]:
import torch
import torchvision.transforms as transforms
from torchvision.models import inception_v3
import numpy as np
from scipy.stats import entropy

def inception_score(imgs, cuda=True, batch_size=32, resize=False, splits=1):
    """Compute the Inception Score of generated images.

    Every image is converted to a tensor and classified by torchvision's
    pretrained Inception v3. Within each split the score is
    ``exp(mean_x KL(p(y|x) || p(y)))``, where ``p(y|x)`` is the softmax of
    the network output for image ``x`` and ``p(y)`` is its mean over the
    split. The per-split KL means are averaged before exponentiation.

    Args:
        imgs (List or array): List or array of PIL Images. All images must
            have the same size after the optional resize so they can be
            stacked. Single-channel images are replicated to three channels.
        cuda (bool): Whether to use GPU. Falls back to the CPU with a
            warning when CUDA is not available.
        batch_size (int): Batch size for feeding into Inception v3.
        resize (bool): Resize to 299x299 before feeding into Inception.
        splits (int): Number of splits.

    Returns:
        float: The Inception Score.
    """
    import warnings

    assert batch_size > 0
    assert len(imgs) > 0
    assert splits > 0

    # Pick the device once; only individual batches are moved onto it.
    use_cuda = cuda and torch.cuda.is_available()
    if cuda and not use_cuda:
        warnings.warn(
            "CUDA was requested but is not available; "
            "computing the Inception Score on the CPU instead."
        )
    device = torch.device("cuda" if use_cuda else "cpu")

    # Set up the per-image preprocessing
    steps = [transforms.Resize((299, 299))] if resize else []
    steps.append(transforms.ToTensor())
    transform = transforms.Compose(steps)

    # Keep the stacked dataset in host memory. Casting the whole stack to a
    # CUDA tensor type would place every image on the GPU before batching.
    imgs = torch.stack([transform(img) for img in imgs], 0).float()
    if imgs.dim() == 4 and imgs.shape[1] == 1:
        # Inception v3 expects three input channels; replicate grayscale.
        imgs = imgs.expand(-1, 3, -1, -1)
    dataloader = torch.utils.data.DataLoader(imgs, batch_size=batch_size)

    # Load inception model
    inception_model = inception_v3(pretrained=True, transform_input=False)
    inception_model.eval()
    inception_model.to(device)

    # Get predictions, one batch at a time on the chosen device
    preds = []
    with torch.no_grad():
        for batch in dataloader:
            pred = inception_model(batch.to(device))
            preds.append(pred.cpu().numpy())

    # Now compute the mean kl-div
    preds = np.concatenate(preds, 0)
    scores = []
    for i in range(splits):
        part = preds[(i * preds.shape[0] // splits):((i + 1) * preds.shape[0] // splits), :]
        # Numerically stable softmax over the class axis
        p_yx = np.exp(part - np.max(part, axis=1, keepdims=True))
        p_yx /= p_yx.sum(axis=1, keepdims=True)
        p_y = np.mean(p_yx, axis=0)
        # entropy(pk, qk) is the KL divergence KL(pk || qk), here per image
        scores.append(entropy(p_yx, p_y, axis=1).mean())

    return np.exp(np.mean(scores))

In [None]:
# Score the training, test and generated image collections on the GPU.
# NOTE(review): train_images, test_images and gen_images are not defined in
# the visible cells — presumably lists of PIL images loaded beforehand; confirm.
train_is = inception_score(train_images, cuda=True)
test_is = inception_score(test_images, cuda=True)
gen_is = inception_score(gen_images, cuda=True)

# Report the three scores with four decimals
print(f"Train Score: {train_is:0.4f}")
print(f"Test Score: {test_is:0.4f}")
print(f"Gen Score: {gen_is:0.4f}")

### FID Score

In [None]:
# Install the pytorch-fid package, which provides the command-line FID tool used below
!pip install pytorch-fid

In [None]:
# Compute the FID between the generated images in OUTPUT_DATA_PATH and the
# images in images/chexchonet_train/, running on the first CUDA device.
# NOTE(review): images/chexchonet_train/ is not created by any visible cell — confirm it exists.
!python -m pytorch_fid {OUTPUT_DATA_PATH} images/chexchonet_train/ --device cuda:0