In [1]:
import os, sys, errno
import argparse
import json

import numpy as np
import pandas as pd
import keras

from misc import *
from interleave_convolutional import *
from dataset_utils import *
from gan_models import *

In [2]:
# Mount Google Drive inside the Colab runtime. Every data, model and output
# path used in this notebook lives under '/content/drive'.
from google.colab import drive
# force_remount=True asks Colab to mount again even if a mount already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Objective:
Create train, val, and test sets of synthetic phonocardiogram samples

# Parameters

General Parameters

In [3]:
# JSON table used to look up the murmur label of each real recording.
murmur_label_key_path = '/content/drive/MyDrive/Stuff I Coded/PCG_synthesis/model_development/real_murmur_key.json'
murmur_label_key_df = pd.read_json(murmur_label_key_path)
# Map first-column values to second-column values. Downstream code indexes this
# dict by wav-file stem and compares the result to "Present"/"Absent"/"Unknown",
# so column 0 is presumably the PCG id and column 1 the murmur label.
murmur_label_key = {
    key_value: label_value
    for key_value, label_value in zip(
        murmur_label_key_df.iloc[:, 0],
        murmur_label_key_df.iloc[:, 1],
    )
}

In [4]:
# Weight files, one per murmur class; each is passed to load_weights() on the
# matching generator when the models are built.
present_generator_path = "/content/drive/MyDrive/Stuff I Coded/PCG_synthesis/out/model_g_present_500.h5"
absent_generator_path = "/content/drive/MyDrive/Stuff I Coded/PCG_synthesis/out/model_g_absent_500.h5"
unknown_generator_path = "/content/drive/MyDrive/Stuff I Coded/PCG_synthesis/out/model_g_unknown_500.h5"
# .npy file with the normalization factors; read with np.load() (via NORM) and
# used to divide every generator output.
norm_path = "/content/drive/MyDrive/Stuff I Coded/PCG_synthesis/out/normalization.npy"

In [5]:
# Settings shared by model construction and sample generation.
NORM = norm_path     # file containing the normalization factors used
REPR = "dft"         # output type: dft, dct, or anything else for raw waveform
IL = True            # use ILConv to build model (always True for raw waveform)
BIAS = True          # generator output layer has a bias
LSIZE = 100          # number of latent dimensions (width of each latent vector z)
MSIZE = 64           # model size (first argument passed to the generator constructor)
R = 4000             # sample rate of output files
V = True             # output per-batch progress

In [6]:
# MODEL = model_path   # file containing the weights of the model to load

Set-Specific Parameters

In [7]:
# Root of the real phonocardiogram recordings and its per-split subfolders.
# Every value keeps a trailing slash.
REAL_DIR = "/content/drive/MyDrive/Stuff I Coded/PCG_synthesis/data2/real/phonocardiograms/"
REAL_TRAIN_DIR = f"{REAL_DIR}train/"
REAL_VAL_DIR = f"{REAL_DIR}val/"
REAL_TEST_DIR = f"{REAL_DIR}test/"

In [8]:
# Root of the synthetic (generated) phonocardiograms and its per-split
# subfolders. Every value keeps a trailing slash.
AI_DIR = "/content/drive/MyDrive/Stuff I Coded/PCG_synthesis/data2/ai/phonocardiograms/"
AI_TRAIN_DIR = f"{AI_DIR}train/"
AI_VAL_DIR = f"{AI_DIR}val/"
AI_TEST_DIR = f"{AI_DIR}test/"

In [9]:
# O = out_dir          # output directory to place wav files
# N = 14               # number of wav files to generate
# B = 5                # number of wav files to generate per network run

# Helper Functions
For a given set of real phonocardiograms, we count how many recordings the set contains and how many of them belong to each murmur type (Present, Absent, Unknown), and then generate a synthetic set of the same size and murmur composition.

In [10]:
# create and build model
# Normalization factors; every generator output is divided by these later on.
norm_f = np.load(NORM)

# Pick the generator constructor that matches the output representation.
# Both branches must test REPR: the previous `IL == 'dct'` compared a bool with
# a string, so REPR == 'dct' silently fell through to wave_generator.
if REPR == 'dft':
    gen_func = dft_generator if IL else dft_generator_tr
elif REPR == 'dct':
    gen_func = dct_generator if IL else dct_generator_tr
else:
    gen_func = wave_generator

# creating generators
# One network per murmur class, identical architecture, class-specific weights.
present_generator = gen_func(MSIZE, LSIZE, BIAS)
present_generator.load_weights(present_generator_path)

absent_generator = gen_func(MSIZE, LSIZE, BIAS)
absent_generator.load_weights(absent_generator_path)

unknown_generator = gen_func(MSIZE, LSIZE, BIAS)
unknown_generator.load_weights(unknown_generator_path)

In [11]:
# Murmur label -> trained generator. The keys are the label strings that
# generate_pcgs receives as `model_type`.
generators = dict([
    ("Present", present_generator),
    ("Absent", absent_generator),
    ("Unknown", unknown_generator),
])

In [12]:
def generate_pcgs(N, B, model_type, O, seed=177013):
  """Generates N synthetic PCG recordings with one of the trained generators
     Inputs:
       N (int): total number of wav files to generate
       B (int): maximum number of wav files to generate per network run
       model_type (str): key into `generators` ("Present", "Absent" or
          "Unknown") selecting which generator to sample from
       O (str): output directory handed to write_wav_dataset
       seed (int): seed of the RNG that draws the latent vectors. The default
          reproduces the historical output. Because the seed is fixed, two
          calls that use the same generator and the same seed produce the same
          leading samples; pass a different seed per call (e.g. per dataset
          split) when the generated sets must not overlap.
     Outputs:
       (none): batches are written through write_wav_dataset, with fname_init
          set to the index of the first file in each batch
     Raises:
       ValueError: if there is something to generate but B is not positive
       KeyError: if model_type is not a key of `generators`"""
  # A non-positive batch size can never advance wav_i; fail with a clear message
  # instead of an obscure downstream error or a loop that never ends.
  if N > 0 and B <= 0:
    raise ValueError('B (files per network run) must be positive, got {}'.format(B))

  generator = generators[model_type]
  wav_i = 0
  z_rs = np.random.RandomState(seed=seed)
  while wav_i < N:
    # the last batch may be smaller than B
    batch_size = min(B, N - wav_i)
    if V:
      print('generating files {:d}-{:d} (out of {:d})...'.format(wav_i, wav_i + batch_size - 1, N))
    # latent vectors are drawn uniformly from [-1, 1)
    z_in = z_rs.uniform(low=-1, high=1, size=(batch_size, LSIZE)).astype(np.float32)
    # undo the normalization applied to the training data
    G_z = generator.predict(z_in, batch_size=batch_size) / norm_f
    if REPR == 'dft':
      G_z = dft_transform_backward(G_z)
    elif REPR == 'dct':
      G_z = dct_transform_backward(G_z)
    else:
      # NOTE(review): np.squeeze drops every length-1 axis, including the batch
      # axis when batch_size == 1 -- confirm write_wav_dataset copes with that.
      G_z = np.squeeze(G_z)
    write_wav_dataset(G_z, O, fname_init=wav_i)
    wav_i += batch_size
  if V:
    print('done')

In [13]:
def generate_synthetic_set(real_pcg_dir, ai_pcg_dir, split):
  """Generates a synthetic PCG dataset of wav files
     Inputs:
       real_pcg_dir (str): reference directory. the distribution
          of present, absent, unknown murmur types from here will
          be reflected in the synthetic dataset. entries that are not
          .wav files are ignored
       ai_pcg_dir (str): directory to store synthetic PCG files (created if
          missing; a trailing slash is optional)
       split (str): whether this synthetic set is for training, validation,
          or testing; used as the prefix of the generated file names
     Outputs:
       (none): populates ai_pcg_dir with synthetic wav files according to the
          murmur type distribution found in real_pcg_dir. files are named
          '<split>_<murmur type in lower case>_<name written by generate_pcgs>'
     Raises:
       KeyError: if a wav file in real_pcg_dir has no entry in murmur_label_key
     Notes:
       generate_pcgs draws its latent vectors from an RNG with the same fixed
       seed on every call made here, so a given generator produces the same
       leading samples for every split."""
  murmur_types = ("Present", "Absent", "Unknown")

  # record number of murmur-present, murmur-absent, and murmur-unknown pcgs
  counts = dict.fromkeys(murmur_types, 0)
  for wav_filename in os.listdir(real_pcg_dir):
    pcg_id, extension = os.path.splitext(wav_filename)
    if extension.lower() != ".wav":
      # stray files (e.g. .DS_Store) would otherwise be mangled into a bogus id
      continue
    murmur_type = murmur_label_key[pcg_id]
    if murmur_type in counts:
      counts[murmur_type] += 1

  print(counts["Present"], counts["Absent"], counts["Unknown"])

  os.makedirs(ai_pcg_dir, exist_ok=True)

  # and generate appropriate numbers of synthetic pcgs for each murmur type
  for murmur_type in murmur_types:
    print("Converting {} Murmurs".format(murmur_type))
    generate_pcgs(counts[murmur_type], 100, murmur_type, ai_pcg_dir)
    # rename: freshly written files start with a digit; prefix them right away
    # so the next murmur type (whose numbering restarts) cannot overwrite them
    prefix = split + '_' + murmur_type.lower() + '_'
    for wav_filename in os.listdir(ai_pcg_dir):
      if wav_filename[0].isdigit():
        os.rename(os.path.join(ai_pcg_dir, wav_filename),
                  os.path.join(ai_pcg_dir, prefix + wav_filename))
  print("Done!")

In [14]:
# Build the synthetic training split, mirroring the murmur mix of the real one.
generate_synthetic_set(
    real_pcg_dir=REAL_TRAIN_DIR,
    ai_pcg_dir=AI_TRAIN_DIR,
    split='train',
)

347 1643 111
Converting Present Murmurs
generating files 0-99 (out of 347)...


KeyboardInterrupt: 

In [None]:
# Build the synthetic validation split, mirroring the murmur mix of the real one.
generate_synthetic_set(
    real_pcg_dir=REAL_VAL_DIR,
    ai_pcg_dir=AI_VAL_DIR,
    split='val',
)

71 383 21
Converting Present Murmurs
generating files 0-70 (out of 71)...
done
Converting Absent Murmurs
generating files 0-99 (out of 383)...
generating files 100-199 (out of 383)...
generating files 200-299 (out of 383)...
generating files 300-382 (out of 383)...
done
Converting Unknown Murmurs
generating files 0-20 (out of 21)...
done
Done!


In [None]:
# Build the synthetic test split, mirroring the murmur mix of the real one.
generate_synthetic_set(
    real_pcg_dir=REAL_TEST_DIR,
    ai_pcg_dir=AI_TEST_DIR,
    split='test',
)

81 365 24
Converting Present Murmurs
generating files 0-80 (out of 81)...
done
Converting Absent Murmurs
generating files 0-99 (out of 365)...
generating files 100-199 (out of 365)...
generating files 200-299 (out of 365)...
generating files 300-364 (out of 365)...
done
Converting Unknown Murmurs
generating files 0-23 (out of 24)...
done
Done!
