In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Motivation

Take the CirCor Digiscope Phonocardiogram Dataset 1.0.3 summary table (indexed by patient id) and use its contents to create a summary table indexed on phonocardiogram, with train-val-test splits assigned. This will inform how we train our GAN.

The original CirCor data summary table can be found at this link:

https://physionet.org/content/circor-heart-sound/1.0.3/training_data.csv

# Reproducibility

Replace these variables with your own to reproduce the functionality of this file:
```
data_dir # str, directory in which you store circor_digiscope_by_patient.csv
```



# Loading Data

In [3]:
data_dir = "/content/drive/MyDrive/Stuff I Coded/PCG_synthesis/data"

In [4]:
patient_df = pd.read_csv(f"{data_dir}/circor_digiscope_by_patient.csv")

In [5]:
patient_df.head()

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,High,Harsh,,,,,,Abnormal,CC2015,
2,9983,AV+PV+TV+MV,Child,Male,115.0,19.1,False,Unknown,,,...,,,,,,,,Abnormal,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,Low,Harsh,,,,,,Abnormal,CC2015,


# Distributing Patients to Train, Val, Test Groups

In [6]:
# gathering array of all patient ids (and noting its length)
patient_ids = patient_df["Patient ID"].unique()
n = len(patient_ids)

In [7]:
# randomly shuffle all patient ids
np.random.seed(42)
np.random.shuffle(patient_ids)

In [8]:
# identifying what indexes to separate the train, val, test sets at
train_split = int(0.7 * n)
val_split = int(0.85 * n)

In [9]:
# extracting train, val, and test sets from the shuffled full id list
train_ids = patient_ids[:train_split]
val_ids = patient_ids[train_split:val_split]
test_ids = patient_ids[val_split:]

In [12]:
len(train_ids)

659

In [None]:
# sanity check: train, val, test lengths should add to original patient list len
len(train_ids) + len(val_ids) + len(test_ids) == n

True

# From Patient-Indexed to Phonocardiogram-Indexed: Indexing and Data Cleaning

In [None]:
# one patient can have multiple associated phonocardiograms (PCGs), each at
# a different location. This function extracts all those locations as a list
# for each patient
def locations_from_string(loc_str):
  """extract relevant heart locations (PV, TV, AV, MV, or Phc) from the
     special notation of the patient-indexed csv file

     inputs:
     - loc_str (str): patient-wise aggregated location data, in a string form

     outputs:
     - (list of str): list of locations extracted from loc_str

     example:
     - "PV+TV+AV+MV" -> ["PV", "TV", "AV", "MV"]
  """
  if type(loc_str) != str:
    return []
  substrings = loc_str.split('+')
  substrings = [substring.strip() for substring in substrings]
  return substrings

In [None]:
# create a dictionary whose keys are patients and whose values are
# lists of heart locations where PCGs were recorded for those patients
patients_and_locations = {}
for patient in patient_ids:
  patient_locs = locations_from_string(patient_df[patient_df["Patient ID"] == patient]["Recording locations:"].values[0])
  patients_and_locations[patient] = patient_locs

In [None]:
# create a dictionary whose keys are murmur patients and whose values are
# lists of heart locations where murmurs were detected for those patients
murmur_patient_ids = patient_df[patient_df["Murmur"] == "Present"]["Patient ID"].unique()

patients_and_murmur_locations = {}
for murmur_patient in murmur_patient_ids:
  murmur_patient_locs = locations_from_string(patient_df[patient_df["Patient ID"] == murmur_patient]["Murmur locations"].values[0])
  patients_and_murmur_locations[murmur_patient] = murmur_patient_locs

In [None]:
# for murmur patients:
# only consider pcg areas in which murmurs were also located. we will drop any
# other location from the pcg locations list for said patient
for murmur_patient in murmur_patient_ids:
  murmur_locations = patients_and_murmur_locations[murmur_patient]
  pcg_locations = patients_and_locations[murmur_patient]
  for pcg_location in pcg_locations:
    if pcg_location not in murmur_locations:
      patients_and_locations[murmur_patient] = [loc for loc in patients_and_locations[murmur_patient] if loc != pcg_location]

In [None]:
# filenames for this dataset have a convention where if a location appears more
# than once, it is given a numbered suffix. we recreate that with this function
def rename_repeat_locs(loc_list):
  """fix the names of locations if they are repeated more than once. This is
     important because it reflects the file name convention

     inputs:
     - loc_list (list of str): list of locations with relevant PCGs recorded

     outputs:
     - (list of str): modified list with new name convention for repeat elements

     example:
     - ['AV', 'AV', 'PV', 'PV', 'TV'] -> ['AV', 'AV', 'PV_1', 'PV_2', 'TV']
  """
  item_counts = {}
  output_list = []

  for item in loc_list:
      if item not in item_counts:
          item_counts[item] = 1
          output_list.append(item)
      else:
          # if item is a repeat, increment the count and rename
          item_counts[item] += 1
          new_item = f"{item}_{item_counts[item]}"
          if output_list[-1][-2] != "_":
            output_list[-1] = output_list[-1] + "_1"
          output_list.append(new_item)
  return output_list

In [None]:
# renaming location lists according to repeat naming convention for all patients
for patient in patients_and_locations.keys():
  locations_list = patients_and_locations[patient]
  updated_locations_list = rename_repeat_locs(locations_list)
  patients_and_locations[patient] = updated_locations_list

# Building the PCG-Based DF

In [None]:
# PCG (phonocardiogram) df will store records indexed by pcg, not patient id
pcg_df = pd.DataFrame(columns=['pcg_id','patient_id','pcg_location','murmur','outcome','split'])

In [None]:
# using previously cleaned locations data and indexing to create the df
for patient_id in patients_and_locations.keys():
  for pcg_loc in patients_and_locations[patient_id]:
    pcg_id = f"{str(patient_id)}_{pcg_loc}"
    patient_id = str(patient_id)
    pcg_location = pcg_loc
    murmur = patient_df[patient_df['Patient ID']==int(patient_id)]['Murmur'].values[0]
    outcome = patient_df[patient_df['Patient ID']==int(patient_id)]['Outcome'].values[0]
    if int(patient_id) in train_ids:
      split = "TRAIN"
    elif int(patient_id) in val_ids:
      split = "VAL"
    else:
      split = "TEST"
    pcg_df = pcg_df.append({'pcg_id': pcg_id, 'patient_id': patient_id, 'pcg_location': pcg_location, 'murmur': murmur, 'outcome': outcome, 'split': split}, ignore_index=True);

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  pcg_df = pcg_df.append({'pcg_id': pcg_id, 'patient_id': patient_id, 'pcg_location': pcg_location, 'murmur': murmur, 'outcome': outcome, 'split': split}, ignore_index=True);
  pcg_df = pcg_df.append({'pcg_id': pcg_id, 'patient_id': patient_id, 'pcg_location': pcg_location, 'murmur': murmur, 'outcome': outcome, 'split': split}, ignore_index=True);
  pcg_df = pcg_df.append({'pcg_id': pcg_id, 'patient_id': patient_id, 'pcg_location': pcg_location, 'murmur': murmur, 'outcome': outcome, 'split': split}, ignore_index=True);
  pcg_df = pcg_df.append({'pcg_id': pcg_id, 'patient_id': patient_id, 'pcg_location': pcg_location, 'murmur': murmur, 'outcome': outcome, 'split': split}, ignore_index=True);
  pcg_df = pcg_df.append({'pcg_id': pcg_id, 'patient_id': patient_id, 'pcg_location': pcg_location, 'murmur': murmur, 'outcome': outcome, 'split': split}, ignore_index=True);
  pcg_df = pcg_df.append({'pcg_id': pcg_id, 'patient_id': pa

# Save to CSV

In [None]:
# shuffle all the rows one last time (why not?)
pcg_df = pcg_df.sample(frac=1).reset_index(drop=True)

In [None]:
# saving csv file to same data_dir we specified earlier
pcg_df.to_csv(f'{data_dir}/circor_digiscope_by_pcg.csv', index=False)