In [None]:
# Used for processing raw data files after fetching from source
# Preprocess is performed for all data files in the directory, regardless of the CLASS


In [None]:
#### FOR ACADEMIC PROJECT WORK
# Check whether each library below is installed; install any that are missing.
# pip install torch
# pip install wfdb
# pip install shap
# pip install biosppy
# pip install PyWavelets
# pip install pandas
# pip install lightgbm

In [1]:
import re
from datetime import datetime
import shutil
import os
import pandas as pd
from glob import glob
import wfdb
# Key parameters shared by the cells below.

# Path separator of the current OS; used to split a record path into its components.
# (Added by Long, 23.Mar.24.)
split_type = os.sep

# Diagnosis code (as listed in a header's "Dx" comment) -> short class label.
# Several codes may map onto the same label.
dx_dict = {
    "426783006": "SNR",   # normal sinus rhythm
    "164889003": "AF",    # atrial fibrillation
    "270492004": "IAVB",  # first-degree atrioventricular block
    "164909002": "LBBB",  # left bundle branch block
    "713427006": "RBBB",  # complete right bundle branch block
    "59118001": "RBBB",   # right bundle branch block
    "284470004": "PAC",   # premature atrial contraction
    "63593006": "PAC",    # supraventricular premature beats
    "164884008": "PVC",   # ventricular ectopics
    "429622005": "STD",   # ST-segment depression
    "164931005": "STE",   # ST-segment elevation
}

# Class labels, in the column order used when building the label table.
classes = ["SNR", "AF", "IAVB", "LBBB", "RBBB", "PAC", "PVC", "STD", "STE"]

In [2]:
#1. FUNCTION READ, AND MODIFY FILES
#Read data from each INDIVIDUAL file, modify it in the right format
# and save it back

def process_file(file_name, n_signals=12):
    """Rewrite one header (.hea) file in place.

    Changes applied to the file content:
      * line 0: the current date/time ("%d-%b-%Y %H:%M:%S") is appended;
      * lines 1..n_signals that contain '.mat' (the signal lines): relative to
        the end of '.mat', 2 characters at offset +3 and then 5 characters at
        offset +11 are removed -- in the expected layout
        ``NAME.mat 16x1+24 1000.0(0)/mV ...`` these are ``x1`` and ``.0(0)``;
      * every line after the signal block, and any '#' line inside a header
        shorter than expected: ``'# '`` is replaced by ``'#'``.

    NOTE: the function is not idempotent -- running it twice on the same file
    appends a second timestamp and strips further characters from signal lines.

    Parameters
    ----------
    file_name : str
        Path of the header file; it is overwritten with the new content.
    n_signals : int, optional
        Number of signal lines expected after the first line (default 12).

    Prints 'No match' and leaves the file untouched when the file is empty.
    """
    with open(file_name, 'r') as file:
        main_content = file.read()

    if not main_content:
        print('No match')
        return

    lines = main_content.split('\n')
    dt_string = datetime.now().strftime("%d-%b-%Y %H:%M:%S")
    lines[0] = lines[0] + ' ' + dt_string  # append date time to first line

    for i in range(1, len(lines)):
        line = lines[i]
        ext_pos = line.find('.mat')
        if i <= n_signals and ext_pos != -1:
            # Offsets are measured from the first character after '.mat'.
            mat_loc = ext_pos + 4
            line = line[:mat_loc + 3] + line[mat_loc + 5:]    # remove 'x1'
            line = line[:mat_loc + 11] + line[mat_loc + 16:]  # remove '.0(0)'
        elif i > n_signals or line.startswith('#'):
            # Comment lines: turn "# text" into "#text".
            line = line.replace('# ', '#')
        # Any other line inside the signal block has no '.mat' to anchor the
        # offsets on, so it is left as it is instead of being sliced blindly.
        lines[i] = line

    # Overwrite the original file; the with-block closes it.
    with open(file_name, 'w') as file:
        file.write('\n'.join(lines))

In [3]:
#2. FUNCTION MOVE FILES
#Copy files from a sub-folder up into its parent (main) folder; the originals stay in place
def copy_files(folder, N):
    """Copy up to N regular files from `folder` into its parent directory.

    Parameters
    ----------
    folder : str
        Directory whose files are copied. A trailing path separator is allowed.
    N : int
        Maximum number of files to copy; 0 means "all files". Values larger
        than the number of files available are clamped, negative values copy
        nothing.

    Only regular files are copied (sub-directories are skipped), and they are
    taken in sorted name order so that a given N always selects the same files.
    """
    # abspath drops a trailing separator, so dirname() really is the parent.
    new_folder = os.path.dirname(os.path.abspath(folder))

    candidates = sorted(
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    )
    if N != 0:
        candidates = candidates[:max(N, 0)]

    for name in candidates:
        shutil.copy(os.path.join(folder, name), new_folder)
    print('Done, ', len(candidates), 'files copied in', folder)

In [4]:
#3. FUNCTION MOVE AND PROCESS FILES
# Call copy_files to copy files from each sub-folder into the main folder
# Call process_file to make the necessary changes to each .hea file in the main folder
#Modification is done for all files, regardless of the class

def main_raw_files_process(data_folder, N):
    """Flatten `data_folder` and rewrite every header file found in it.

    First, copy_files(sub_folder, N) is called for each directory directly
    inside `data_folder`. Afterwards process_file is called on each entry of
    `data_folder` whose name ends with '.hea'.

    Parameters
    ----------
    data_folder : str
        Main data directory.
    N : int
        Passed unchanged to copy_files for every sub-folder.
    """
    # Step 1: pull files up from the sub-folders (plain files are ignored here).
    for entry in os.listdir(data_folder):
        sub_path = os.path.join(data_folder, entry)
        if not os.path.isdir(sub_path):
            continue
        copy_files(sub_path, N)
    print('Done copying files from subfolders to main folder:', data_folder)

    # Step 2: list the folder again, so the freshly copied files are seen,
    # and process the header files.
    header_paths = [
        os.path.join(data_folder, entry)
        for entry in os.listdir(data_folder)
        if entry.endswith('.hea')
    ]
    for header_path in header_paths:
        process_file(header_path)
    print('Done processing folders:', data_folder)


In [5]:
#4. function to move files .hea and .mat from one folder to another.
#File names (without extension) are taken from the 'File' column of df

def move_files(df, source_folder, dest_folder):
    """Move the '.hea' and '.mat' file of every record listed in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'File' column holding record names without extension.
    source_folder : str
        Directory the files are currently in.
    dest_folder : str
        Directory the files are moved to.
    """
    extensions = ('.hea', '.mat')
    for _, record in df.iterrows():
        stem = record['File']
        # Header first, then signal file, one at a time.
        for suffix in extensions:
            shutil.move(
                os.path.join(source_folder, stem + suffix),
                os.path.join(dest_folder, stem + suffix),
            )
    print('Done moving files')

In [6]:
#5. Move files to corresponding folders based on the class
def _dx_from_comments(comments):
    """Return the code list of the first 'Dx:' comment, or '' if there is none."""
    for comment in comments:
        text = comment.strip()
        if text.startswith('Dx:'):
            return text[len('Dx:'):].strip()
    return ''


def conditional_move(data_dir,des_dir,split_type,dx_dict,classes):
    """Move the records of `data_dir` that belong to at least one wanted class.

    For every '*.hea' file in `data_dir` the record is read with wfdb.rdsamp,
    the diagnosis codes are taken from its 'Dx:' comment, translated through
    `dx_dict`, and turned into one 0/1 flag per entry of `classes`. Records
    with at least one flag set have their '.hea' and '.mat' files moved to
    `des_dir`.

    Parameters
    ----------
    data_dir : str
        Folder containing the '.hea' / '.mat' record files.
    des_dir : str
        Existing destination folder. If it does not exist nothing is moved and
        a message is printed.
    split_type : str
        Separator used to cut the record name out of its path.
    dx_dict : dict
        Diagnosis code (str) -> class label.
    classes : list of str
        Labels to keep; also the column order of the label table.

    Prints the number of kept records, the number of records read and the
    percentage kept (0.0 when no record was read).
    """
    # Phase 1: read the diagnosis string of every record.
    recordpaths = glob(os.path.join(data_dir, '*.hea'))
    results = []
    for recordpath in recordpaths:
        patient_id = recordpath.split(split_type)[-1][:-4]
        _, meta_data = wfdb.rdsamp(recordpath[:-4])
        # Look the Dx comment up by its prefix rather than by a fixed position.
        dx = _dx_from_comments(meta_data['comments'])
        results.append([patient_id, dx])
    df = pd.DataFrame(data=results, columns=['patient_id', 'dx'])

    # Phase 2: one 0/1 column per class.
    results = []
    for _, row in df.iterrows():
        # strip() so that "a, b" and "a,b" are treated alike.
        dxs = [dx_dict.get(code.strip(), '') for code in row['dx'].split(',')]
        labels = [1 if label in dxs else 0 for label in classes]
        results.append([row['patient_id']] + labels)
    df = pd.DataFrame(data=results, columns=['patient_id'] + classes)

    # Only keep records that carry at least one of the wanted classes.
    df = df[df[classes].sum(axis=1) > 0]

    n_total = len(results)
    # Guard the division: an empty folder would otherwise raise ZeroDivisionError.
    percentage = len(df) / n_total * 100 if n_total else 0.0
    print('Number of keep records:', len(df),
          '\nNumber of results:', n_total,
          '\nPercentage of keep records:', percentage)

    # Move the kept records to the destination folder.
    if not os.path.exists(des_dir):
        print('Destination folder does not exist:', des_dir)
        return

    for file_name in df['patient_id']:
        for ext in ['.hea', '.mat']:
            source_file = os.path.join(data_dir, file_name + ext)
            dest_file = os.path.join(des_dir, file_name + ext)
            shutil.move(source_file, dest_file)
    # Reported once, after all records have been moved.
    print('Done moving files')

In [None]:
#cpsc_2018_extra
#modify files from raw downloaded data, select files based on the class, move files to train_dataset or test_dataset
# Paths are built with os.path.join so they use the separator of the current OS
# (the same separator that split_type holds) and contain no backslash escapes.
source_folder = os.path.join('data', 'cpsc_2018_extra')
des_folder = os.path.join('data', 'op_09_classes', 'test_dataset')
main_raw_files_process(source_folder, 0)
conditional_move(source_folder,des_folder,split_type,dx_dict,classes)

In [None]:
#cpsc_2018
#modify files from raw downloaded data, select files based on the class, move files to train_dataset or test_dataset
# Paths are built with os.path.join so they use the separator of the current OS
# (the same separator that split_type holds) and contain no backslash escapes.
source_folder = os.path.join('data', 'cpsc_2018')
des_folder = os.path.join('data', 'op_09_classes', 'test_dataset')
main_raw_files_process(source_folder, 0)
conditional_move(source_folder,des_folder,split_type,dx_dict,classes)

In [None]:
#ptb-xl
#modify files from raw downloaded data, select files based on the class, move files to train_dataset or test_dataset
# Paths are built with os.path.join so they use the separator of the current OS
# (the same separator that split_type holds) and contain no backslash escapes.
source_folder = os.path.join('data', 'ptb-xl')
des_folder = os.path.join('data', 'op_09_classes', 'train_dataset')
main_raw_files_process(source_folder, 0)
conditional_move(source_folder,des_folder,split_type,dx_dict,classes)

In [7]:
#Handle zip files, after uploading from local to codespaces

# Single place to change when the data lives somewhere else.
data_root = '/scratch/project_2010942/data'
archive_dir = os.path.join(data_root, 'archived')

#loop through each zip file in data folder, and unzip it into the data folder
zip_files = glob(os.path.join(data_root, '*.zip'))
for zip_file in zip_files:
    print('Unzipping:', zip_file)
    shutil.unpack_archive(zip_file, data_root)
    print('Unzipped:', zip_file)

#create folder archived within data folder, and move all zip files to this folder, to backup
if not os.path.exists(archive_dir):
    os.makedirs(archive_dir)
    print('Folder created:', archive_dir)
for zip_file in zip_files:
    shutil.move(zip_file, archive_dir)
    print('Moved:', zip_file)

Unzipping: /scratch/project_2010942/data/op_09_classes.zip
Unzipped: /scratch/project_2010942/data/op_09_classes.zip
Unzipping: /scratch/project_2010942/data/op_08_classes.zip
Unzipped: /scratch/project_2010942/data/op_08_classes.zip
Unzipping: /scratch/project_2010942/data/cpsc_processed.zip
Unzipped: /scratch/project_2010942/data/cpsc_processed.zip
Folder created: /scratch/project_2010942/data/archived
Moved: /scratch/project_2010942/data/op_09_classes.zip
Moved: /scratch/project_2010942/data/op_08_classes.zip
Moved: /scratch/project_2010942/data/cpsc_processed.zip


In [None]:
#Verify results
#check number of files in each subfolder within data folder
folders = os.listdir('data')

for folder in folders:
    folder_name = os.path.join('data', folder)
    if folder in ('op_08_classes', 'op_09_classes'):
        #dataset folders: report each sub-folder separately
        for subfolder in os.listdir(folder_name):
            subfolder_name = os.path.join(folder_name, subfolder)
            #plain files next to the sub-folders cannot be listed, skip them
            if not os.path.isdir(subfolder_name):
                continue
            files = os.listdir(subfolder_name)
            print('Number of files in', subfolder_name, ':', len(files))
    elif folder == 'archived':
        #The previous test (folder == 'archived' and folder == '.gitkeep') could
        #never be true, so this branch never ran. '.gitkeep' is a file rather
        #than a folder, so only 'archived' is counted here.
        files = os.listdir(folder_name)
        print('Number of files in', folder_name, ':', len(files))




In [2]:
# Check distribution of classes in each dataset
#count number of classes in each dataset
def count_classes(label_file, classes):
    """Sum each class column of a label CSV.

    Parameters
    ----------
    label_file : str
        Path of a CSV file that has one column per entry of `classes`.
    classes : list of str
        Column names to total, in the order they should appear in the result.

    Returns
    -------
    pandas.DataFrame
        One row per class with the columns 'Class' and 'Count'.
    """
    labels_table = pd.read_csv(label_file)
    totals = [[name, labels_table[name].sum()] for name in classes]
    return pd.DataFrame(data=totals, columns=['Class', 'Count'])

# Class orderings used as label-column names: the 9-class list, and the
# 8-class list, which is the same sequence without 'PVC'.
classes_9 = ['SNR', 'AF', 'IAVB', 'LBBB', 'RBBB', 'PAC', 'PVC', 'STD', 'STE']
classes_8 = [label for label in classes_9 if label != 'PVC']



In [4]:
#Count class distribution in the TEST datasets.

# Label files inspected by this cell. Other label files that were checked the
# same way (define a variable for one of them and pass it to count_classes):
#   label_cpsc_8 = 'data/cpsc_processed/labels_8_classes.csv'
#   label_cpsc_9 = 'data/cpsc_processed/labels_9_classes.csv'
#   label_new_8  = 'data/op_08_classes/train_dataset/labels_8_classes.csv'
#   label_new_9  = 'data/op_09_classes/train_dataset/labels_9_classes.csv'
label_new_8_test = '/scratch/project_2010942/data/op_08_classes/test_dataset/labels_8_classes.csv'
label_new_9_test = '/scratch/project_2010942/data/op_09_classes/test_dataset/labels_9_classes.csv'

# One (Class, Count) table per label file.
df_new_8_test = count_classes(label_new_8_test, classes_8)
df_new_9_test = count_classes(label_new_9_test, classes_9)

# Print each table transposed, so the classes run across the columns.
for heading, table in (
    ('New 8 classes in TEST:', df_new_8_test),
    ('New 9 classes in TEST:', df_new_9_test),
):
    print(heading)
    print(table.T)


New 8 classes in TEST:
          0     1     2     3     4    5    6    7
Class   SNR    AF  IAVB  LBBB  RBBB  PAC  STD  STE
Count  1826  1780   247   205   454  258  402  176
New 9 classes in TEST:
         0     1     2     3     4    5    6    7    8
Class  SNR    AF  IAVB  LBBB  RBBB  PAC  PVC  STD  STE
Count  922  1374   828   274  1971  740  700  926  286


In [6]:
# Folds held out for validation; every other fold from 1 to 10 is used for training.
validations_folds = [4, 7]
train_folds = [fold for fold in range(1, 11) if fold not in validations_folds]
#count the sum of each class in the train folds and in the validation folds of each dataset
def count_classes_folds(label_file, classes, train_folds, validations_folds):
    """Sum each class column separately over the train and the validation folds.

    Parameters
    ----------
    label_file : str
        Path of a CSV file with a 'fold' column and one column per class.
    classes : list of str
        Class columns to total, in output order.
    train_folds : list
        Values of 'fold' that count as training data.
    validations_folds : list
        Values of 'fold' that count as validation data.

    Returns
    -------
    pandas.DataFrame
        One row per class with the columns 'Class', 'Train' and 'Validation'.
    """
    labels_df = pd.read_csv(label_file)
    # The fold selection does not depend on the class, so do it once up front
    # instead of once per class.
    train_rows = labels_df[labels_df['fold'].isin(train_folds)]
    valid_rows = labels_df[labels_df['fold'].isin(validations_folds)]

    results = [
        [label, train_rows[label].sum(), valid_rows[label].sum()]
        for label in classes
    ]
    return pd.DataFrame(data=results, columns=['Class', 'Train', 'Validation'])
# Per-class train/validation totals for the two TEST label files.
# The same call was used for the other label files (label_cpsc_8, label_cpsc_9,
# label_new_8, label_new_9), e.g.
#   count_classes_folds(label_cpsc_8, classes_8, train_folds, validations_folds)
df_new_8_test = count_classes_folds(label_new_8_test, classes_8, train_folds, validations_folds)
df_new_9_test = count_classes_folds(label_new_9_test, classes_9, train_folds, validations_folds)

# Print each table transposed, so the classes run across the columns.
for heading, table in (
    ('New 8 classes in TEST:', df_new_8_test),
    ('New 9 classes in TEST:', df_new_9_test),
):
    print(heading)
    print(table.T)



New 8 classes in TEST:
               0     1     2     3     4    5    6    7
Class        SNR    AF  IAVB  LBBB  RBBB  PAC  STD  STE
Train       1452  1420   202   164   365  209  322  142
Validation   374   360    45    41    89   49   80   34
New 9 classes in TEST:
              0     1     2     3     4    5    6    7    8
Class       SNR    AF  IAVB  LBBB  RBBB  PAC  PVC  STD  STE
Train       727  1096   658   223  1613  590  563  724  237
Validation  195   278   170    51   358  150  137  202   49
