Notebook to process the data from the AF-Raw-Data folder into a single CSV file.

In [1]:
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
from tqdm import tqdm

# Folders holding the raw data files and the class (control) files
path_data = 'data/AF-Raw-Data/AF Data/ECG_data'
path_class = 'data/AF-Raw-Data/AF Data/Class'

# Collect both file lists in alphabetical order; later cells pair them up by position
data_files = sorted(glob.glob(os.path.join(path_data, 'Data*')))
class_files = sorted(glob.glob(os.path.join(path_class, 'Control*')))

# custom function to handle the inconsistent fields
def custom_parser(row):
    """Split one raw text row into time, value, letter and description.

    The row is split on whitespace. The first three tokens become the time,
    the value (R-R interval in ms, per the original author's note) and the
    letter; all remaining tokens are re-joined with single spaces to form the
    description.

    Rows with fewer than three tokens no longer raise IndexError: the missing
    leading fields are returned as None and the description as ''.

    Parameters
    ----------
    row : str
        One line of a data file.

    Returns
    -------
    pandas.Series
        Four elements: [time, value, letter, description]. time, value and
        letter are strings (or missing); description is always a string.
    """
    parts = row.split()
    # Pad to at least three entries so short rows yield None instead of crashing
    time, value, letter = (parts[:3] + [None] * 3)[:3]
    # Everything after the third token is free text; may be empty
    description = ' '.join(parts[3:])
    return pd.Series([time, value, letter, description])

def process_data_file(data_filename):
    """Load one data file into a DataFrame with Time, Value, Letter and Description.

    Every line is first read whole into a single 'Row' column (the separator is
    the record-separator control character, presumably never present in the
    files) and then split field by field with ``custom_parser``.

    Parameters
    ----------
    data_filename : str
        Path of the data file to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'Time' (datetime parsed from HH:MM:SS), 'Value' (numeric,
        unparsable entries become NaN), 'Letter' and 'Description'.
    """
    # One column holding the untouched text of each line
    raw_lines = pd.read_csv(
        data_filename,
        sep='\x1e',
        header=None,
        names=['Row'],
    )

    # Expand each line into its four fields
    parsed = raw_lines['Row'].apply(custom_parser)
    parsed.columns = ['Time', 'Value', 'Letter', 'Description']

    # Convert the text fields to proper dtypes
    parsed['Time'] = pd.to_datetime(parsed['Time'], format='%H:%M:%S')
    parsed['Value'] = pd.to_numeric(parsed['Value'], errors='coerce')
    return parsed

def process_control_file(control_filename):
    """Load one class (control) file into a DataFrame with Time and Label.

    The file is read as whitespace-separated text without a header row.

    Parameters
    ----------
    control_filename : str
        Path of the control file to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'Time' (datetime parsed from HH:MM:SS:fraction) and 'Label'
        (as parsed by pandas from the second field).
    """
    # Raw string: '\s' is not a valid Python escape and triggers a warning in a plain literal
    df_ = pd.read_csv(control_filename, header=None, names=['Time', 'Label'], sep=r'\s+', engine='python')
    df_['Time'] = pd.to_datetime(df_['Time'], format='%H:%M:%S:%f')
    return df_

In [2]:
# Check if the data and class files are in the same order: the i-th data file and
# the i-th class file must carry the same number in their file names.
correct = 0
numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']

# zip() stops at the shorter list, so a missing file would otherwise go unnoticed
if len(data_files) != len(class_files):
    print(f'File count mismatch: {len(data_files)} data files vs {len(class_files)} class files')

# Loop through all file pairs
for data_file, class_file in zip(data_files, class_files):
    # Keep only the digits of the file name itself; digits in a directory name
    # must not leak into the extracted number
    data_file_number = ''.join([i for i in os.path.basename(data_file) if i in numbers])
    class_file_number = ''.join([i for i in os.path.basename(class_file) if i in numbers])
    # Convert to int and compare
    data_file_number = int(data_file_number)
    class_file_number = int(class_file_number)

    if data_file_number != class_file_number:
        print(f'{data_file_number} != {class_file_number}')
    else:
        correct += 1

print(f'Correct: {correct}')

Correct: 804


In [3]:
### OPTION WITH JOIN
# Cut every recording into fixed-size windows of values, label each window from the
# matching control file, and write the windows to CSV in checkpoints.

windows_size = 40
save_every = 50  # number of processed files between two checkpoint saves
columns = [f's_{i}' for i in range(windows_size)] + ['Class_Label']
data_samples_list = []
n_discard = 0
n = 0  # stays 0 when there are no files to process

file_pairs = tqdm(zip(data_files, class_files), total=len(data_files), desc="Processing Files")
for n, (data_file, class_file) in enumerate(file_pairs, start=1):

    data_df = process_data_file(data_file).drop(columns=['Letter', 'Description'])
    control_df = process_control_file(class_file)

    # Left join on Time: every data row is kept, rows without a control entry get a NaN label.
    # NOTE(review): the join needs exact Time equality, while data times are parsed at second
    # resolution and control times with a fractional part -- confirm the two line up.
    joined_df = data_df.merge(control_df, on='Time', how='left')
    # Carry each label forward to the following rows that have no label of their own
    joined_df['Label'] = joined_df['Label'].ffill()

    # Pull the two columns out once per file instead of re-slicing the whole frame per window
    values = joined_df['Value']
    labels = joined_df['Label']

    # Divide joined_df into windows of size windows_size; each kept window becomes one sample row
    for i in range(0, len(joined_df), windows_size):
        window_labels = labels.iloc[i:i + windows_size]

        # Discard the window if it contains a NaN or -1 label, or if it is the
        # incomplete window at the end of the file
        if (-1 in window_labels.values
                or window_labels.isnull().any()
                or len(window_labels) != windows_size):
            n_discard += 1
            continue

        # Label of the window is the majority vote of its row labels
        label = window_labels.mode().iloc[0]
        data_samples_list.append(values.iloc[i:i + windows_size].tolist() + [label])

    # Save a checkpoint every save_every files and start a fresh list to bound memory use
    if n % save_every == 0:
        data_samples = pd.DataFrame(data_samples_list, columns=columns)
        data_samples.to_csv(f'data/AF-Raw-Data/AF Data/data_samples_{windows_size}_{n-save_every}-{n}.csv', index=False)
        data_samples_list = []
        print('Saved', n-save_every, n)


# Only the samples collected since the last checkpoint are left at this point,
# so the sample count printed below covers that remainder, not the whole dataset.
data_samples = pd.DataFrame(data_samples_list, columns=columns)
print(f'#Discarded samples: {n_discard}')
print(f'#samples: {len(data_samples)}')

data_samples.to_csv(f'data/AF-Raw-Data/AF Data/data_samples_{windows_size}_last.csv', index=False)

Processing Files:  31%|███       | 250/804 [03:46<44:10,  4.78s/it]  

Saved 200 250


Processing Files:  37%|███▋      | 300/804 [07:48<45:08,  5.37s/it]

Saved 250 300


Processing Files:  44%|████▎     | 350/804 [12:02<55:17,  7.31s/it]

Saved 300 350


Processing Files:  50%|████▉     | 400/804 [15:45<25:41,  3.82s/it]

Saved 350 400


Processing Files:  56%|█████▌    | 450/804 [20:07<34:09,  5.79s/it]

Saved 400 450


Processing Files:  62%|██████▏   | 500/804 [24:04<40:41,  8.03s/it]

Saved 450 500


Processing Files:  68%|██████▊   | 550/804 [28:36<27:09,  6.42s/it]

Saved 500 550


Processing Files:  75%|███████▍  | 600/804 [32:52<13:55,  4.10s/it]

Saved 550 600


Processing Files:  81%|████████  | 650/804 [37:14<12:25,  4.84s/it]

Saved 600 650


Processing Files:  87%|████████▋ | 700/804 [40:39<09:22,  5.41s/it]

Saved 650 700


Processing Files:  93%|█████████▎| 750/804 [44:33<06:02,  6.71s/it]

Saved 700 750


Processing Files: 100%|█████████▉| 800/804 [48:18<00:17,  4.32s/it]

Saved 750 800


Processing Files: 100%|██████████| 804/804 [48:34<00:00,  3.62s/it]

#Discarded samples: 18572
#samples: 7760





In [4]:
# Preview the samples still held in memory. The sample list is reset at every checkpoint
# save, so this frame only holds the windows collected after the last checkpoint.
data_samples

Unnamed: 0,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,...,s_31,s_32,s_33,s_34,s_35,s_36,s_37,s_38,s_39,Class_Label
0,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,...,5000,5000,5000,5000,5000,5000,5000,5000,5000,0.0
1,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,...,5000,5000,5000,5000,5000,5000,5000,5000,5000,0.0
2,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,...,5000,5000,5000,5000,5000,5000,5000,5000,5000,0.0
3,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,...,5000,5000,5000,5000,4425,365,325,255,500,0.0
4,480,265,305,600,490,270,645,950,230,5000,...,240,350,430,235,295,415,280,330,1425,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7755,710,700,710,710,715,715,720,710,720,720,...,705,720,700,720,720,725,725,725,730,0.0
7756,720,735,730,730,730,725,725,725,720,715,...,710,710,720,710,720,715,715,705,720,0.0
7757,710,710,705,705,710,705,700,700,715,705,...,715,725,715,720,720,725,715,730,725,0.0
7758,730,725,720,720,730,715,715,720,715,725,...,720,715,705,715,715,705,715,705,710,0.0


In [11]:
# Concatenate all the checkpoint files into one dataframe and write it back as a single CSV.
# NOTE(review): 'data_samples_40_200.csv' is not written by any cell visible in this
# notebook -- presumably it comes from an earlier run covering the first 200 files; confirm.

filepath = 'data/AF-Raw-Data/AF Data/'
# data_samples_40_200-250.csv ... data_samples_40_750-800.csv, followed by the final remainder
files_ = [f'data_samples_40_{start}-{start + 50}.csv' for start in range(200, 800, 50)]
files_.append('data_samples_40_last.csv')

# Read every part first and concatenate once; concatenating inside the loop
# would copy the growing frame on every iteration
frames = [pd.read_csv(filepath + f) for f in ['data_samples_40_200.csv'] + files_]
new_df = pd.concat(frames, ignore_index=True)

print(len(new_df))
new_df.to_csv(filepath + 'data_samples_40_all_.csv', index=False)

1728431
