# Check for Missing Values

The script below checks whether each folder in the PTB-XL ECG image dataset contains the expected number of image files (each ending in `_lr-0.png`). The folders from 00000 to 20000 are each expected to hold 1000 files, except for the last folder, 21000, which is expected to contain 838 files (21000 to 21837). Any folder that is missing files is reported.

In [None]:
# The original dataset can be accessed here: https://physionet.org/content/ptb-xl/1.0.3/
# Kaggle version available at: https://www.kaggle.com/datasets/khyeh0719/ptb-xl-dataset

def check_missing_files_in_folders(base_path):
    """Return the names of dataset folders that hold fewer images than expected.

    Folders '00000', '01000', ..., '20000' are checked by counting the files
    whose name ends in '_lr-0.png'. Folder '21000' is checked by name against
    the files 21000_lr-0.png .. 21837_lr-0.png.

    Args:
        base_path: Directory that contains the numbered sub-folders.

    Returns:
        List of folder names (zero-padded strings) that are missing files, in
        ascending order. Folders that do not exist are reported on stdout and
        are not included in the list.
    """
    folders_with_missing_files = []

    # Standard folders: '00000', '01000', ..., '20000'
    for folder_num in range(0, 21000, 1000):
        folder_name = f"{folder_num:05d}"  # zero-padded, e.g. '00000'
        folder_path = os.path.join(base_path, folder_name)

        if not os.path.isdir(folder_path):
            print(f"Folder {folder_name} not found!")
            continue

        # File numbering starts at 00001 (there is no 00000_lr-0.png), so the
        # first folder is complete with 999 images; every other one needs 1000.
        expected_count = 999 if folder_num == 0 else 1000
        file_count = sum(
            1 for f in os.listdir(folder_path) if f.endswith("_lr-0.png")
        )
        if file_count < expected_count:
            folders_with_missing_files.append(folder_name)

    # Last folder '21000' is only partially filled: 21000 .. 21837 (838 files)
    last_folder = "21000"
    last_folder_path = os.path.join(base_path, last_folder)
    if os.path.isdir(last_folder_path):
        expected_files = {f"{i:05d}_lr-0.png" for i in range(21000, 21838)}
        if not expected_files.issubset(os.listdir(last_folder_path)):
            folders_with_missing_files.append(last_folder)
    else:
        # Report an absent folder the same way as for the standard folders
        print(f"Folder {last_folder} not found!")

    return folders_with_missing_files

# Root directory that holds the numbered image folders
base_path = "/kaggle/input/ptb-xl-ecg-image-gmc2024"
# Collect the folders that came up short
missing_folders = check_missing_files_in_folders(base_path)
# Report the outcome
if not missing_folders:
    print("All folders contain 1,000 or more files.")
else:
    print("Folders with missing files detected:")
    for folder in missing_folders:
        print(f"Folder {folder}")

The script below identifies the missing image files in one specific folder of the PTB-XL ECG image dataset, here the folder named 00000. Files in the dataset follow a sequential naming convention such as 00001_lr-0.png, 00002_lr-0.png, and so on. The script generates the set of all expected filenames within a given range (00001 to 00999 in this case) and compares it with the files actually present in the folder. The difference between the expected and the existing files gives the missing files. The script then prints the total number of missing files along with their filenames.

In [None]:
def find_missing_files(folder_path, start_num, end_num):
    """List the expected image files that are absent from *folder_path*.

    Expected names are '<number zero-padded to 5 digits>_lr-0.png' for every
    number from start_num to end_num inclusive. The result is a sorted list.
    """
    wanted = {f"{i:05d}_lr-0.png" for i in range(start_num, end_num + 1)}
    present = set(os.listdir(folder_path))
    # Keep only the wanted names that the directory listing does not contain
    return sorted(wanted.difference(present))

# Folder under inspection: '00000'
folder_path = "/kaggle/input/ptb-xl-ecg-image-gmc2024/00000"

# File numbers expected in this folder: 00001 through 00999
start_num, end_num = 1, 999

# Work out which of those files are absent
missing_files = find_missing_files(folder_path, start_num, end_num)

# Report the count followed by one file name per line
print(f"Number of missing files: {len(missing_files)}")
print("Missing files:")
for file in missing_files:
    print(file)

In [None]:
def find_missing_files(folder_path, start_num, end_num):
    """Return, sorted, the expected '_lr-0.png' files not found in *folder_path*.

    One file name is expected per number in start_num..end_num (inclusive),
    formatted as a 5-digit zero-padded number followed by '_lr-0.png'.
    """
    expected_names = {
        f"{number:05d}_lr-0.png" for number in range(start_num, end_num + 1)
    }
    names_on_disk = set(os.listdir(folder_path))
    absent = [name for name in expected_names if name not in names_on_disk]
    absent.sort()
    return absent

# Folder under inspection: '02000'
folder_path = "/kaggle/input/ptb-xl-ecg-image-gmc2024/02000"

# File numbers expected in this folder: 02000 through 02999
start_num, end_num = 2000, 2999

# Work out which of those files are absent
missing_files = find_missing_files(folder_path, start_num, end_num)

# Report the count followed by one file name per line
print(f"Number of missing files: {len(missing_files)}")
print("Missing files:")
for file in missing_files:
    print(file)

The script above checks for missing ECG image files in one specific folder (02000) of the PTB-XL dataset. The image files follow a consistent naming pattern, 02000_lr-0.png to 02999_lr-0.png, which corresponds to the numerical identifiers 2000 to 2999. The script generates the full set of expected filenames within that range and compares it with the files actually found in the folder. It identifies any missing files and prints both the total number of missing files and their names. This verification is repeated for every folder in the dataset — 00000, 01000, 02000, and so on, in increments of 1000, up to the folder 20000.

# Lowercase and Translation Process

In [None]:
# Load the PTB-XL metadata CSV and preview its first rows
ptbxlcsv_path = (
    '/kaggle/input/ptb-xl-dataset/'
    'ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/'
    'ptbxl_database.csv'
)
ptbxl = pd.read_csv(ptbxlcsv_path)
ptbxl.head()

In [None]:
# Report the dimensions of the dataset: rows first, then columns
n_rows, n_cols = ptbxl.shape[0], ptbxl.shape[1]
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

In [None]:
# List the column labels of the dataset
print("Columns/features in the dataset:")
print(ptbxl.columns)

# Per-column count of missing values across the whole dataset
null_counts = ptbxl.isnull().sum()
print("\nNumber of missing values in each column:")
print(null_counts)

In [None]:
# Preview only the ECG ID, the low-resolution file name and the report text
columns_of_interest = ['ecg_id', 'filename_lr', 'report']
ptbxl[columns_of_interest].head()

In [None]:
# Old column label -> new column label
column_aliases = {'ecg_id': 'id', 'filename_lr': 'name', 'report': 'caption'}
# Rename, then keep nothing but the three renamed columns
ptbxl = ptbxl.rename(columns=column_aliases)
ptbxl = ptbxl[list(column_aliases.values())]
# Preview the reduced dataset
ptbxl.head()

In [None]:
# Define a function to generate the full image path for each row
def generate_image_path(row):
    """Build the full image path for one row from its 'name' value.

    The folder is the first five characters of the second '/'-separated
    component of row['name']; the file name is the last component with
    '-0.png' appended.
    """
    # assumes 'name' looks like '<prefix>/<folder>/<file stem>' — TODO confirm
    components = row['name'].split('/')
    folder = components[1][:5]
    file_stem = components[-1]
    return f"/kaggle/input/ptb-xl-ecg-image-gmc2024/{folder}/{file_stem}-0.png"

# Row-wise image paths go into a new 'images' column
image_paths = ptbxl.apply(generate_image_path, axis=1)
ptbxl['images'] = image_paths
# Lowercase every caption
lowercase_captions = ptbxl['caption'].str.lower()
ptbxl['caption'] = lowercase_captions
# Restrict the DataFrame to the columns that are still needed
relevant_columns = ['id', 'name', 'caption', 'images']
ptbxl = ptbxl[relevant_columns]
# Show the first 11 rows of the result
ptbxl.head(11)

In [None]:
# Image files found to be missing during the earlier manual inspection.
# The list is generated from the file numbers instead of being spelled out.
_isolated_missing_ids = [
    137, 139, 140, 141, 142, 143, 145,
    456, 458, 459, 461, 462,
    2506, 2511,
    3795, 3798, 3800, 3801, 3832,
    5817,
    7777, 7779, 7782,
    9821, 9825, 9888,
    11810, 11814, 11815, 11817, 11838,
    13791, 13793, 13796, 13797, 13799,
    15742,
]
# 17872 .. 17999 is one unbroken run of missing files
_missing_ids = [*_isolated_missing_ids, *range(17872, 18000), 18150]

missing_files = [f"{file_id:05d}_lr-0.png" for file_id in _missing_ids]

In [None]:
# `ptbxl_cleaned` is not assigned anywhere before this cell, so build it here:
# drop every row whose image file is listed in `missing_files`.
# NOTE(review): the filter step appears to have been lost from the notebook;
# confirm this matches the original cleaning rule.
image_file_names = ptbxl['name'].str.split('/').str[-1] + '-0.png'
ptbxl_cleaned = ptbxl[~image_file_names.isin(missing_files)].copy()

# Number of rows in the cleaned DataFrame
jumlah_baris = ptbxl_cleaned.shape[0]
print(f"Number of rows after cleaning: {jumlah_baris}")

In [None]:
# Modify the 'name' column to match the PNG file naming format used in the image dataset:
# keep the last path component and make sure it ends in exactly one '-0.png'.
# A regex anchored at the end removes an existing '-0.png' suffix; str.rstrip
# would instead strip any trailing run of the characters '-', '0', '.', 'p',
# 'n', 'g', which only happens to work for names ending in '_lr'.
ptbxl_cleaned.loc[:, 'name'] = (
    ptbxl_cleaned['name']
    .str.split('/').str[-1]
    .str.replace(r'-0\.png$', '', regex=True)
    + '-0.png'
)

# Display the first 11 rows of the result
ptbxl_cleaned.head(11)

In [None]:
pip install deep-translator

In [None]:
from deep_translator import GoogleTranslator
from tqdm import tqdm

# Initialize tqdm for pandas to show a progress bar
tqdm.pandas()

# Create a copy to avoid SettingWithCopyWarning
ptbxl_cleaned = ptbxl_cleaned.copy()

# Build the translator once and reuse it for every row instead of
# constructing a new GoogleTranslator object per caption
translator = GoogleTranslator(source='de', target='en')

# Translate the 'caption' column from German to English with a progress bar
ptbxl_cleaned['caption_en'] = ptbxl_cleaned['caption'].progress_apply(
    translator.translate
)

In [None]:
# Discard the original 'caption' column and let the translated
# 'caption_en' column take over its name
ptbxl_cleaned = (
    ptbxl_cleaned
    .drop(columns=['caption'])
    .rename(columns={'caption_en': 'caption'})
)

# Show the first 11 rows of the updated DataFrame
ptbxl_cleaned.head(11)

In [None]:
# Write the translated DataFrame to the Kaggle working directory (no index column)
translated_output_path = '/kaggle/working/ptbxl_translated.csv'
ptbxl_cleaned.to_csv(translated_output_path, index=False)

# Standardization of Medical (ECG) Terms and Removal of Punctuation

The file `ptbxl_translated.csv` was originally saved in the Kaggle working directory. It was downloaded and then uploaded as input in the Kaggle Notebook.

In [None]:
# Define the new path where the translated CSV is stored in Kaggle input
translated_csv_path = '/kaggle/input/ptbxl_translated.csv'
# Read the CSV file into a DataFrame
ptbxl_translated = pd.read_csv(translated_csv_path)
# The cells that follow standardize, clean and save `ptbxl`. Point that name
# at the translated data so they process the English captions rather than the
# untranslated DataFrame (or fail when `ptbxl` is undefined in a new session).
ptbxl = ptbxl_translated
# Display the first few rows
ptbxl_translated.head()

In [None]:
import re

# List of abbreviations to check
abbreviations = ["@", "+", "&", "pac", "pacs", "sve", "sves", "apc", "apcs",
                 "pvc", "pvcs", "vpc", "vpcs", "ves", "ectopic", "ectopics", "ectopy",
                 "brady", "sb", "tachy", "tachycardia", "st", "svt", "nsvt", "sr", "nsr",
                 "af", "afib", "a-fib", "afl", "a-flutter", "aflutter", "cw", "rvr", "ppm",
                 "bpm", "pat", "pt", "bts", "wo", "w/o", "w", "w/", "hr", "avb"]

# Escape special characters for regex
escaped_abbreviations = [re.escape(abbr) for abbr in abbreviations]

# Count the captions that contain each abbreviation as a whole
# whitespace-delimited token. `\b` is not used because it needs a word
# character next to it, so `\b@\b`, `\b+\b` or `\bw/\b` cannot match those
# symbols when they stand alone. Whitespace boundaries also match how
# replace_abbreviations() splits captions, so a count reflects what it replaces.
abbreviation_counts = {
    abbr: ptbxl["caption"].str.contains(
        rf"(?<!\S){escaped}(?!\S)", case=False, na=False
    ).sum()
    for abbr, escaped in zip(abbreviations, escaped_abbreviations)
}

# Show only abbreviations that are present in the dataset
abbreviation_counts = {abbr: count for abbr, count in abbreviation_counts.items() if count > 0}
for abbr, count in abbreviation_counts.items():
    print(f"{abbr}: found {count} times")

In [None]:
# Lowercase abbreviation -> replacement text, grouped by replacement
abbreviation_dict = {
    # symbols
    "@": "at",
    "+": "and",
    "&": "and",
    # premature atrial contraction
    "pac": "premature atrial contraction",
    "pacs": "premature atrial contraction",
    "sve": "premature atrial contraction",
    "sves": "premature atrial contraction",
    "apc": "premature atrial contraction",
    "apcs": "premature atrial contraction",
    # premature ventricular contraction
    "pvc": "premature ventricular contraction",
    "pvcs": "premature ventricular contraction",
    "vpc": "premature ventricular contraction",
    "vpcs": "premature ventricular contraction",
    "ves": "premature ventricular contraction",
    # premature contraction
    "ectopic": "premature contraction",
    "ectopics": "premature contraction",
    "ectopy": "premature contraction",
    # bradycardia / tachycardia
    "brady": "bradycardia",
    "sb": "sinus bradycardia",
    "tachy": "tachycardia",
    "tachycardia": "tachycardia",
    "st": "sinus tachycardia",
    "svt": "supraventricular tachycardia",
    "nsvt": "nonsustained ventricular tachycardia",
    # sinus rhythm
    "sr": "sinus rhythm",
    "nsr": "normal sinus rhythm",
    # atrial fibrillation / flutter
    "af": "atrial fibrillation",
    "afib": "atrial fibrillation",
    "a-fib": "atrial fibrillation",
    "afl": "atrial flutter",
    "a-flutter": "atrial flutter",
    "aflutter": "atrial flutter",
    # other terms
    "cw": "continuous wave",
    "rvr": "rapid ventricular rate",
    "ppm": "permanent pacemaker",
    "bpm": "beats per minute",
    "pat": "patient",
    "pt": "patient",
    "bts": "beats",
    "wo": "without",
    "w/o": "without",
    "w": "with",
    "w/": "with",
    "hr": "heart rate",
    "avb": "atrioventricular block",
}

In [None]:
import re

def replace_abbreviations(text, abbrev_dict):
    """Expand whitespace-delimited abbreviations in *text*.

    The text is split on whitespace; each token whose lowercase form is a key
    of abbrev_dict is replaced by the mapped value, and all other tokens are
    kept as they are. Tokens are re-joined with single spaces, so runs of
    whitespace collapse to one space.

    Args:
        text: Caption to process. Non-string values (e.g. NaN or None for a
            missing caption) are returned unchanged.
        abbrev_dict: Mapping from lowercase abbreviation to replacement text.

    Returns:
        The caption with abbreviations expanded, or the original value when
        it is not a string.
    """
    # Missing captions arrive as float NaN / None and have no .split()
    if not isinstance(text, str):
        return text
    return " ".join(abbrev_dict.get(word.lower(), word) for word in text.split())

# Expand the abbreviations of every caption using abbreviation_dict
expanded_captions = ptbxl['caption'].apply(
    lambda caption: replace_abbreviations(caption, abbreviation_dict)
)
ptbxl['caption'] = expanded_captions

In [None]:
import string

# Character class of every punctuation mark except the hyphen, which is kept
punct_to_remove = string.punctuation.replace('-', '')
regex_pattern = f"[{re.escape(punct_to_remove)}]"

# Survey which punctuation characters occur in the captions; the hyphen is
# included in this survey even though it is not removed below
survey_pattern = rf"[{re.escape(punct_to_remove + '-')}]"
unique_punctuations = set()
for caption_text in ptbxl['caption'].dropna():
    unique_punctuations.update(re.findall(survey_pattern, caption_text))

print("Unique punctuation marks found (including hyphens):", unique_punctuations)

# Strip the punctuation (hyphens excepted) from every string caption;
# non-string values pass through untouched
ptbxl['caption'] = ptbxl['caption'].apply(
    lambda x: x if not isinstance(x, str) else re.sub(regex_pattern, '', x)
)

In [None]:
# Write the final captions to CSV (no index column) and confirm on stdout
output_csv_path = 'ptbxlengfinal.csv'
ptbxl.to_csv(path_or_buf=output_csv_path, index=False)
print("CSV saved successfully as " + output_csv_path)