## PID to CDM_ID File Renaming

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import shutil  # 파일 복사를 위해 shutil 모듈 사용

# 경로 설정
csv_path = r'C:\github\CDM\Holter\holter_pid.csv'
file_dir = r'C:\old_sig'
rename_dir = r'c:\old_rename'

# old_rename 폴더가 없으면 생성
if not os.path.exists(rename_dir):
    os.makedirs(rename_dir)

# CSV 파일 로드 및 딕셔너리 생성
df = pd.read_csv(csv_path)
pid_to_cdm_id = df.set_index('pid')['cdm_id'].to_dict()

# 파일 리스트 가져오기
files = [f for f in os.listdir(file_dir) if '_' in f]

converted_count = 0

# 파일 이름 변경 및 복사
for file in tqdm(files, desc="Renaming files"):
    try:
        parts = file.split('_')
        index_number, pid = parts[0], parts[1].split('.')[0]  # 파일명에서 인덱스 번호와 pid 추출
        extension = parts[1].split('.')[1]  # 파일 확장자 추출
        cdm_id = pid_to_cdm_id.get(int(pid))  # pid에 대응하는 cdm_id 찾기

        if cdm_id:
            # cdm_id를 사용해 새로운 파일명 생성
            new_filename = f'{index_number}_{int(cdm_id)}.{extension}'  

            old_file_path = os.path.join(file_dir, file)
            new_file_path = os.path.join(rename_dir, new_filename)
            # 파일 복사
            shutil.copy(old_file_path, new_file_path)
            converted_count += 1  
    except ValueError:
        print(f'Error processing file: {file}')

print(f'Task completed. A total of {converted_count} files were converted and copied to {rename_dir}.')


Renaming files:   5%|▍         | 225/4632 [00:00<00:01, 2209.13it/s]

Error processing file: 40016_09040928.hea
Error processing file: 40016_09040928.pdf
Error processing file: 40016_09040928.SIG
Error processing file: 40213_09105609.hea
Error processing file: 40213_09105609.pdf
Error processing file: 40213_09105609.SIG
Error processing file: 40402_07768718.hea
Error processing file: 40402_07768718.pdf
Error processing file: 40402_07768718.SIG
Error processing file: 40500_07746794.hea
Error processing file: 40500_07746794.pdf
Error processing file: 40500_07746794.SIG
Error processing file: 40511_09128789.hea
Error processing file: 40511_09128789.pdf
Error processing file: 40511_09128789.SIG
Error processing file: 40691_52171019.hea
Error processing file: 40691_52171019.pdf
Error processing file: 40691_52171019.SIG
Error processing file: 40710_08492797.hea
Error processing file: 40710_08492797.pdf
Error processing file: 40710_08492797.SIG
Error processing file: 40715_08634779.hea
Error processing file: 40715_08634779.pdf
Error processing file: 40715_08634

Renaming files:  13%|█▎        | 611/4632 [00:00<00:03, 1031.19it/s]

Error processing file: 44036_09291740.hea
Error processing file: 44036_09291740.pdf
Error processing file: 44036_09291740.SIG


Renaming files:  20%|██        | 929/4632 [00:00<00:04, 838.95it/s] 

Error processing file: 44135_08696809.hea
Error processing file: 44135_08696809.pdf
Error processing file: 44135_08696809.SIG
Error processing file: 44189_08264853.hea
Error processing file: 44189_08264853.pdf
Error processing file: 44189_08264853.SIG
Error processing file: 44202_08264853.hea
Error processing file: 44202_08264853.pdf
Error processing file: 44202_08264853.SIG


Renaming files:  34%|███▎      | 1555/4632 [00:01<00:02, 1281.21it/s]

Error processing file: 44296_09769919.hea
Error processing file: 44296_09769919.pdf
Error processing file: 44296_09769919.SIG
Error processing file: 44343_09187360.hea
Error processing file: 44343_09187360.pdf
Error processing file: 44343_09187360.SIG


Renaming files:  44%|████▎     | 2019/4632 [00:01<00:01, 1414.88it/s]

Error processing file: 44489_32105789.hea
Error processing file: 44489_32105789.pdf
Error processing file: 44489_32105789.SIG


Renaming files:  63%|██████▎   | 2903/4632 [00:02<00:01, 1334.77it/s]

Error processing file: 44810_75157856.hea
Error processing file: 44810_75157856.pdf
Error processing file: 44810_75157856.SIG
Error processing file: 44822_09040928.hea
Error processing file: 44822_09040928.pdf
Error processing file: 44822_09040928.SIG


Renaming files:  81%|████████  | 3737/4632 [00:02<00:00, 1564.68it/s]

Error processing file: 45109_09722237.hea
Error processing file: 45109_09722237.pdf
Error processing file: 45109_09722237.SIG
Error processing file: 45135_07768718.hea
Error processing file: 45135_07768718.pdf
Error processing file: 45135_07768718.SIG


Renaming files:  87%|████████▋ | 4036/4632 [00:03<00:00, 1331.95it/s]

Error processing file: 45229_08098474.hea
Error processing file: 45229_08098474.pdf
Error processing file: 45229_08098474.SIG


Renaming files:  97%|█████████▋| 4470/4632 [00:03<00:00, 1345.47it/s]

Error processing file: 45377_26594568.hea
Error processing file: 45377_26594568.pdf
Error processing file: 45377_26594568.SIG


Renaming files: 100%|██████████| 4632/4632 [00:03<00:00, 1264.98it/s]

Task completed. A total of 4515 files were converted and moved to c:\old_rename.


Renaming files:   5%|▌         | 792/14764 [08:38<3:08:54,  1.23it/s]

Error processing file: 41547_09310108.hea
Error processing file: 41547_09310108.pdf
Error processing file: 41547_09310108.SIG


Renaming files:  13%|█▎        | 1963/14764 [21:38<1:15:09,  2.84it/s]

Error processing file: 41942_09590315.hea
Error processing file: 41942_09590315.pdf
Error processing file: 41942_09590315.SIG


Renaming files:  13%|█▎        | 1969/14764 [21:40<1:03:00,  3.38it/s]

Error processing file: 41944_34416489.hea
Error processing file: 41944_34416489.pdf
Error processing file: 41944_34416489.SIG


Renaming files:  17%|█▋        | 2533/14764 [27:53<1:07:03,  3.04it/s]

Error processing file: 42135_48875905.hea
Error processing file: 42135_48875905.pdf
Error processing file: 42135_48875905.SIG


Renaming files:  19%|█▉        | 2770/14764 [30:28<1:09:14,  2.89it/s]

Error processing file: 42216_09295595.hea
Error processing file: 42216_09295595.pdf
Error processing file: 42216_09295595.SIG


Renaming files:  26%|██▌       | 3796/14764 [41:59<1:04:05,  2.85it/s]

Error processing file: 42575_08492797.hea
Error processing file: 42575_08492797.pdf
Error processing file: 42575_08492797.SIG


Renaming files:  28%|██▊       | 4195/14764 [46:36<1:57:25,  1.50it/s]


OSError: [Errno 22] Invalid argument

## Update .hea Filename 

In [10]:
import os
from tqdm import tqdm

def replace_filename_in_hea_files(directory):
    converted_files_count = 0
    
    # Traverse the directory and find .hea files
    for filename in tqdm(os.listdir(directory)):
        if filename.endswith('.hea'):
            file_path = os.path.join(directory, filename)

            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    lines = file.readlines()
                
                # Modify the line
                base_filename = filename.replace('.hea', '')
                lines[0] = lines[0].replace(lines[0].split()[0], base_filename)
                
                for i in range(1, len(lines)):
                    lines[i] = lines[i].replace(lines[i].split()[0], f"{base_filename}.SIG")
                
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.writelines(lines)
                
                converted_files_count += 1
            
            except Exception as e:
                print(f"Error processing file: {filename}. Error: {e}")
    
    print(f"Total number of converted files: {converted_files_count}")

directory_path = r'C:\old_rename'
replace_filename_in_hea_files(directory_path)


  0%|          | 0/10434 [00:00<?, ?it/s]

100%|██████████| 10434/10434 [00:02<00:00, 3547.28it/s]

Total number of converted files: 3468





## Report to XML

In [12]:
import os
import re
import fitz  # PyMuPDF
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom.minidom import parseString
from tqdm import tqdm

def extract_match(pattern, text, default="Unknown"):
    match = re.search(pattern, text)
    return match.group(1) if match else default

def extract_grouped_matches(pattern, text, groups, default="Unknown"):
    match = re.search(pattern, text)
    if match:
        return [match.group(i + 1) for i in range(groups)]
    return [default] * groups

def parse_general_section(text):
    general_section = re.search(r"General\n(.+?)Heart Rates", text, re.DOTALL)
    if general_section:
        general_text = general_section.group(1)
        qrs_complexes = extract_match(r"(\d+) QRS complexes", general_text)
        ventricular_beats = extract_match(r"(\d+) Ventricular beats", general_text)
        supraventricular_beats = extract_match(r"(\d+) Supraventricular beats", general_text)
        noise_percentage = extract_match(r"(<\s*\d+|\d+) % of total time classified as noise", general_text, "0")
        paced_beats = extract_match(r"(\d+) Paced beats", general_text)
        af_afl_percentage = extract_match(r"(<\s*\d+|\d+) % of total time in AF/AFL", general_text)
        bb_beats = extract_match(r"(\d+) BB beats", general_text)
        junctional_beats = extract_match(r"(\d+) Junctional beats", general_text)
        aberrant_beats = extract_match(r"(\d+) Aberrant beats", general_text)
    else:
        qrs_complexes = ventricular_beats = supraventricular_beats = noise_percentage = "Unknown"
        paced_beats = af_afl_percentage = bb_beats = junctional_beats = aberrant_beats = "Unknown"
    return {
        'QRScomplexes': qrs_complexes,
        'VentricularBeats': ventricular_beats,
        'SupraventricularBeats': supraventricular_beats,
        'NoisePercentage': noise_percentage,
        'PacedBeats': paced_beats,
        'AFAFLPercentage': af_afl_percentage,
        'BBBeats': bb_beats,
        'JunctionalBeats': junctional_beats,
        'AberrantBeats': aberrant_beats
    }

def parse_heart_rates_section(text):
    heart_rates_data = {}
    patterns = [
        (r"(\d+) Minimum at ([\d:]+ \d+-\w+)", 'MinimumRate', 'Timestamp'),
        (r"(\d+) Average", 'AverageRate', None),
        (r"(\d+) Maximum at ([\d:]+ \d+-\w+)", 'MaximumRate', 'Timestamp'),
        (r"(\d+)\s*Beats in tachycardia \(>=?\d+\s*bpm\),\s*(\d+)% total", 'TachycardiaBeats', 'TachycardiaPercentage'),
        (r"(\d+)\s*Beats in bradycardia \(<=?\d+\s*bpm\),\s*(\d+)% total", 'BradycardiaBeats', 'BradycardiaPercentage')
    ]
    for pattern, main_tag, sub_tag in patterns:
        match = re.search(pattern, text)
        if match:
            heart_rates_data[main_tag] = (match.group(1), match.group(2) if sub_tag else None)
        else:
            heart_rates_data[main_tag] = ("Unknown", "Unknown" if sub_tag else None)
    return heart_rates_data

def parse_section(section_text, patterns):
    section_data = {}
    for pattern, tags in patterns:
        matches = extract_grouped_matches(pattern, section_text, len(tags))
        for tag_index, tag in enumerate(tags):
            section_data[tag] = matches[tag_index]
    return section_data

def create_xml(patient_info, general_data, heart_rates_data, ventriculars_data, supraventriculars_data, xml_path):
    root = Element('HolterReport')
    patient_info_element = SubElement(root, 'PatientInfo')
    for key, value in patient_info.items():
        SubElement(patient_info_element, key).text = value

    general_element = SubElement(root, 'General')
    for key, value in general_data.items():
        SubElement(general_element, key).text = value

    heart_rates_element = SubElement(root, 'HeartRates')
    for key, (value, sub_value) in heart_rates_data.items():
        element = SubElement(heart_rates_element, key)
        if sub_value:
            SubElement(element, 'Timestamp').text = sub_value
        element.text = value

    ventriculars_element = SubElement(root, 'Ventriculars')
    for key, value in ventriculars_data.items():
        SubElement(ventriculars_element, key).text = value

    supraventriculars_element = SubElement(root, 'Supraventriculars')
    for key, value in supraventriculars_data.items():
        SubElement(supraventriculars_element, key).text = value

    xml_str = tostring(root, 'utf-8')
    parsed_str = parseString(xml_str)
    pretty_xml_str = parsed_str.toprettyxml(indent="   ")

    with open(xml_path, "w") as xml_file:
        xml_file.write(pretty_xml_str)

def process_pdf_files(file_dirs, xml_dir):
    pdf_files = []
    for file_dir in file_dirs:
        for root, _, files in os.walk(file_dir):
            for file in files:
                if file.endswith('.pdf'):
                    pdf_files.append(os.path.join(root, file))
    
    failed_files = []

    for pdf_path in tqdm(pdf_files, desc="Processing PDF Files"):
        try:
            filename = os.path.basename(pdf_path)
            pdf_doc = fitz.open(pdf_path)
            page = pdf_doc.load_page(0)
            extracted_text = page.get_text()

            patient_info = {
                'PID': extract_match(r"Patient Name:?\n(\d+)\nID:?", extracted_text, filename.split('_')[-1].replace('.pdf', '')),
                'HookupDate': extract_match(r"Medications:?\n(\d+-\w+-\d+)\nHookup Date:?", extracted_text, "Unknown"),
                'HookupTime': extract_match(r"Hookup Date:?\n(\d+:\d+:\d+)\nHookup Time:?", extracted_text, "Unknown"),
                'Duration': extract_match(r"Hookup Time:?\n(\d+:\d+:\d+)\nDuration:?", extracted_text, "Unknown"),
                'Age': extract_match(r"(\d+)\s*yr\s*Age:", extracted_text, "Unknown"),
                'Gender': extract_match(r"(Male|Female)\s*Gender:", extracted_text, "Unknown")
            }

            general_data = parse_general_section(extracted_text)

            heart_rates_data = parse_heart_rates_section(extracted_text)

            ventriculars_section = extract_match(r"Ventriculars \(V, F, E, I\)\n([\s\S]+?)\nSupraventriculars \(S, J, A\)", extracted_text, "")
            supraventriculars_section = extract_match(r"Supraventriculars \(S, J, A\)\n([\s\S]+?)Interpretation", extracted_text, "")

            ventriculars_patterns = [
                (r"(\d+) Isolated", ['Isolated']),
                (r"(\d+) Couplets", ['Couplets']),
                (r"(\d+) Bigeminal cycles", ['BigeminalCycles']),
                (r"(\d+) Runs totaling (\d+) beats", ['Runs', 'TotalBeats']),
                (r"(\d+) Beats longest run (\d+) bpm ([\d:]+ \d+-\w+)", ['LongestRunBeats', 'LongestRunBPM', 'LongestRunTimestamp']),
                (r"(\d+) Beats fastest run (\d+) bpm ([\d:]+ \d+-\w+)", ['FastestRunBeats', 'FastestRunBPM', 'FastestRunTimestamp'])
            ]

            supraventriculars_patterns = [
                (r"(\d+) Isolated", ['Isolated']),
                (r"(\d+) Couplets", ['Couplets']),
                (r"(\d+) Bigeminal cycles", ['BigeminalCycles']),
                (r"(\d+) Runs totaling (\d+) beats", ['Runs', 'TotalBeats']),
                (r"(\d+) Beats longest run (\d+) bpm ([\d:]+ \d+-\w+)", ['LongestRunBeats', 'LongestRunBPM', 'LongestRunTimestamp']),
                (r"(\d+) Beats fastest run (\d+) bpm ([\d:]+ \d+-\w+)", ['FastestRunBeats', 'FastestRunBPM', 'FastestRunTimestamp'])
            ]

            ventriculars_data = parse_section(ventriculars_section, ventriculars_patterns)
            supraventriculars_data = parse_section(supraventriculars_section, supraventriculars_patterns)

            xml_path = os.path.join(xml_dir, os.path.splitext(filename)[0] + '.xml')
            create_xml(patient_info, general_data, heart_rates_data, ventriculars_data, supraventriculars_data, xml_path)
            

        except Exception as e:
            print(f"Failed to process {filename}: {e}")
            failed_files.append(filename)

    return failed_files

def main():
    base_dirs = [
        r'C:\old_rename'
    ]
    xml_dir = r'C:\old_rename'

    if not os.path.exists(xml_dir):
        os.makedirs(xml_dir)

    print("Starting to process PDF files...")
    failed_files_record = process_pdf_files(base_dirs, xml_dir)

    if failed_files_record:
        print("\nFailed to process the following files:")
        for failed_file in failed_files_record:
            print(failed_file)
    else:
        print("\nAll PDF files processed successfully.")

    print("Completed processing all files.")

if __name__ == "__main__":
    main()


Starting to process PDF files...


Processing PDF Files:   8%|▊         | 273/3495 [00:04<00:42, 75.75it/s]

Failed to process 40289_2217257.pdf: Cannot open empty file: filename='C:\\old_rename\\40289_2217257.pdf'.


Processing PDF Files:  10%|▉         | 341/3495 [00:05<00:47, 66.32it/s]

Failed to process 40356_2058072.pdf: Failed to open file 'C:\\old_rename\\40356_2058072.pdf'.


Processing PDF Files:  32%|███▏      | 1104/3495 [00:14<00:32, 72.63it/s]

Failed to process 42714_3708617.pdf: Failed to open file 'C:\\old_rename\\42714_3708617.pdf'.


Processing PDF Files: 100%|██████████| 3495/3495 [00:43<00:00, 79.48it/s]


Failed to process the following files:
40289_2217257.pdf
40356_2058072.pdf
42714_3708617.pdf
Completed processing all files.





##  XML 파일을 읽고, PID 값을 파일명에 있는 값으로 변경

In [16]:
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm

# 폴더 경로
folder_path = r'C:\old_rename'

# 폴더 내의 모든 XML 파일을 확인
xml_files = [file for file in os.listdir(folder_path) if file.endswith('.xml')]

# tqdm을 사용하여 진행 상황 표시
for file_name in tqdm(xml_files, desc="Processing XML files"):
    file_path = os.path.join(folder_path, file_name)
    
    # 파일명에서 '_' 뒤의 값을 추출
    new_pid = file_name.split('_')[-1].split('.')[0]
    
    # XML 파일 로드
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    # PID 태그를 찾고 값을 새로운 PID로 변경
    for pid in root.iter('PID'):
        pid.text = new_pid
    
    # 변경된 XML을 파일에 다시 저장
    tree.write(file_path)

print("모든 XML 파일의 PID 값이 변경되었습니다.")


Processing XML files: 100%|██████████| 3492/3492 [00:10<00:00, 341.00it/s]

모든 XML 파일의 PID 값이 변경되었습니다.



