In [1]:
import random
import numpy as np
import glob
import os
from pydicom import dcmread
import shutil
import pandas as pd
from os.path import expanduser
from datetime import datetime

In [2]:
# Run configuration.
# NOTE(review): these are absolute, machine-specific paths; consider deriving them
# from a configurable base directory before sharing the notebook.

# Excel workbook (in the user's home directory) that holds the patient-ID -> anonymised-ID mapping.
mapping_file = os.sep.join([expanduser('~'), 'dicom-anon-mapping.xlsx'])

# Root of the source tree; each immediate sub-directory is treated as one patient.
source_base_dir = '/Users/E119562/Downloads/Test-GBM-Sorted-3'

# Root under which the 'Brain-nnnn' output directories are created.
destination_base_dir = '/Users/E119562/Downloads/Test-GBM-Anon'

In [3]:
mapping_file

'/Users/E119562/dicom-anon-mapping.xlsx'

In [4]:
# Collect the immediate sub-directories of the source tree; each one is processed as a patient.
# os.listdir() returns entries in arbitrary order, and anonymised IDs are handed out in
# iteration order, so sort the names to make ID assignment reproducible between runs.
# NOTE(review): the directory names embed patient identifiers - clear this cell's output
# before sharing the notebook.
patient_dirs_l = sorted(
    name
    for name in os.listdir(source_base_dir)
    if os.path.isdir(os.path.join(source_base_dir, name))
)
patient_dirs_l

['210846_SIM^JEFFREY^R']

In [5]:
# Load the mapping saved by an earlier run when the workbook exists; otherwise use None
# to signal that there is no previous mapping.
mapping_df = pd.read_excel(mapping_file, index_col=0) if os.path.isfile(mapping_file) else None
mapping_df

Unnamed: 0,patient_id,anon_patient_dir_name,anon_patient_id,total_session_count,last_updated
0,190854,Brain-0001,1,3,2024-05-01 14:05:47
1,210523,Brain-0002,2,8,2024-05-01 14:08:15
2,210846,Brain-0003,3,44,2024-05-01 14:09:43


In [12]:
def get_anon_patient_id(patient_id, mapping_df):
    """Look up, or allocate, the anonymised ID for a patient.

    Parameters
    ----------
    patient_id
        The original patient ID, compared against the ``patient_id`` column.
    mapping_df
        DataFrame with ``patient_id`` and ``anon_patient_id`` columns, or ``None``
        when no mapping exists yet.

    Returns
    -------
    tuple
        ``(new_patient, anon_patient_id)``. ``new_patient`` is ``True`` when the ID
        was newly allocated and ``False`` when it was found in ``mapping_df``.
        ``anon_patient_id`` is a plain ``int``.
    """
    # No mapping at all, or a mapping without rows: numbering starts at 1.
    # The empty frame must be handled here because max() of an empty column is NaN,
    # which would otherwise propagate as the "next" ID.
    if mapping_df is None or len(mapping_df) == 0:
        print('no previous mapping - starting')
        return True, 1

    # have we seen this patient ID before?
    row_df = mapping_df[mapping_df.patient_id == patient_id]
    if len(row_df) > 0:
        # reuse the ID from the first matching row
        anon_patient_id = int(row_df.anon_patient_id.iloc[0])
        print('patient ID {} seen previously (anon ID {}) - appending to existing directory'.format(patient_id, anon_patient_id))
        return False, anon_patient_id

    # unseen patient: take the next ID after the highest one allocated so far
    anon_patient_id = int(mapping_df.anon_patient_id.max()) + 1
    print('patient ID {} not seen previously - adding new anon ID {} directory'.format(patient_id, anon_patient_id))
    return True, anon_patient_id

In [7]:
# Copy each patient's DICOM files into a 'Brain-nnnn' directory tree under
# destination_base_dir and record / refresh that patient's row in mapping_df.
#
# NOTE(review): each dataset is read with dcmread() and written back with save_as()
# without any header element being changed in between, so the copied files still
# carry whatever identifiers the source files contain. Only the directory name is
# anonymised by this cell - confirm whether de-identification happens elsewhere.

invalid_file_count = 0  # files dcmread() could not parse; must exist before the first '+= 1'
for patient_idx,patient_dir in enumerate(patient_dirs_l):
    # the part of the directory name before the first '_' is parsed as the numeric patient ID
    patient_id = int(patient_dir.split('_')[0])
    new_patient, anon_patient_id = get_anon_patient_id(patient_id, mapping_df)
    anon_patient_folder_name = 'Brain-{:04d}'.format(anon_patient_id)
    anon_patient_dir = os.path.join(destination_base_dir, anon_patient_folder_name)
    patient_full_dir = os.path.join(source_base_dir, patient_dir)
    # escape the directory part so glob metacharacters in a path ('[', '*', '?') are matched literally
    dicom_pattern = os.path.join(glob.escape(patient_full_dir), '**', '*.dcm')
    dicom_files = glob.glob(dicom_pattern, recursive=True)
    print('found {} source files'.format(len(dicom_files)))
    for source_file in dicom_files:
        rel_path = os.path.relpath(source_file, patient_full_dir)  # use the same relative path for source and target
        anon_patient_file = os.path.join(anon_patient_dir, rel_path)  # add the relative path to the 'Brain-nnnn' directory
        # load and process the file
        try:
            ds = dcmread(source_file)
        except Exception as e:
            # best effort: report the unreadable file (relative path only) and carry on
            print('could not read {}: {}'.format(rel_path, e))
            invalid_file_count += 1
        else:
            # create the missing directories all the way down to the DICOM file
            os.makedirs(os.path.dirname(anon_patient_file), exist_ok=True)
            ds.save_as(anon_patient_file)
    # count the total sessions anonymised for this patient: the immediate sub-directories
    # of the 'Brain-nnnn' directory. The directory does not exist when nothing was
    # written (no source files, or none readable), in which case the count is 0.
    if os.path.isdir(anon_patient_dir):
        anon_patient_sessions_l = [ name for name in os.listdir(anon_patient_dir) if os.path.isdir(os.path.join(anon_patient_dir, name)) ]
    else:
        anon_patient_sessions_l = []
    session_count = len(anon_patient_sessions_l)
    # add or update the mapping
    date_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    if new_patient:
        # build the row from a dict so every column keeps its natural dtype
        # (going through a mixed-type Series would make every column 'object')
        new_row_df = pd.DataFrame([{
            'patient_id': patient_id,
            'anon_patient_dir_name': anon_patient_folder_name,
            'anon_patient_id': anon_patient_id,
            'total_session_count': session_count,
            'last_updated': date_str,
        }])
        # mapping_df is None on the very first run; pd.concat drops None entries
        mapping_df = pd.concat([mapping_df, new_row_df]).reset_index(drop=True)
    else:
        row_index = mapping_df.loc[mapping_df['patient_id'] == patient_id].index[0]
        mapping_df.loc[row_index, 'total_session_count'] = session_count
        mapping_df.loc[row_index, 'last_updated'] = date_str

if invalid_file_count > 0:
    print('{} files could not be read and were skipped'.format(invalid_file_count))

patient ID 210846 seen previously (anon ID 3) - appending existing anon directory
found 84 source files


In [8]:
mapping_df

Unnamed: 0,patient_id,anon_patient_dir_name,anon_patient_id,total_session_count,last_updated
0,190854,Brain-0001,1,3,2024-05-01 14:05:47
1,210523,Brain-0002,2,8,2024-05-01 14:08:15
2,210846,Brain-0003,3,45,2024-05-01 14:16:05


In [9]:
# Persist the updated mapping to the Excel workbook at mapping_file so that a later
# run can load it again.
mapping_df.to_excel(mapping_file)

In [10]:
# Read the workbook back in, using its first column as the index
# (presumably a round-trip check of the file just written).
dff = pd.read_excel(mapping_file, index_col=0)

In [11]:
dff

Unnamed: 0,patient_id,anon_patient_dir_name,anon_patient_id,total_session_count,last_updated
0,190854,Brain-0001,1,3,2024-05-01 14:05:47
1,210523,Brain-0002,2,8,2024-05-01 14:08:15
2,210846,Brain-0003,3,45,2024-05-01 14:16:05
