In [49]:
import dicom
import os
import sys
import pandas as pd
import numpy as np
import glob
import re

# Define scraper function.
def get_dcm_info(scan_dir, temp_path):
    # Set parameters to scrape from common header fields (and how they should be labeled).
    params_dict = {'scan_date': 'AcquisitionDate','scanner_institution':'InstitutionName',
                   'scanner_manufacturer':'Manufacturer','scanner_model':'ManufacturerModelName',
                   'field_strength':'MagneticFieldStrength','sequence_name':'SequenceName',
                   'repetition_time':'RepetitionTime','echo_time':'EchoTime','flip_angle':'FlipAngle',
                   'pixel_spacing':'PixelSpacing','slice_thickness':'SliceThickness',
                   'slice_spacing':'SpacingBetweenSlices','matrix_size':'AcquisitionMatrix'}
    # Set parameters to scrape from Siemens CSA header fields.
    csa_dict = {'n_slices':[0x19, 0x100a],'field_of_view':[0x51, 0x100c]}
    # Get a list of image files in the scan directory.
    img_list = os.listdir(scan_dir)
    # Initialize a dictionary with n_volumes as the first value.
    info_dict = {'n_volumes':len(img_list)}
    # If there are image files in the scan directory... 
    if len(img_list) > 0:
        # Get the path to the first file in the image list.
        file_path= '/'.join([scan_dir, img_list[0]])
        # Gracefully handle compressed DICOM files.
        if file_path.endswith('dcm.gz'):
            read_from_temp = True
            # Unzip the compressed DICOM to the temporary image path.
            os.system('gunzip -c {} > {}'.format(file_path, temp_path))
            print('1')
        else:
            read_from_temp = False
        # Try to read the DICOM.
        try:
            if read_from_temp == True:
                dcm_data = dicom.read_file(temp_path)
            else:
                dcm_data = dicom.read_file(file_path)
        # If unable to read the DICOM...
        except:
            # Print an error message.
            print('......unable to load DICOM, continuing to next scan.')
            # Fill dictionary values.
            for k in params_dict.keys():
                info_dict[k] = 'DICOM could not be loaded'
            for k in csa_dict.keys():
                info_dict[k] = 'DICOM could not be loaded'
            # Return the dictionary.
            return info_dict
        # If DICOM is loaded successfully...
        else:
            # Try to get information from standard DICOM headers.
            for k, v in params_dict.iteritems():
                try:
                    info_dict[k] = dcm_data.get(v)
                except:
                    info_dict[k] = 'Not Found'    
            # Try to get information from Siemens CSA headers.
            for k, v in csa_dict.iteritems():
                try:
                    info_dict[k] = dcm_data[hex(csa_dict[k][0]), hex(csa_dict[k][1])].value
                except:
                    info_dict[k] = 'Not Found'
            return info_dict
        finally:
            if read_from_temp == True:
                # Delete the temporary uncompressed file.
                os.system('rm {}'.format(temp_path))
    # If there are no images in the scan directory...
    else:
        # Fill dictionary values.
        for k in params_dict.keys():
            info_dict[k] = 'No Images Found'
        for k in csa_dict.keys():
            info_dict[k] = 'No Images Found'
        # Return the dictionary.
        return info_dict

In [2]:
rest_folders = glob.glob('/home/despo/rstate/data/CardiaData/*/CARDIA_CARDIA_Final*/BOLD_resting_*')

In [3]:
id_search = re.compile('(?<=CardiaData\/)(\S+)(?=\/CARDIA_CARDIA)')

In [54]:
os.path.basename(rest_folders[0])

'BOLD_resting_12'

In [51]:
get_dcm_info(rest_folders[0], '/home/despo/dlurie/tmp/tmp.dcm')

('/home/despo/rstate/data/CardiaData/20197/CARDIA_CARDIA_Final - 2/BOLD_resting_12/IM-0012-0001.dcm.gz', '/home/despo/dlurie/tmp/tmp.dcm')


CalledProcessError: Command '['gunzip', '-c', '/home/despo/rstate/data/CardiaData/20197/CARDIA_CARDIA_Final - 2/BOLD_resting_12/IM-0012-0001.dcm.gz', '>', '/home/despo/dlurie/tmp/tmp.dcm']' returned non-zero exit status 1

In [14]:
id_search.search(rest_folders[0]).groups()[0]

'20197'

In [None]:
tmp_path, out_path = sys.argv[1:]

rest_folders = glob.glob('/home/despo/rstate/data/CardiaData/*/CARDIA_CARDIA_Final*/BOLD_resting_*')
id_search = re.compile('(?<=CardiaData\/)(\S+)(?=\/CARDIA_CARDIA)')

dict_store = []
for sub_folder in rest_folders:
    sub_id = id_search.search(sub_folder).groups()[0]
    print('Processing subject {}...'.format(sub_id))
    print('...attempting to extract header information.')
    scan_dict = get_dcm_info(sub_folder, tmp_path)
    scan_dict['subject_id'] = sub_id
    scan_dict['scan_name'] = os.path.basename(sub_folder)
    dict_store.append(scan_dict)
    os.path.pardir

info_df = pd.DataFrame.from_dict(dict_store)
info_df.to_csv(out_path)