In [2]:
import os

import re
import csv

import numpy as np
import pandas as pd

In [3]:
data_dir = '/labs/gevaertlab/users/hackhack/RTOG/scratch'

# Collect every non-empty folder found inside the "*Studies*" directories of
# data_dir:
#   all_files      -> '<study>/<folder>' names
#   all_files_path -> the matching full paths, in the same order
all_files = []
all_files_path = []
# sorted(): os.listdir() returns entries in arbitrary order; a fixed order keeps
# the result reproducible from one run to the next.
for study in sorted(os.listdir(data_dir)):
    study_path = os.path.join(data_dir, study)
    # Skip anything that is not a study directory; calling os.listdir() on a
    # stray file would raise NotADirectoryError.
    if 'Studies' not in study or not os.path.isdir(study_path):
        continue
    for filename in sorted(os.listdir(study_path)):
        filename_path = os.path.join(study_path, filename)
        # exclude stray files and empty folders (no dicoms inside)
        if os.path.isdir(filename_path) and len(os.listdir(filename_path)) > 0:
            full_name = study + '/' + filename
            all_files.append(full_name)
            all_files_path.append(filename_path)

In [6]:
# Scan-type markers. NOTE(review): not referenced by any cell visible in this
# notebook; the helper functions only look for 'MR' and 'CT'.
types_of_scan = ['MR', 'CT', 'PR']

# Keywords searched (as substrings of the lower-cased folder name) to guess
# the acquisition view.
AXIAL_VIEWS = [
    'ax',
    'axial',
]
SAGITAL_VIEWS = [
    'sag',
    'sagital',
]
CORONAL_VIEWS = [
    'cor',
    'coronal',
]

# Keywords searched (as substrings of the lower-cased folder name) to guess
# the modality. 'mprage' is used rather than 'rage' because 'rage' would also
# match 'average'.
MODALITIES = [
    't1',
    't2',
    'flair',
    'dwi',
    'gre',
    'blade',
    'propeller',
    'lava',
    'fame',
    'mprage',
]

In [7]:
# helper functions

def find_view(raw_file_name):
    """Guess the acquisition view from a folder name.

    The keyword lists are tried in a fixed priority order (axial, then
    sagital, then coronal); the first list with a keyword occurring as a
    substring of ``raw_file_name`` decides the result.

    Returns:
        'axial', 'sagital', 'coronal', or 'N/A' when no keyword is found.
    """
    candidates = (
        ('axial', AXIAL_VIEWS),
        ('sagital', SAGITAL_VIEWS),
        ('coronal', CORONAL_VIEWS),
    )
    for label, keywords in candidates:
        for keyword in keywords:
            if keyword in raw_file_name:
                return label
    return 'N/A'


def find_modality(raw_file_name):
    """Guess the modality from a folder name.

    Every entry of MODALITIES occurring as a substring of ``raw_file_name``
    counts as a hit.

    Returns:
        - 'N/A' when there is no hit;
        - for a single hit other than 't1': that keyword;
        - for a single 't1' hit: 't1pre' if the name contains 'pre',
          else 't1post' if it contains 'post' or 't1c',
          else 't1 pre or post?';
        - for several hits: 'flair' when they are exactly flair + t2,
          otherwise 'multiple modalities detected'.
    """
    hits = [name for name in MODALITIES if name in raw_file_name]

    if not hits:
        return 'N/A'

    if len(hits) > 1:
        # flair folders are commonly tagged with t2 as well
        if set(hits) == {'flair', 't2'}:
            return 'flair'
        return 'multiple modalities detected'

    only_hit = hits[0]
    if only_hit != 't1':
        return only_hit

    # single 't1' hit: try to tell pre- from post-contrast
    if 'pre' in raw_file_name:
        return 't1pre'
    if 'post' in raw_file_name or 't1c' in raw_file_name:
        return 't1post'
    return 't1 pre or post?'


def classify_patient_file(raw_file_name):
    """Extract patient id, view and modality from a raw folder name.

    The name is expected to look like ``<patient id><MR|CT><description>``,
    where the text before the scan-type marker starts with
    ``<digits>^<digits>^<digits>_<digits>``.

    Args:
        raw_file_name: folder name (not a full path), in its original casing.

    Returns:
        Tuple ``(patient_id, view, modality)``. All three are ``'N/A'`` when
        the name contains neither 'MR' nor 'CT', or when the text before the
        marker does not start with the expected id pattern. Otherwise
        ``patient_id`` is everything before the marker, and view / modality
        come from ``find_view`` / ``find_modality`` applied to the lower-cased
        text after the marker.
    """
    patient_id = 'N/A'
    view = 'N/A'
    modality = 'N/A'

    # Split on whichever scan-type marker occurs FIRST in the name. Always
    # preferring 'MR' would, for a name such as '<id>CT_..._MR...', put the
    # scan type and part of the description into the patient id.
    hits = [(raw_file_name.find(marker), marker) for marker in ('MR', 'CT')]
    hits = [hit for hit in hits if hit[0] != -1]
    if not hits:
        return patient_id, view, modality
    marker_pos, marker = min(hits)

    prefix = raw_file_name[:marker_pos]
    if re.match(r'(\d+)(\^)(\d+)(\^)(\d+)(\_)(\d+)', prefix):
        patient_id = prefix
        # keywords are lower case, so the description is lower-cased before matching
        description = raw_file_name[marker_pos + len(marker):].lower()
        view = find_view(description)
        modality = find_modality(description)
    return patient_id, view, modality

In [9]:
csv_path = '/home/romains/rtog_patients.csv'

# Save one row per scanned folder to a CSV on disk, so the classification can
# be exported to a spreadsheet afterwards and the results checked.
# newline='' is what the csv module requires for files handed to a writer;
# without it, extra blank rows can appear on platforms that translate '\n'.
with open(csv_path, 'w', newline='') as csvfile:
    fieldnames = ['study', 'raw_name', 'curated_id', 'curated_view', 'curated_modality', 'nb_dicoms']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for folder_path in all_files_path:
        # the last two path components are the study directory and the raw folder name
        study, raw_file = folder_path.strip().split('/')[-2:]
        all_dicoms = [x for x in os.listdir(folder_path) if x.endswith('.dcm')]
        nb_dicoms = len(all_dicoms)
        # every folder kept by the scan is expected to hold at least one .dcm file
        assert nb_dicoms != 0, 'no .dcm file found in ' + folder_path
        curated_id, curated_view, curated_modality = classify_patient_file(raw_file)
        writer.writerow({'study': study, 'raw_name': raw_file, 'curated_id': curated_id,
                         'curated_view': curated_view, 'curated_modality': curated_modality, 'nb_dicoms': nb_dicoms})

In [10]:
# load csv to run some analyses on the number of patients for each view/modality
df = pd.read_csv(csv_path)

# Drop any row where there is a NaN (missing id, view or modality). The 'N/A'
# placeholders written to the CSV are expected to be read back as NaN by
# read_csv's default missing-value handling.
df2 = df.dropna(axis=0, how='any')

# Drop any row for which the view is not axial. This filters on the view
# column directly instead of replacing values with `pd.np.nan` (the `pd.np`
# alias is deprecated since pandas 1.0 and removed in pandas 2.0).
# .copy() makes df3 an independent frame rather than a slice of df2.
df3 = df2[~df2['curated_view'].isin(['sagital', 'coronal'])].copy()

In [71]:
# one group per (study, patient id) pair
dfgroup = df3.groupby(by=['study', 'curated_id'])

In [14]:
# Pivot table: one row per (study, patient id), one column per detected
# modality; each cell holds the number of dicoms of the folder kept for that
# modality (0 when the patient has no folder for it).
# TODO: figure out which folder to keep in case one patient has several folders for the same view and modality
# (for now the first folder encountered is kept)
dfpivot = (
    df3
    .drop_duplicates(subset=['study', 'curated_id', 'curated_modality'], keep='first')
    .set_index(['study', 'curated_id', 'curated_modality'])['nb_dicoms']
    .unstack(fill_value=0)
)

dfpivot


Unnamed: 0_level_0,curated_modality,blade,dwi,flair,gre,mprage,multiple modalities detected,t1 pre or post?,t1post,t1pre,t2
study,curated_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0101-01__Studies,727^1027^825_727_,0,0,26,0,0,0,0,26,26,0
0101-01__Studies,874^8061^825_874_,0,0,11,0,0,0,0,50,24,25
1899-12__Studies,204^7870^825_204_,0,43,22,0,0,22,642,128,101,0
1899-12__Studies,215^603^825_215_,0,24,10,0,0,0,30,0,0,22
1899-12__Studies,295^2403^825_295_,0,52,26,0,0,26,660,136,136,0
1899-12__Studies,935^2403^825_935_,0,0,24,0,0,0,0,132,0,0
2004-03__Studies,125^126^825_125_,0,0,25,0,0,25,25,25,0,25
2008-02__Studies,836^3459^825_836_,0,0,20,0,0,0,20,0,0,20
2009-04__Studies,203^2503^825_203_,0,0,23,0,0,0,0,0,0,0
2009-06__Studies,10^7017^825_10_,0,0,0,0,0,76,0,76,76,76


In [72]:
# For every (study, patient id) group, collect the list of modalities found
# for that patient (one entry per row of the group).
all_groups = dfgroup.groups
patients = [
    dfgroup.get_group(group_key)['curated_modality'].tolist()
    for group_key in all_groups
]

In [73]:
# Number of (study, patient id) groups left after filtering, i.e. patients
# with at least one axial folder for which a modality label was assigned
# (the label may be 't1 pre or post?' or 'multiple modalities detected').
print(len(patients))

400


In [77]:
# For each modality label present in df3, count the patients having at least
# one folder with that label.
new_modalities = df3.curated_modality.unique()
patients_per_modality = {
    mod: sum(1 for pat in patients if mod in pat)
    for mod in new_modalities
}
print(patients_per_modality)

{'t1 pre or post?': 314, 't2': 305, 'mprage': 4, 't1post': 183, 'multiple modalities detected': 86, 'dwi': 91, 'blade': 1, 'flair': 332, 't1pre': 47, 'gre': 54}


In [11]:
# patients having flair together with t1post and/or gre
sum(1 for pat in patients if 'flair' in pat and ('t1post' in pat or 'gre' in pat))

177

- 155 patients with flair and t1post
- 177 patients with flair + t1post or flair + gre

- find patients where the number of slices is the same across modalities
- check for slice thickness
- need t1post and flair
- take the earliest date in case there are several