In [24]:
import os

import re
import csv

import numpy as np
import pandas as pd

import dicom
from collections import Counter

In [10]:
data_dir = '/labs/gevaertlab/users/hackhack/RTOG/scratch'

# get the raw names of the folders to classify
all_files = []
all_files_path = []
for study in os.listdir(data_dir):  
    if 'Studies' in study:
        study_path = os.path.join(data_dir, study)
        for filename in os.listdir(study_path):
            filename_path = os.path.join(study_path, filename)
            # exclude folders with no dicoms
            if len(os.listdir(filename_path)) > 0:
                full_name = study + '/' + filename
                all_files.append(full_name)
                all_files_path.append(filename_path)

In [11]:
types_of_scan = ['MR', 'CT', 'PR']
AXIAL_VIEWS = ['axl', 'axial', 'ax']
SAGITAL_VIEWS = ['sag', 'sagital']
CORONAL_VIEWS = ['cor', 'coronal']
MODALITIES = ['t1', 't2', 'flair', 'dwi', 'gre', 'blade', 'propeller', 'lava', 'fame', 'mprage']
# mprage instead of rage, otherwise match on 'average'

In [12]:
# helper functions

def find_view(description):
    description = description.lower()
    if any(x in description for x in AXIAL_VIEWS):
        view = 'axial'
    elif any(x in description for x in SAGITAL_VIEWS):
        view = 'sagital'
    elif any(x in description for x in CORONAL_VIEWS):
        view = 'coronal'
    else:
        view = 'N/A'
    return view


def find_modality(description):
    description = description.lower()
    modalities = [x for x in MODALITIES if x in description]
    if len(modalities) == 0:
        modality = 'N/A'
    elif len(modalities) == 1:
        if 't1' in modalities:
            if 'pre' in description:
                modality = 't1pre'
            elif any(x in description for x in ['post', 't1c', '+c', '+ c', 'c+', 'con', 'gd', 'gad']):
                modality = 't1post'
            else:
                modality = 't1 pre or post?'
        else:
            modality = modalities[0]
    else:
        if set(modalities) == set(['flair', 't2']):
            modality = 'flair'
        else:
            modality = 'multiple modalities detected'

    return modality


def collect_info_patient_folder(patient_folder):  
    dicoms = os.listdir(patient_folder)
    dicoms = [dic for dic in dicoms if dic[-4:] == '.dcm']
    sample = dicoms[0]
    sample = os.path.join(patient_folder, sample)
    sample = dicom.read_file(sample)

    # collect information
    patient_id = str(sample.PatientName)
    date = str(sample.AcquisitionDate)
    thickness = str(sample.SliceThickness)
    rows = int(sample.Rows)
    columns = int(sample.Columns)
    nb_dicoms = len(dicoms)
    description = str(sample.SeriesDescription)
    view = find_view(description)
    modality = find_modality(description)

    return patient_id, view, modality, date, thickness, rows, columns, nb_dicoms, description

In [13]:
csv_path = '/home/romains/rtog_patients_v5.csv'

In [14]:
# save csv on disk to export to speadsheet after and check results
with open(csv_path, 'w') as csvfile:
    fieldnames = ['study', 'id', 'description', 'view', 'modality',\
                  'nb_dicoms', 'date', 'thickness', 'rows', 'columns', 'raw']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for k, raw_file in enumerate(all_files_path):
        if k % 500 == 0:
            print('file %d out of %d'%(k + 1, len(all_files_path)))
        study = raw_file.strip().split('/')[-2]
        try:
            patient_id, view, modality, date, thickness, rows,\
                columns, nb_dicoms, description = collect_info_patient_folder(raw_file)
        except:
            continue
        patient_id = os.path.join(study, patient_id)
        writer.writerow({'study': study, 'id': patient_id, 'description': description,\
                         'view': view, 'modality': modality, 'nb_dicoms': nb_dicoms,\
                         'date': date, 'thickness': thickness, 'rows': rows,\
                         'columns': columns, 'raw': raw_file})

file 1 out of 5547
file 501 out of 5547
file 1001 out of 5547
file 1501 out of 5547
file 2001 out of 5547
file 2501 out of 5547
file 3001 out of 5547
file 3501 out of 5547
file 4001 out of 5547
file 4501 out of 5547
file 5001 out of 5547
file 5501 out of 5547


In [15]:
# load csv to run some analyses on the number of patients for each view/modality
df = pd.read_csv(csv_path)

# drop any row where there is a Nan (missing id, view or modality)
df2 = df.dropna(axis=0, how='any') 

# drop any row for which view is not axial
df3 = df2.replace('sagital', pd.np.nan)
df3 = df3.replace('coronal', pd.np.nan)
df3 = df3.dropna(axis=0, how='any')

In [21]:
df4 = df3[df3.modality == "t1 pre or post?"]
df4.drop_duplicates(subset=['description'], keep='first', inplace=True)
df4.loc[:, ['description', 'raw']]
list(df4.description)
#list(np.unique(np.array(sorted(list(df3[df3.modality == "t1 pre or post?"].description)))))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


['AX T1 DYNAMIC (DCE)',
 'AX T1 DYNAMIC TDCE',
 'AXIAL T1',
 'T1_SE_AX_FIL_1',
 'T1_SE_AX_FLOWCOMP_FIL_1',
 'Ax T1',
 'Ax T1 FSE',
 'AX T1 SE 512',
 '+Ax T1',
 'Ax T1 SE',
 '* Brain    *AX T1/SE',
 'T1 AX  SE',
 'T1 AX SE',
 'AX T1',
 'ROUTINE/T1-AX',
 'AXL T1',
 'T1 AXIAL',
 'PosDisp: [10] se t1 axial',
 'se t1 axial',
 'T1 AX 2mm',
 'T1  AXIAL',
 'AX FSE T1 HEAD',
 'eT1 AX CLEAR',
 'T1 AX',
 't1_ax_se_FIL',
 't1_se_axial PG',
 't1_se_axial',
 'AX T1 A',
 'AX T1 B',
 'AX T1 C',
 'AX T1 D',
 'B AX 3D T1',
 'O-Ax T1 SE',
 'T1 AXIAL SE',
 'AX SPGR T1',
 'AX T1 SE',
 'HD T1 Ax',
 't1_se_ax',
 't1_ax_flow comp',
 'T1AXBRAINLAB',
 'SE_t1 ax',
 'HD AX T1',
 'Axial T1 FSE',
 'Axial 3D T1',
 'Axial T1',
 'HD T1 Ax FOR SPECTROSCOPY',
 'AXIAL  SE T1',
 '1. AX SE T1',
 '2. AX SE T1 FS W',
 'Ax T1 FSPGR 3D*',
 'T1 Axials',
 'T1W_SE AX',
 'AxT1',
 '+AX T1 SE',
 't1 se axial',
 'T1 AX C-',
 'T1 SE AX',
 'AX T1 PG W/MTC',
 '1. AX TSE T1',
 'AX  T1',
 'T1 SE Axial',
 'HD T1 Ax THIN',
 'H2:AX T1 IR',
 

In [16]:
dfpivot = df3.loc[:,['id', 'modality', 'nb_dicoms', 'date']]
# TODO: figure out which folder to keep in case one patient has several folders for the same view and modality
dfpivot = dfpivot.sort_values(by=['date'])
dfpivot.drop_duplicates(subset=['id', 'modality'], keep='first', inplace=True)
dfpivot = dfpivot.loc[:, ['id', 'modality', 'nb_dicoms']]
dfpivot = dfpivot.set_index(['id', 'modality'])
dfpivot = dfpivot.unstack(fill_value=0)
dfpivot = dfpivot.loc[:, 'nb_dicoms']

dfpivot

modality,blade,dwi,flair,gre,mprage,multiple modalities detected,t1 pre or post?,t1post,t1pre,t2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0101-01__Studies/727^1027^825^^,0,0,25,0,0,0,0,25,25,0
0101-01__Studies/874^8061^825^^,0,0,11,0,0,0,0,50,24,25
1899-12__Studies/204^7870^825^^,0,43,22,0,0,22,642,128,101,0
1899-12__Studies/295^2403^825^^,0,52,26,0,0,26,660,136,136,0
1899-12__Studies/935^2403^825^^,0,0,0,0,0,0,0,132,0,0
2004-03__Studies/125^126^825^^,0,0,25,0,0,25,25,25,0,25
2008-02__Studies/836^3459^825^^,0,0,20,0,0,0,20,0,0,20
2009-04__Studies/203^2503^825^^,0,0,23,0,0,0,0,0,0,0
2009-06__Studies/10^7017^825^^,0,0,0,0,0,76,0,76,76,76
2009-06__Studies/16^7758^825^^,0,0,30,0,0,0,22,22,0,22


In [17]:
dfcomp = df3.loc[:, ['id', 'thickness', 'rows', 'columns']]
dfcomp = dfcomp.drop_duplicates()
dfcomp = dfcomp.set_index('id')

dfpivot.join(dfcomp)

Unnamed: 0_level_0,blade,dwi,flair,gre,mprage,multiple modalities detected,t1 pre or post?,t1post,t1pre,t2,thickness,rows,columns
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0101-01__Studies/727^1027^825^^,0,0,25,0,0,0,0,25,25,0,5.0,512,512
0101-01__Studies/874^8061^825^^,0,0,11,0,0,0,0,50,24,25,5.0,512,512
1899-12__Studies/204^7870^825^^,0,43,22,0,0,22,642,128,101,0,5.0,256,256
1899-12__Studies/204^7870^825^^,0,43,22,0,0,22,642,128,101,0,5.0,512,512
1899-12__Studies/204^7870^825^^,0,43,22,0,0,22,642,128,101,0,1.5,256,256
1899-12__Studies/295^2403^825^^,0,52,26,0,0,26,660,136,136,0,5.0,256,256
1899-12__Studies/295^2403^825^^,0,52,26,0,0,26,660,136,136,0,5.0,512,512
1899-12__Studies/295^2403^825^^,0,52,26,0,0,26,660,136,136,0,1.5,256,256
1899-12__Studies/935^2403^825^^,0,0,0,0,0,0,0,132,0,0,1.5,256,256
1899-12__Studies/935^2403^825^^,0,0,0,0,0,0,0,132,0,0,5.0,512,512


In [4]:
# check that the 2 modalities for one patient have the same date
dfdate = df3.loc[:,['id', 'modality', 'date']]
# TODO: figure out which folder to keep in case one patient has several folders for the same view and modality
dfdate = dfdate.sort_values(by=['date'])
dfdate.drop_duplicates(subset=['id', 'modality'], keep='first', inplace=True)
dfdate = dfdate[dfdate.modality.isin(['flair', 't1post'])]
dfdate = dfdate.groupby('id').date.nunique()
dfdate
Counter(list(dfdate))

Counter({1: 358, 2: 5})

out of the 363 patients that have either t1 post, either flair or both:
- 358 have the same date for one or two modalities
- 5 patients have different dates for the 2 modalities

In [25]:
dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0)]
dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.flair == dfpivot.t1post)]
Counter(dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.flair == dfpivot.t1post)].flair)
# dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) | (dfpivot.flair > 0) & (dfpivot["t1 pre or post?"] > 0)]
# dfpivot.loc[dfpivot.flair > 0]

Counter({1: 2,
         20: 6,
         21: 12,
         22: 7,
         23: 27,
         24: 32,
         25: 19,
         26: 30,
         27: 12,
         28: 18,
         29: 6,
         30: 12,
         31: 2,
         32: 4,
         36: 1,
         40: 3})

- 265 patients with flair and t1post
- 193 patients with flair + t1post AND same number of slices for the 2 modalities. 191 of these patients have between 20 and 40 slices