In [2]:
import os

import re
import csv

import numpy as np
import pandas as pd

import dicom
from collections import Counter

import shutil

from data_utils import arr_to_im_path
from classify_utils import collect_info_patient_folder

In [3]:
data_dir = '/labs/gevaertlab/users/hackhack/RTOG/scratch'

# get the raw names of the folders to classify
all_files = []
all_files_path = []
for study in os.listdir(data_dir):  
    if 'Studies' in study:
        study_path = os.path.join(data_dir, study)
        for filename in os.listdir(study_path):
            filename_path = os.path.join(study_path, filename)
            # exclude folders with no dicoms
            if len(os.listdir(filename_path)) > 0:
                full_name = study + '/' + filename
                all_files.append(full_name)
                all_files_path.append(filename_path)

In [4]:
ex = os.listdir(all_files_path[0])[0]
sample = dicom.read_file(os.path.join(all_files_path[0], ex))
print(sample)

(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'OTHER']
(0008, 0016) SOP Class UID                       UI: MR Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.840.113619.2.244.3596.13650034.1926.1291036028.100
(0008, 0020) Study Date                          DA: '010101'
(0008, 0021) Series Date                         DA: '20101130'
(0008, 0022) Acquisition Date                    DA: '20101130'
(0008, 0023) Content Date                        DA: '20101130'
(0008, 0030) Study Time                          TM: '063719'
(0008, 0031) Series Time                         TM: '070200'
(0008, 0032) Acquisition Time                    TM: '070200'
(0008, 0033) Content Time                        TM: '070200'
(0008, 0050) Accession Number                    SH: '0'
(0008, 0060) Modality                            CS: 'MR'
(0008, 0070) Manufacturer                        LO: 

In [8]:
sample.SeriesDescription

'3 PLANE LOC'

In [5]:
csv_path = '/home/romains/rtog/rtog_patients_v7.csv'

In [6]:
# save csv on disk to export to speadsheet after and check results
with open(csv_path, 'w') as csvfile:
    fieldnames = ['study', 'id', 'description', 'view', 'modality',\
                  'nb_dicoms', 'date', 'thickness', 'rows', 'columns', 'raw']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for k, raw_file in enumerate(all_files_path):
        if k % 500 == 0:
            print('file %d out of %d'%(k + 1, len(all_files_path)))
        study = raw_file.strip().split('/')[-2]
        try:
            patient_id, view, modality, date, thickness, rows,\
                columns, nb_dicoms, description = collect_info_patient_folder(raw_file)
        except:
            continue
        patient_id = os.path.join(study, patient_id)
        writer.writerow({'study': study, 'id': patient_id, 'description': description,\
                         'view': view, 'modality': modality, 'nb_dicoms': nb_dicoms,\
                         'date': date, 'thickness': thickness, 'rows': rows,\
                         'columns': columns, 'raw': raw_file})

FileNotFoundError: [Errno 2] No such file or directory: '/home/romains/rtog/rtog_patients_v7.csv'

In [5]:
# load csv to run some analyses on the number of patients for each view/modality
df = pd.read_csv(csv_path)

# drop any row where there is a Nan (missing id, view or modality)
df2 = df.dropna(axis=0, how='any') 

# drop any row for which view is not axial
df3 = df2.replace('sagital', pd.np.nan)
df3 = df3.replace('coronal', pd.np.nan)
df3 = df3.dropna(axis=0, how='any')

In [4]:
# list of descriptions for which we can't classify between T1 post and T1 pre
df4 = df3[df3.modality == "t1 pre or post?"]
df4.drop_duplicates(subset=['description'], keep='first', inplace=True)
df4.loc[:, ['description', 'raw']]
list(df4.description)
#list(np.unique(np.array(sorted(list(df3[df3.modality == "t1 pre or post?"].description)))))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


['AX T1 DYNAMIC (DCE)',
 'AX T1 DYNAMIC TDCE',
 'AXIAL T1',
 'T1_SE_AX_FIL_1',
 'T1_SE_AX_FLOWCOMP_FIL_1',
 'Ax T1',
 'Ax T1 FSE',
 'AX T1 SE 512',
 '+Ax T1',
 'Ax T1 SE',
 '* Brain    *AX T1/SE',
 'T1 AX  SE',
 'T1 AX SE',
 'AX T1',
 'ROUTINE/T1-AX',
 'AXL T1',
 'T1 AXIAL',
 'PosDisp: [10] se t1 axial',
 'se t1 axial',
 'T1 AX 2mm',
 'T1  AXIAL',
 'AX FSE T1 HEAD',
 'eT1 AX CLEAR',
 'T1 AX',
 't1_ax_se_FIL',
 't1_se_axial PG',
 't1_se_axial',
 'AX T1 A',
 'AX T1 B',
 'AX T1 C',
 'AX T1 D',
 'B AX 3D T1',
 'O-Ax T1 SE',
 'T1 AXIAL SE',
 'AX SPGR T1',
 'AX T1 SE',
 'HD T1 Ax',
 't1_se_ax',
 't1_ax_flow comp',
 'T1AXBRAINLAB',
 'SE_t1 ax',
 'HD AX T1',
 'Axial T1 FSE',
 'Axial 3D T1',
 'Axial T1',
 'HD T1 Ax FOR SPECTROSCOPY',
 'AXIAL  SE T1',
 '1. AX SE T1',
 '2. AX SE T1 FS W',
 'Ax T1 FSPGR 3D*',
 'T1 Axials',
 'T1W_SE AX',
 'AxT1',
 '+AX T1 SE',
 't1 se axial',
 'T1 AX C-',
 'T1 SE AX',
 'AX T1 PG W/MTC',
 '1. AX TSE T1',
 'AX  T1',
 'T1 SE Axial',
 'HD T1 Ax THIN',
 'H2:AX T1 IR',
 

In [6]:
dfpivot = df3.loc[:,['id', 'modality', 'nb_dicoms', 'date']]
# TODO: figure out which folder to keep in case one patient has several folders for the same view and modality
dfpivot = dfpivot.sort_values(by=['date'])
dfpivot.drop_duplicates(subset=['id', 'modality'], keep='first', inplace=True)
dfpivot = dfpivot.loc[:, ['id', 'modality', 'nb_dicoms']]
dfpivot = dfpivot.set_index(['id', 'modality'])
dfpivot = dfpivot.unstack(fill_value=0)
dfpivot = dfpivot.loc[:, 'nb_dicoms']

dfpivot

modality,blade,dwi,flair,gre,mprage,multiple modalities detected,t1 pre or post?,t1post,t1pre,t2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0101-01__Studies/727^1027^825^^,0,0,25,0,0,0,0,25,25,0
0101-01__Studies/874^8061^825^^,0,0,11,0,0,0,0,50,24,25
1899-12__Studies/204^7870^825^^,0,43,22,0,0,22,642,128,101,0
1899-12__Studies/295^2403^825^^,0,52,26,0,0,26,660,136,136,0
1899-12__Studies/935^2403^825^^,0,0,0,0,0,0,0,132,0,0
2004-03__Studies/125^126^825^^,0,0,25,0,0,25,25,25,0,25
2008-02__Studies/836^3459^825^^,0,0,20,0,0,0,20,0,0,20
2009-04__Studies/203^2503^825^^,0,0,23,0,0,0,0,0,0,0
2009-06__Studies/10^7017^825^^,0,0,0,0,0,76,0,76,76,76
2009-06__Studies/16^7758^825^^,0,0,30,0,0,0,22,22,0,22


In [17]:
dfcomp = df3.loc[:, ['id', 'thickness', 'rows', 'columns']]
dfcomp = dfcomp.drop_duplicates()
dfcomp = dfcomp.set_index('id')

dfpivot.join(dfcomp)

Unnamed: 0_level_0,blade,dwi,flair,gre,mprage,multiple modalities detected,t1 pre or post?,t1post,t1pre,t2,thickness,rows,columns
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0101-01__Studies/727^1027^825^^,0,0,25,0,0,0,0,25,25,0,5.0,512,512
0101-01__Studies/874^8061^825^^,0,0,11,0,0,0,0,50,24,25,5.0,512,512
1899-12__Studies/204^7870^825^^,0,43,22,0,0,22,642,128,101,0,5.0,256,256
1899-12__Studies/204^7870^825^^,0,43,22,0,0,22,642,128,101,0,5.0,512,512
1899-12__Studies/204^7870^825^^,0,43,22,0,0,22,642,128,101,0,1.5,256,256
1899-12__Studies/295^2403^825^^,0,52,26,0,0,26,660,136,136,0,5.0,256,256
1899-12__Studies/295^2403^825^^,0,52,26,0,0,26,660,136,136,0,5.0,512,512
1899-12__Studies/295^2403^825^^,0,52,26,0,0,26,660,136,136,0,1.5,256,256
1899-12__Studies/935^2403^825^^,0,0,0,0,0,0,0,132,0,0,1.5,256,256
1899-12__Studies/935^2403^825^^,0,0,0,0,0,0,0,132,0,0,5.0,512,512


In [4]:
# check that the 2 modalities for one patient have the same date
dfdate = df3.loc[:,['id', 'modality', 'date']]
# TODO: figure out which folder to keep in case one patient has several folders for the same view and modality
dfdate = dfdate.sort_values(by=['date'])
dfdate.drop_duplicates(subset=['id', 'modality'], keep='first', inplace=True)
dfdate = dfdate[dfdate.modality.isin(['flair', 't1post'])]
dfdate = dfdate.groupby('id').date.nunique()
dfdate
Counter(list(dfdate))

Counter({1: 358, 2: 5})

out of the 363 patients that have either t1 post, either flair or both:
- 358 have the same date for one or two modalities
- 5 patients have different dates for the 2 modalities

In [6]:
dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0)]
dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.flair == dfpivot.t1post)]
Counter(dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.flair == dfpivot.t1post)].flair)
# dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) | (dfpivot.flair > 0) & (dfpivot["t1 pre or post?"] > 0)]
# dfpivot.loc[dfpivot.flair > 0]

Counter({1: 2,
         20: 6,
         21: 12,
         22: 7,
         23: 27,
         24: 32,
         25: 19,
         26: 30,
         27: 12,
         28: 18,
         29: 6,
         30: 12,
         31: 2,
         32: 4,
         36: 1,
         40: 3})

- 265 patients with flair and t1post
- 193 patients with flair + t1post AND same number of slices for the 2 modalities. 191 of these patients have between 20 and 40 slices

## MOVE FOLDERS AND CREATE PATIENTS

In [82]:
# helper functionss
def get_modalities_for_patient(pat, dataframe):
    t1post = dataframe[(dataframe.id == pat) & (dataframe.modality == 't1post')]\
                      .sort_values(by=['date'])\
                      .drop_duplicates(subset=['id', 'modality'], keep='first', inplace=False)\
                      .raw.values[0]
    flair = dataframe[(dataframe.id == pat) & (dataframe.modality == 'flair')]\
                      .sort_values(by=['date'])\
                      .drop_duplicates(subset=['id', 'modality'], keep='first', inplace=False)\
                      .raw.values[0]
    return t1post, flair


def create_modality_file(modality_path):
    dicoms = os.listdir(modality_path)
    dicoms = [dic.strip() for dic in dicoms if dic.strip()[-4:] == '.dcm']
    dicoms = [os.path.join(modality_path, dic) for dic in dicoms]
    nb_dicoms = len(dicoms)
    
    modality = None
    for dic in dicoms:
        arr = dicom.read_file(dic).pixel_array
        stack_id = int(dicom.read_file(dic).InstanceNumber) - 1
        if modality is None:
            shape = [nb_dicoms] + list(arr.shape)
            modality = np.zeros(shape)
        modality[stack_id, :, :] = arr
    
    return modality
    

In [33]:
# get path of t1post anf flair modalities for each patient
selection = dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.flair == dfpivot.t1post)]
patients_to_select = list(selection.index)
pat_with_modalities = {}
for pat in patients_to_select:
    pat_name = pat.strip()[:-2]
    pat_name = '_'.join(pat_name.split('/'))
    t1post, flair = get_modalities_for_patient(pat, df3)
    pat_with_modalities[pat_name] = {'t1post': t1post, 'flair': flair}

In [86]:
# copy and create modality files
out_path = '/local-scratch/romains_scratch/rtog/'

if os.path.exists(out_path):
    shutil.rmtree(out_path)
os.makedirs(out_path)

for pat_name in pat_with_modalities:
    pat_path = os.path.join(out_path, pat_name)
    os.makedirs(pat_path)
    pat = pat_with_modalities[pat_name]
    t1post = create_modality_file(pat['t1post'])
    flair = create_modality_file(pat['flair'])
    out_t1post = os.path.join(pat_path, 't1c.nii')
    out_flair = os.path.join(pat_path, 'flair.nii')
    arr_to_im_path(t1post, out_t1post)
    arr_to_im_path(flair, out_flair)