In [1]:
import os

import re
import csv

import numpy as np
import pandas as pd

import dicom
from collections import Counter

import shutil

from data_utils import arr_to_im_path
from classify_utils import collect_info_patient_folder

In [2]:
data_dir = '/labs/gevaertlab/data/radiology/StanfordGBM/2018-03-05/GBM'

# get the patient folders to classify
all_files_path = []
for patient in os.listdir(data_dir): 
    patient_path = os.path.join(data_dir, patient)
    if os.path.isdir(patient_path):
        folders = os.listdir(patient_path)
        folders = [os.path.join(patient_path, fold) for fold in folders]
        folders = [fold for fold in folders if os.path.isdir(fold)]
        patient_path = os.path.join(patient_path, folders[0])
        folders = folders = os.listdir(patient_path)
        folders = [os.path.join(patient_path, fold) for fold in folders]
        folders = [fold for fold in folders if os.path.isdir(fold)]
        all_files_path.extend(folders)

In [3]:
# check that all dicoms of the same folder have the same metadata
count = 0
weird = []
metadatas = []

for k, ex in enumerate(all_files_path):
    if k % 300 == 0:
        print('file %d out of %d'%(k + 1, len(all_files_path)))
    ex_dicoms = os.listdir(ex)
    ex_dicoms = [dic for dic in ex_dicoms if dic[-3:] == 'dcm']
    ex_dicoms = [os.path.join(ex, dic) for dic in ex_dicoms]
    metadata = set()
    for ex_dic in ex_dicoms:
        pat = dicom.read_file(ex_dic)
        try:
            patient_id = str(pat.PatientName)
            date = str(pat.AcquisitionDate)
            thickness = str(pat.SliceThickness)
            rows = int(pat.Rows)
            columns = int(pat.Columns)
            description = str(pat.SeriesDescription)
            all_info = (patient_id, date, thickness, rows, columns, description)
            metadata.add(all_info)
        except AttributeError:
            continue
    if len(metadata) != 1:
        weird.append(ex)
        count += 1
        metadatas.append(metadata)

file 1 out of 2315
file 301 out of 2315
file 601 out of 2315
file 901 out of 2315
file 1201 out of 2315
file 1501 out of 2315
file 1801 out of 2315
file 2101 out of 2315


In [5]:
weird

['/labs/gevaertlab/data/radiology/StanfordGBM/2018-03-05/GBM/Patient-02 - Patient-02/Mr Stereotactic On-Call/RAPID Summary - 1161',
 '/labs/gevaertlab/data/radiology/StanfordGBM/2018-03-05/GBM/Patient-02 - Patient-02/Mr Stereotactic On-Call/RAPID Summary - 1261',
 '/labs/gevaertlab/data/radiology/StanfordGBM/2018-03-05/GBM/Patient-03 - Patient-03/Mr Brain Reference Only/PACS nomination form - 801',
 '/labs/gevaertlab/data/radiology/StanfordGBM/2018-03-05/GBM/Patient-03 - Patient-03/Mr Brain Reference Only/Study acquired outside hospital - 1',
 '/labs/gevaertlab/data/radiology/StanfordGBM/2018-03-05/GBM/Patient-04 - Patient-04/Mr Brain Wandwo Contrast/RAPID Summary - 1161',
 '/labs/gevaertlab/data/radiology/StanfordGBM/2018-03-05/GBM/Patient-04 - Patient-04/Mr Brain Wandwo Contrast/SCREENSAVE - 1000',
 '/labs/gevaertlab/data/radiology/StanfordGBM/2018-03-05/GBM/Patient-05 - Patient-05/Mr Brain Reference Only/MRI BRAIN W CONTRAST - 999999',
 '/labs/gevaertlab/data/radiology/StanfordGBM/2

In [3]:
csv_path = '/home/romains/stanford_gbm_v3.csv'

In [4]:
# save csv on disk to export to speadsheet after and check results
with open(csv_path, 'w') as csvfile:
    fieldnames = ['id', 'description', 'view', 'modality',\
                  'nb_dicoms', 'date', 'thickness', 'rows', 'columns', 'raw']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for k, folder in enumerate(all_files_path):
        if k % 300 == 0:
            print('file %d out of %d'%(k + 1, len(all_files_path)))
        try:
            patient_id, view, modality, date, thickness, rows,\
                columns, nb_dicoms, description = collect_info_patient_folder(folder)
        except:
            continue
        writer.writerow({'id': patient_id, 'description': description,\
                         'view': view, 'modality': modality, 'nb_dicoms': nb_dicoms,\
                         'date': date, 'thickness': thickness, 'rows': rows,\
                         'columns': columns, 'raw': folder})

file 1 out of 2315
file 301 out of 2315
file 601 out of 2315
file 901 out of 2315
file 1201 out of 2315
file 1501 out of 2315
file 1801 out of 2315
file 2101 out of 2315


In [5]:
# load csv to run some analyses on the number of patients for each view/modality
df = pd.read_csv(csv_path)

# drop any row for which view is not axial
df3 = df.replace('sagital', pd.np.nan)
df3 = df3.replace('coronal', pd.np.nan)
df3 = df3.dropna(axis=0, how='any')

In [6]:
df.modality.unique()

array([nan, 'dwi', 'flair', 'gre', 't1post', 't1pre', 't2',
       'multiple modalities detected', 't1 pre or post?'], dtype=object)

In [7]:
df.shape

(2200, 10)

In [11]:
dfpivot = df3.loc[:,['id', 'modality', 'nb_dicoms', 'date']]
# TODO: figure out which folder to keep in case one patient has several folders for the same view and modality
dfpivot = dfpivot.sort_values(by=['date'])
dfpivot.drop_duplicates(subset=['id', 'modality'], keep='first', inplace=True)
dfpivot = dfpivot.loc[:, ['id', 'modality', 'nb_dicoms']]
dfpivot = dfpivot.set_index(['id', 'modality'])
dfpivot = dfpivot.unstack(fill_value=0)
dfpivot = dfpivot.loc[:, 'nb_dicoms']

dfpivot

modality,dwi,flair,gre,multiple modalities detected,t1 pre or post?,t1post,t1pre,t2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Patient-01,60,52,30,0,0,30,30,30
Patient-02,64,52,31,0,0,31,13,32
Patient-03,0,24,0,24,24,24,0,24
Patient-04,62,52,32,0,0,32,32,32
Patient-05,0,36,0,36,36,32,32,0
Patient-06,176,68,0,0,68,0,0,68
Patient-07,48,49,24,0,24,24,0,24
Patient-08,26,26,0,26,0,0,0,26
Patient-09,54,0,0,27,27,27,0,0
Patient-10,40,43,0,0,43,43,0,43


In [15]:
dfpivot.loc[(dfpivot.flair > 0)].shape
dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0)].shape
dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.flair == dfpivot.t1post)].shape
# dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.t1pre > 0) & (dfpivot.t2 > 0)].shape
#Counter(dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.flair == dfpivot.t1post)].flair)
# dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) | (dfpivot.flair > 0) & (dfpivot["t1 pre or post?"] > 0)]
# dfpivot.loc[dfpivot.flair > 0]

(29, 8)

In [10]:
# list of descriptions for which we can't classify between T1 post and T1 pre or mutliple modalities
df4 = df3[df3.modality == "t1 pre or post?"]
#df4 = df3[df3.modality == "multiple modalities detected"]
df4.drop_duplicates(subset=['description'], keep='first', inplace=True)
df4.loc[:, ['description', 'raw']]
df4.description.tolist()
#list(np.unique(np.array(sorted(list(df3[df3.modality == "t1 pre or post?"].description)))))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


['AX T1',
 'AX T1 SE',
 'Ax T1',
 'AXIAL T1 SE',
 'AX T1 FLASH',
 'AX T1 FFE',
 'AX T1 FSE',
 'AX T1 Variable flip SE',
 'AX T1 SE STEREO WB+C',
 'K:AX T1W',
 'Ax T1 Variable flip SE',
 'AX T1 STEREO',
 'H2:AX T1',
 'AX T1 STEREO+C']