In [8]:
import os

import re
import csv

import numpy as np
import pandas as pd

import dicom
from collections import Counter

import shutil

from data_utils import arr_to_im_path

In [9]:
# helper functions - same as in classify_rtog
# TODO: put these functions in separate file

types_of_scan = ['MR', 'CT', 'PR']
AXIAL_VIEWS = ['axl', 'axial', 'ax']
SAGITAL_VIEWS = ['sag', 'sagital']
CORONAL_VIEWS = ['cor', 'coronal']
MODALITIES = ['t1', 't2', 'flair', 'dwi', 'gre', 'blade', 'propeller', 'lava', 'fame', 'mprage']
# mprage instead of rage, otherwise match on 'average'


def find_view(description):
    description = description.lower()
    if any(x in description for x in AXIAL_VIEWS):
        view = 'axial'
    elif any(x in description for x in SAGITAL_VIEWS):
        view = 'sagital'
    elif any(x in description for x in CORONAL_VIEWS):
        view = 'coronal'
    else:
        view = 'N/A'
    return view


def find_modality(description):
    description = description.lower()
    modalities = [x for x in MODALITIES if x in description]
    if len(modalities) == 0:
        modality = 'N/A'
    elif len(modalities) == 1:
        if 't1' in modalities:
            if 'pre' in description:
                modality = 't1pre'
            elif any(x in description for x in ['post', 't1c', '+c', '+ c', 'c+', 'con', 'gd', 'gad']):
                modality = 't1post'
            else:
                modality = 't1 pre or post?'
        else:
            modality = modalities[0]
    else:
        if set(modalities) == set(['flair', 't2']):
            modality = 'flair'
        else:
            modality = 'multiple modalities detected'

    return modality


def collect_info_patient_folder(patient_folder):  
    dicoms = os.listdir(patient_folder)
    dicoms = [dic for dic in dicoms if dic[-4:] == '.dcm']
    sample = dicoms[0]
    sample = os.path.join(patient_folder, sample)
    sample = dicom.read_file(sample)

    # collect information
    patient_id = str(sample.PatientName)
    date = str(sample.AcquisitionDate)
    thickness = str(sample.SliceThickness)
    rows = int(sample.Rows)
    columns = int(sample.Columns)
    nb_dicoms = len(dicoms)
    description = str(sample.SeriesDescription)
    view = find_view(description)
    modality = find_modality(description)

    return patient_id, view, modality, date, thickness, rows, columns, nb_dicoms, description

In [3]:
data_dir = '/labs/gevaertlab/data/radiology/StanfordGBM/2018-03-05/GBM'

# get the patient folders to classify
all_files_path = []
for patient in os.listdir(data_dir): 
    patient_path = os.path.join(data_dir, patient)
    if os.path.isdir(patient_path):
        folders = os.listdir(patient_path)
        folders = [os.path.join(patient_path, fold) for fold in folders]
        folders = [fold for fold in folders if os.path.isdir(fold)]
        patient_path = os.path.join(patient_path, folders[0])
        folders = folders = os.listdir(patient_path)
        folders = [os.path.join(patient_path, fold) for fold in folders]
        folders = [fold for fold in folders if os.path.isdir(fold)]
        all_files_path.extend(folders)

In [10]:
csv_path = '/home/romains/stanford_gbm.csv'

In [7]:
# save csv on disk to export to speadsheet after and check results
with open(csv_path, 'w') as csvfile:
    fieldnames = ['id', 'description', 'view', 'modality',\
                  'nb_dicoms', 'date', 'thickness', 'rows', 'columns', 'raw']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for k, folder in enumerate(all_files_path):
        if k % 300 == 0:
            print('file %d out of %d'%(k + 1, len(all_files_path)))
        try:
            patient_id, view, modality, date, thickness, rows,\
                columns, nb_dicoms, description = collect_info_patient_folder(folder)
        except:
            continue
        writer.writerow({'id': patient_id, 'description': description,\
                         'view': view, 'modality': modality, 'nb_dicoms': nb_dicoms,\
                         'date': date, 'thickness': thickness, 'rows': rows,\
                         'columns': columns, 'raw': folder})

file 1 out of 2315
file 301 out of 2315
file 601 out of 2315
file 901 out of 2315
file 1201 out of 2315
file 1501 out of 2315
file 1801 out of 2315
file 2101 out of 2315


In [17]:
# load csv to run some analyses on the number of patients for each view/modality
df = pd.read_csv(csv_path)

# drop any row for which view is not axial
df3 = df.replace('sagital', pd.np.nan)
df3 = df3.replace('coronal', pd.np.nan)
df3 = df3.dropna(axis=0, how='any')

In [18]:
dfpivot = df3.loc[:,['id', 'modality', 'nb_dicoms', 'date']]
# TODO: figure out which folder to keep in case one patient has several folders for the same view and modality
dfpivot = dfpivot.sort_values(by=['date'])
dfpivot.drop_duplicates(subset=['id', 'modality'], keep='first', inplace=True)
dfpivot = dfpivot.loc[:, ['id', 'modality', 'nb_dicoms']]
dfpivot = dfpivot.set_index(['id', 'modality'])
dfpivot = dfpivot.unstack(fill_value=0)
dfpivot = dfpivot.loc[:, 'nb_dicoms']

dfpivot

modality,dwi,flair,gre,multiple modalities detected,t1 pre or post?,t1post,t1pre,t2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Patient-01,60,52,30,0,0,30,30,30
Patient-02,64,52,31,0,0,31,13,32
Patient-03,0,24,0,24,24,24,0,24
Patient-04,62,52,32,0,0,32,32,32
Patient-05,0,36,0,36,36,32,32,0
Patient-06,176,68,0,0,68,0,0,68
Patient-07,48,49,24,0,24,24,0,24
Patient-08,26,26,0,26,0,0,0,26
Patient-09,54,0,0,27,27,27,0,0
Patient-10,80,43,0,0,43,43,0,43


In [23]:
dfpivot.loc[(dfpivot.flair > 0)].shape
dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0)].shape
dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.flair == dfpivot.t1post)].shape
# dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.t1pre > 0) & (dfpivot.t2 > 0)].shape
#Counter(dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) & (dfpivot.flair == dfpivot.t1post)].flair)
# dfpivot.loc[(dfpivot.flair > 0) & (dfpivot.t1post > 0) | (dfpivot.flair > 0) & (dfpivot["t1 pre or post?"] > 0)]
# dfpivot.loc[dfpivot.flair > 0]

(30, 8)

In [25]:
# list of descriptions for which we can't classify between T1 post and T1 pre or mutliple modalities
df4 = df3[df3.modality == "t1 pre or post?"]
#df4 = df3[df3.modality == "multiple modalities detected"]
df4.drop_duplicates(subset=['description'], keep='first', inplace=True)
df4.loc[:, ['description', 'raw']]
df4.description.tolist()
#list(np.unique(np.array(sorted(list(df3[df3.modality == "t1 pre or post?"].description)))))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


['AX T1',
 'AX T1 SE',
 'Ax T1',
 'AXIAL T1 SE',
 'AX T1 FLASH',
 'AX T1 FFE',
 'AX T1 FSE',
 'AX T1 Variable flip SE',
 'hAXIAL SE T1 +',
 'K:AX T1W',
 'Ax T1 Variable flip SE',
 'AX T1 STEREO',
 'H2:AX T1']