In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import re
import csv
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook, tqdm

import pydicom
from collections import Counter

import shutil

from utils.data_utils import arr_to_im_path
from utils.classify_utils import collect_info_patient_folder

### Load all folder names

In [33]:
data_dirs = ['/labs/gevaertlab/users/hackhack/RTOG/RTOG_duplicate/0825-6686 DAR',
             '/labs/gevaertlab/users/hackhack/RTOG/RTOG_duplicate/0825-6686 DAR 2of2']

# get the raw names of the folders to classify
all_files = []
all_files_path = []
bar = tqdm_notebook(total=sum([len(os.listdir(data_dir)) for data_dir in data_dirs]))
for data_dir in data_dirs:
    for study in os.listdir(data_dir):
        if 'Studies' in study:
            study_path = os.path.join(data_dir, study)
            for filename in os.listdir(study_path):
                filename_path = os.path.join(study_path, filename)
                # exclude folders with no dicoms
                if len(os.listdir(filename_path)) > 0:
                    full_name = study + '/' + filename
                    all_files.append(full_name)
                    all_files_path.append(filename_path)
        bar.update(1)

HBox(children=(IntProgress(value=0, max=103), HTML(value='')))

KeyboardInterrupt: 

In [42]:
pickle.dump(all_files_path, open("all_files_path.pkl", "wb"))

Load pickle

In [7]:
all_files_path = pickle.load(open("all_files_path.pkl", "rb"))

In [8]:
ex = os.listdir(all_files_path[0])[0]
sample = pydicom.read_file(os.path.join(all_files_path[0], ex))
print(sample)

(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'OTHER']
(0008, 0016) SOP Class UID                       UI: MR Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.840.113619.2.244.3596.13650034.1926.1291036028.100
(0008, 0020) Study Date                          DA: '010101'
(0008, 0021) Series Date                         DA: '20101130'
(0008, 0022) Acquisition Date                    DA: '20101130'
(0008, 0023) Content Date                        DA: '20101130'
(0008, 0030) Study Time                          TM: '063719'
(0008, 0031) Series Time                         TM: '070200'
(0008, 0032) Acquisition Time                    TM: '070200'
(0008, 0033) Content Time                        TM: '070200'
(0008, 0050) Accession Number                    SH: '0'
(0008, 0060) Modality                            CS: 'MR'
(0008, 0070) Manufacturer                        LO: 

In [9]:
sample[('0019', '109d')]

(0019, 109d) [Pulse Sequence Date]               DT: '20100224074652'

In [10]:
sample.SeriesDescription

'3 PLANE LOC'

### Extract fields from DCM files

In [11]:
extracted_fields = []
for k, raw_file in enumerate(tqdm_notebook(all_files_path)):
    study = raw_file.strip().split('/')[-2]
    try:
        patient_id, view, modality, date, thickness, rows,\
            columns, nb_dicoms, description = collect_info_patient_folder(raw_file)
    except Exception as e:
        print(e)
        continue
    full_id = os.path.join(study, patient_id)
    extracted_fields.append({'study': study, 'patient_id': patient_id, 'full_id': full_id,
                             'description': description, 'view': view, 'modality': modality, 
                             'nb_dicoms': nb_dicoms, 'date': date, 'thickness': thickness, 'rows': rows,
                             'columns': columns, 'raw_path': raw_file})
images_fields0 = pd.DataFrame(extracted_fields)

HBox(children=(IntProgress(value=0, max=29394), HTML(value='')))




KeyboardInterrupt: 

In [18]:
images_fields0.to_csv("images_fields.csv")

Load from csv

In [3]:
images_fields = pd.read_csv("images_fields.csv")

In [4]:
print(len(images_fields.patient_id.unique()), 'patients')

541 patients


In [5]:
len(images_fields)

29394

In [6]:
images_fields.head()

Unnamed: 0.1,Unnamed: 0,columns,date,description,full_id,modality,nb_dicoms,patient_id,raw_path,rows,study,thickness,view
0,0,256,20101130.0,3 PLANE LOC,0101-01__Studies/727^1027^825^^,,15,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,0101-01__Studies,5.0,
1,1,256,20101130.0,AXIAL SPGR-BRAIN LAB,0101-01__Studies/727^1027^825^^,,124,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,0101-01__Studies,1.6,axial
2,2,256,20101130.0,AXIAL SPGR-BRAIN LAB,0101-01__Studies/727^1027^825^^,,124,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,0101-01__Studies,1.6,axial
3,3,256,20101130.0,DTI 25 directions 1000b,0101-01__Studies/727^1027^825^^,,338,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,0101-01__Studies,5.0,
4,4,512,20101130.0,SCREENSAVE,0101-01__Studies/727^1027^825^^,,11,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,0101-01__Studies,5.0,
