In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import re
import csv
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook, tqdm

import pydicom
from collections import Counter

import shutil

from utils.data_utils import arr_to_im_path
from utils.classify_utils import collect_info_patient_folder

## Allow loading of radiologist annotated information

In [14]:
# Maps a label found in the `modality` column of the annotated CSVs to the
# modality tag written into `images_fields.modality`.
annotation_to_modality = {
    "T1 pre ": "t1pre",   # same label, typed with a trailing space
    "T1 pre": "t1pre",
    "T1 post": "t1post",
}

In [15]:
def enrich_with_dataframe_annotation(images_fields, annotation_df, verbose=False, mapping=None): # in-place
    """Overwrite `images_fields.modality` from a table of labelled descriptions.

    For every row of `annotation_df` whose `modality` label is recognised, all
    rows of `images_fields` with the same `description` get the corresponding
    modality tag. `images_fields` is modified in place; nothing is returned.

    Parameters
    ----------
    images_fields : pandas.DataFrame
        Table with at least a `description` column; its `modality` column is
        overwritten where a labelled description matches.
    annotation_df : pandas.DataFrame
        Table with `description` and `modality` (label) columns.
    verbose : bool, default False
        If True, print each unrecognised label as it is encountered.
    mapping : dict, optional
        Label -> modality tag. Defaults to the notebook-level
        `annotation_to_modality`.

    Notes
    -----
    A label is looked up verbatim first, then with surrounding whitespace
    stripped, so labels such as "T1 post " are recognised without needing a
    dedicated key. The set of unrecognised labels is printed once at the end.
    """
    if mapping is None:
        mapping = annotation_to_modality

    unknown_modalities = set()
    # Only these two columns are needed, so iterate them directly instead of
    # materialising a Series per row with iterrows().
    for row_descr, label in zip(annotation_df["description"], annotation_df["modality"]):
        row_mod = mapping.get(label)
        if row_mod is None and isinstance(label, str):
            # Tolerate stray leading/trailing whitespace in hand-typed labels.
            row_mod = mapping.get(label.strip())

        if row_mod is None:
            unknown_modalities.add(label)
            if verbose:
                print("Unrecognized labelled modality:", label)
            continue

        images_fields.loc[images_fields["description"] == row_descr, "modality"] = row_mod

    # key=str keeps the ordering well-defined even if NaN labels are mixed with strings.
    print(sorted(unknown_modalities, key=str))

In [16]:
def enrich_with_radiologist_annotation(images_fields, annotation_dir="radiologist_classification"): # in-place
    """Apply the three annotated description tables to `images_fields` in place.

    The annotated CSVs are read from `annotation_dir` and applied one after
    the other with `enrich_with_dataframe_annotation`; after each one the
    number of rows whose modality is still unresolved is printed.

    Parameters
    ----------
    images_fields : pandas.DataFrame
        Table whose `modality` column is overwritten in place.
    annotation_dir : str, default "radiologist_classification"
        Folder containing the three `*-annotated.csv` files.
    """
    annotation_files = (
        "descriptions_unclassified_t1_preVpost-annotated.csv",
        "descriptions_unclassified_multiple_modalities-annotated.csv",
        "descriptions_unclassified_no_modalities-annotated.csv",
    )
    # Substrings that mark a modality value as unresolved.
    unknown_markers = ("multiple modalities detected", "N/A", "t1 pre or post?")

    def print_number_images_with_unknown_modality(images_fields):
        modality = images_fields["modality"]
        is_unknown = pd.Series(False, index=modality.index)
        for marker in unknown_markers:
            # regex=False: the markers are literal text ('?' must not act as a
            # regex quantifier); na=False: a missing modality is not a match.
            is_unknown = is_unknown | modality.str.contains(marker, regex=False, na=False)
        print("Images with unknown modality: {}".format(is_unknown.sum()))

    # Read every file before touching images_fields, so that a missing file
    # fails before any in-place modification has happened.
    annotations = [pd.read_csv(os.path.join(annotation_dir, file_name))
                   for file_name in annotation_files]

    for annotation in annotations:
        enrich_with_dataframe_annotation(images_fields, annotation)
        print_number_images_with_unknown_modality(images_fields)


### Load all DICOM files metadata

Load from csv

In [19]:
# Load the DICOM metadata table. na_filter=False disables missing-value detection,
# so empty cells stay '' and markers such as 'N/A' stay plain strings (no NaN).
images_fields = pd.read_csv("images_fields.csv", na_filter=False)
# Optional step, currently disabled: overwrite modalities in place with the annotated labels.
# enrich_with_radiologist_annotation(images_fields)
# NOTE: this binds a second name to the same DataFrame (no copy); any in-place
# change made through `images_fields` is also visible through `images_fields0`.
images_fields0 = images_fields

In [18]:
print(len(images_fields.patient_id.unique()), 'patients')

541 patients


In [159]:
len(images_fields)

29394

In [160]:
images_fields.head()

Unnamed: 0.1,Unnamed: 0,columns,date,description,full_id,modality,nb_dicoms,patient_id,raw_path,rows,study,thickness,view
0,0,256,20101130,3 PLANE LOC,0101-01__Studies/727^1027^825^^,,15,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,0101-01__Studies,5.0,
1,1,256,20101130,AXIAL SPGR-BRAIN LAB,0101-01__Studies/727^1027^825^^,,124,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,0101-01__Studies,1.6,axial
2,2,256,20101130,AXIAL SPGR-BRAIN LAB,0101-01__Studies/727^1027^825^^,,124,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,0101-01__Studies,1.6,axial
3,3,256,20101130,DTI 25 directions 1000b,0101-01__Studies/727^1027^825^^,,338,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,0101-01__Studies,5.0,
4,4,512,20101130,SCREENSAVE,0101-01__Studies/727^1027^825^^,,11,727^1027^825^^,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,0101-01__Studies,5.0,


# Extracting patients for which we have full information

#### Looking for two modalities available at the first date

In [81]:
images_fields = images_fields0

In [82]:
# Index the table by (patient_id, modality) and sort on that index.
indexed_images_fields = images_fields.set_index(["patient_id", "modality"]).sort_index()
# Second name for the same object (no copy). The cells below rebind
# `indexed_images_fields` to filtered frames, so this name keeps referring to
# the unfiltered, indexed table.
indexed_images_field_notfirstdate = indexed_images_fields

In [83]:
print(len(indexed_images_fields.index.get_level_values(0).unique()), 'patients')

541 patients


Keep patients with both modalities available

In [84]:
# Drop patients without any 'flair' row, then patients without any 't1post'
# row, printing the number of remaining patients after each pass.
for wanted_modality in ('flair', 't1post'):
    indexed_images_fields = indexed_images_fields.groupby("patient_id").filter(
        lambda grp: wanted_modality in grp.index.get_level_values(1).values)
    print(len(indexed_images_fields.index.get_level_values(0).unique()), 'patients')

# For the remaining patients, keep only the rows of the two modalities of interest.
indexed_images_fields = indexed_images_fields.query("modality in ['t1post', 'flair']")

485 patients
443 patients


Keep patients with axial view available

In [85]:
# Keep only rows whose view is exactly 'axial'.
is_axial = indexed_images_fields["view"] == 'axial'
indexed_images_fields = indexed_images_fields.loc[is_axial]
print(len(indexed_images_fields.index.get_level_values(0).unique()), 'patients')

# Display only: same rows re-indexed by (patient_id, date, modality) for inspection.
display_index = ['patient_id', 'date', 'modality']
indexed_images_fields.reset_index().set_index(display_index).sort_index()

426 patients


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 0,columns,description,full_id,nb_dicoms,raw_path,rows,study,thickness,view
patient_id,date,modality,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
101^8007^825^^,20101227.0,flair,13033,256,T2 FLAIR AXIAL,2010-12__Studies/101^8007^825^^,22,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,2010-12__Studies,5.00,axial
101^8007^825^^,20101227.0,t1post,13028,512,T1 AX+C,2010-12__Studies/101^8007^825^^,22,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2010-12__Studies,5.00,axial
102^9598^825^^,20091015.0,flair,843,512,+C AX FLAIR,2009-10__Studies/102^9598^825^^,28,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2009-10__Studies,5.00,axial
102^9598^825^^,20091015.0,t1post,844,512,+c AXIAL T1,2009-10__Studies/102^9598^825^^,28,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2009-10__Studies,5.00,axial
102^9598^825^^,20100203.0,flair,2406,512,+C AX FLAIR,2010-02__Studies/102^9598^825^^,28,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2010-02__Studies,5.00,axial
102^9598^825^^,20100203.0,t1post,2407,512,+c AXIAL T1,2010-02__Studies/102^9598^825^^,28,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2010-02__Studies,5.00,axial
103^7796^825^^,20100317.0,t1post,2950,384,AX T1 IR GAD,2010-03__Studies/103^7796^825^^,26,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,384,2010-03__Studies,5.00,axial
103^7796^825^^,20100609.0,t1post,5742,384,AX T1 IR GAD,2010-06__Studies/103^7796^825^^,26,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,384,2010-06__Studies,5.00,axial
104^601^825^^,20090926.0,flair,630,320,T2 AX FLAIR,2009-09__Studies/104^601^825^^,28,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,320,2009-09__Studies,5.00,axial
104^601^825^^,20091231.0,flair,1467,512,AX OBL FLAIR TO AC/PC LINE,2009-12__Studies/104^601^825^^,26,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2009-12__Studies,5.00,axial


For two images of the same patient with the same view and modality, keep the earliest one (rows are sorted by ascending date and the first occurrence is kept).

In [86]:
# Order rows by ascending date, then keep the first occurrence of every
# (patient_id, modality) index pair, i.e. the earliest-dated row of each pair.
rows_by_date = indexed_images_fields.sort_values(by=['date'])
is_repeated_pair = rows_by_date.index.duplicated(keep='first')
indexed_images_fields = rows_by_date.loc[~is_repeated_pair]

In [87]:
print(len(indexed_images_fields.index.get_level_values(0).unique()), 'patients')
indexed_images_fields.head()

426 patients


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,columns,date,description,full_id,nb_dicoms,raw_path,rows,study,thickness,view
patient_id,modality,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
125^126^825^^,t1post,119,236,20040318.0,AXIAL T1 POST_FIL,2004-03__Studies/125^126^825^^,25,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,320,2004-03__Studies,5.0,axial
125^126^825^^,flair,118,512,20040318.0,AXIAL FLAIR,2004-03__Studies/125^126^825^^,25,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,640,2004-03__Studies,5.0,axial
203^2503^825^^,flair,138,256,20090403.0,AX FLAIR,2009-04__Studies/203^2503^825^^,23,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,2009-04__Studies,5.0,axial
9^7627^825^^,flair,210,256,20090602.0,Ax FLAIR,2009-06__Studies/9^7627^825^^,20,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,256,2009-06__Studies,5.0,axial
10^7017^825^^,t1post,147,512,20090603.0,AX T1 POST SPIN ECHO,2009-06__Studies/10^7017^825^^,76,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2009-06__Studies,5.0,axial


Keeping patients with two modalities taken at the same date

In [88]:
# Keep a patient only if it has more than one row and its first two rows share
# the same date. `.iloc` makes the positional access explicit: `x.date[0]` relied
# on the integer-key fallback of Series.__getitem__, which is deprecated when the
# index (here patient_id/modality) is not integer-based.
indexed_images_fields = indexed_images_fields.groupby("patient_id").filter(
    lambda x: len(x) > 1 and x["date"].iloc[0] == x["date"].iloc[1])
print(len(indexed_images_fields.index.get_level_values(0).unique()), 'patients')
indexed_images_fields.head()

255 patients


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,columns,date,description,full_id,nb_dicoms,raw_path,rows,study,thickness,view
patient_id,modality,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
125^126^825^^,t1post,119,236,20040318.0,AXIAL T1 POST_FIL,2004-03__Studies/125^126^825^^,25,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,320,2004-03__Studies,5.0,axial
125^126^825^^,flair,118,512,20040318.0,AXIAL FLAIR,2004-03__Studies/125^126^825^^,25,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,640,2004-03__Studies,5.0,axial
6^2421^825^^,t1post,196,416,20090622.0,AX T1 SE FS POST 512,2009-06__Studies/6^2421^825^^,24,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2009-06__Studies,5.0,axial
6^2421^825^^,flair,197,448,20090622.0,AX T2 FLAIR,2009-06__Studies/6^2421^825^^,24,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2009-06__Studies,5.0,axial
20^3722^825^^,t1post,173,512,20090625.0,Ax T1 FSE + GAD,2009-06__Studies/20^3722^825^^,23,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...,512,2009-06__Studies,5.0,axial


In [90]:
indexed_images_fields.to_csv("rtog_kept_dicoms_T1post_flair.csv")

In [92]:
# Distinct patient ids (first index level) of the rows kept so far, as a numpy array.
kept_patient_ids = indexed_images_fields.index.get_level_values(0)
patients_shortlist = kept_patient_ids.unique().values
print(len(patients_shortlist), "patients")
patients_shortlist

255 patients


array(['125^126^825^^', '6^2421^825^^', '20^3722^825^^', '16^7758^825^^',
       '25^7504^825^^', '27^5715^825^^', '21^3722^825^^', '46^2508^825^^',
       '48^2503^825^^', '52^8907^825^^', '79^154^825^^', '56^2424^825^^',
       '75^9682^825^^', '55^1324^825^^', '53^2408^825^^', '54^2421^825^^',
       '89^3442^825^^', '85^7829^825^^', '122^9672^825^^',
       '88^1515^825^^', '105^7017^825^^', '111^601^825^^',
       '115^9666^825^^', '5^131^825^^', '118^2503^825^^', '119^399^825^^',
       '141^5901^825^^', '102^9598^825^^', '23^2232^825^^',
       '137^7774^825^^', '121^7751^825^^', '150^5901^825^^',
       '123^7774^825^^', '129^7902^825^^', '42^9649^825^^',
       '142^611^825^^', '160^9709^825^^', '167^8701^825^^',
       '170^137^825^^', '138^7001^825^^', '145^2217^825^^',
       '168^7830^825^^', '173^1324^825^^', '151^137^825^^',
       '177^7728^825^^', '180^2503^825^^', '179^7821^825^^',
       '193^7723^825^^', '158^7004^825^^', '226^1523^825^^',
       '205^7627^825^^', '

# Investigating un-classified fields

In [165]:
images_fields = images_fields0

### Filtering out patients for which we have full information

In [166]:
# Remove every row belonging to a patient that is already on the shortlist.
is_shortlisted = images_fields["patient_id"].isin(patients_shortlist)
images_fields = images_fields[~is_shortlisted]
print(images_fields["patient_id"].nunique(dropna=False), 'patients')

286 patients


### Dropping unwanted views

In [167]:
images_fields.view.unique()

array(['N/A', 'axial', 'coronal', 'sagital'], dtype=object)

In [168]:
# Remove rows whose view is explicitly non-axial; rows with any other view
# value (e.g. 'N/A') are kept. 'sagital' is spelled as it appears in the data.
non_axial_views = ['sagital', 'coronal']
images_fields = images_fields[~images_fields["view"].isin(non_axial_views)]

In [169]:
print(len(images_fields.patient_id.unique()), 'patients')

285 patients


In [170]:
len(images_fields)

11251

### Dropping unwanted modalities

In [171]:
images_fields.modality.unique()

array(['N/A', 't2', 'flair', 't1pre', 't1post', 't1 pre or post?',
       'multiple modalities detected: t2/gre', 'gre', 'dwi',
       'multiple modalities detected: t2/blade',
       'multiple modalities detected: dwi/propeller',
       'multiple modalities detected: t2/flair/propeller',
       'multiple modalities detected: t2/propeller',
       'multiple modalities detected: t1/mprage',
       'multiple modalities detected: t1/flair', 'mprage',
       'multiple modalities detected: t1/blade',
       'multiple modalities detected: flair/blade',
       'multiple modalities detected: t1/gre',
       'multiple modalities detected: t1/t2', 'blade',
       'multiple modalities detected: t1/flair/propeller',
       'multiple modalities detected: t2/mprage'], dtype=object)

In [172]:
# Remove rows already classified as 't1pre' or 't2'. Every other modality value
# is kept, including other recognised sequences such as 'gre' or 'dwi'.
excluded_modalities = ['t1pre', 't2']
images_fields = images_fields[~images_fields["modality"].isin(excluded_modalities)]

In [173]:
print(len(images_fields.patient_id.unique()), 'patients')

284 patients


In [174]:
len(images_fields)

9917

### Unclassified modality cases

##### T1post vs. T1pre

List of descriptions for which we can't classify between T1 post and T1 pre

In [204]:
# Rows tagged as T1 but not resolved between pre- and post-contrast.
is_t1_unresolved = images_fields["modality"] == "t1 pre or post?"
pre_or_post_images_fields = images_fields.loc[is_t1_unresolved]

# Report the number of distinct descriptions, then the number of rows.
unresolved_t1_descriptions = pre_or_post_images_fields["description"].unique()
print(len(unresolved_t1_descriptions))
print(len(pre_or_post_images_fields))
sorted(unresolved_t1_descriptions)

213
1030


['* Brain    *AX T1/SE',
 '*T1 AXIAL INC NOSE/FACE/TOP HEAD',
 '+AX T1',
 '+AX T1 FSE',
 '+AX T1 SE',
 '+Ax T1',
 '+Ax T1 SE',
 '1. AX SE T1',
 '1. AX TSE T1',
 '1MM AX SPGR 3D T1+16C',
 '2. AX SE T1 FS W',
 '2. AX SE T1 FS W  10 ML GADAVIST',
 '2. AX SE T1 FS W  15ML MAGNEVIST',
 '2. AX SE T1 W FS',
 '2. AX TSE T1 FS W  10ML GADAVIST',
 '2. AX TSE T1 FS W  12ML GADAVIST',
 '2. AX TSE T1 FS W  20ML MAGNEVIST',
 '2. AX TSE T1 FS W 20 ML MAGNEVIST',
 '3d T1 FSPGR (INCLUDE NOSE TIP',
 '7 -Ax T1',
 '7-Ax T1',
 'AX 2D T1',
 'AX 3D FSPGR T1 BRAINLAB',
 'AX 3D T1',
 'AX 3D T1 / Corrected rCBV',
 'AX 3D T1 / Leakage',
 'AX FSE T1',
 'AX FSPGR 3D T1 +16mL',
 'AX FSPGR 3D T1 +17mL',
 'AX FSPGR 3D T1 +20mL',
 'AX SE T1',
 'AX SE T1 512',
 'AX SPGR T1',
 'AX SPGR T1_ND',
 'AX T1',
 'AX T1 3D FLASH  THIN 1.0MM',
 'AX T1 3D FLASH THIN 1.0 MM',
 'AX T1 3D FSPGR +17mL',
 'AX T1 BRAIN',
 'AX T1 C+magnevist',
 'AX T1 FC PG MTC',
 'AX T1 FLASH',
 'AX T1 FLASH 2D',
 'AX T1 FSPGR',
 'AX T1 FSPGR 3D',
 'AX 

Descriptions that we can't classify as T1 post or T1 pre, with the related DICOM folder paths.

In [205]:
# Reduce to the identifying columns, order by description and renumber the rows.
kept_columns = ["patient_id", "description", "raw_path"]
pre_or_post_images_fields = (
    pre_or_post_images_fields[kept_columns]
    .sort_values("description")
    .reset_index(drop=True)
)
pre_or_post_images_fields

Unnamed: 0,patient_id,description,raw_path
0,84^7504^825^^,* Brain *AX T1/SE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
1,339^9598^825^^,*T1 AXIAL INC NOSE/FACE/TOP HEAD,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
2,373^9598^825^^,*T1 AXIAL INC NOSE/FACE/TOP HEAD,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
3,654^9598^825^^,*T1 AXIAL INC NOSE/FACE/TOP HEAD,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
4,332^7729^825^^,+AX T1,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
5,541^7729^825^^,+AX T1,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
6,332^7729^825^^,+AX T1,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
7,332^7729^825^^,+AX T1 FSE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
8,541^7729^825^^,+AX T1 FSE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
9,199^7729^825^^,+AX T1 FSE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...


In [206]:
pre_or_post_images_fields.to_csv("images_unclassified_t1_preVpost.csv")

In [207]:
# One row per distinct description (the first row encountered), renumbered from 0.
first_row_per_description = pre_or_post_images_fields.drop_duplicates(subset="description")
pre_or_post_descriptions = first_row_per_description.reset_index(drop=True)
pre_or_post_descriptions

Unnamed: 0,patient_id,description,raw_path
0,84^7504^825^^,* Brain *AX T1/SE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
1,339^9598^825^^,*T1 AXIAL INC NOSE/FACE/TOP HEAD,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
2,332^7729^825^^,+AX T1,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
3,332^7729^825^^,+AX T1 FSE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
4,961^7729^825^^,+AX T1 SE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
5,668^7627^825^^,+Ax T1,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
6,332^7729^825^^,+Ax T1 SE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
7,520^3343^825^^,1. AX SE T1,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
8,787^3343^825^^,1. AX TSE T1,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
9,411^7735^825^^,1MM AX SPGR 3D T1+16C,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...


In [208]:
pre_or_post_descriptions.to_csv("descriptions_unclassified_t1_preVpost.csv")

##### Multiple modalities detected

List of descriptions for which several modalities were detected, so a single modality can't be assigned

In [209]:
# Rows whose modality text contains the literal marker 'multiple modalities detected'.
has_multiple_modalities = images_fields["modality"].str.contains(
    'multiple modalities detected', regex=False)
multiple_mod_images_fields = images_fields[has_multiple_modalities]

# Report the number of distinct descriptions, then the number of rows.
multiple_mod_description_values = multiple_mod_images_fields["description"].unique()
print(len(multiple_mod_description_values))
print(len(multiple_mod_images_fields))
sorted(multiple_mod_description_values)

138
624


['*AX GRE T2*',
 '*head3-pl T2* FGRE S',
 '+Ax T2 FLAIR PROPELLER',
 '+Ax T2 PROPELLER',
 '+Ax T2 Propeller',
 '+Ax T2Flair Propeller',
 '+C AX T1 MPRAGE',
 '+C Ax T2* GRE',
 '+C T1Flair Propeller',
 '+T2 Propeller',
 '1-3-pl T2* FGRE',
 '3 pl T2* FGRE S',
 '3-D T2* GRE EPI SENSE',
 '3-pl T2* FGRE',
 '3-pl T2* FGRE S',
 '3-pl T2* FGRE S HEAD  PRE',
 '3D T2 FGRE +C STEALTH',
 'AX 3D T1  MPRAGE sp GAD',
 'AX 3D T1 MPRAGE',
 'AX FLAIR BLADE',
 'AX FLAIR BLADE FATSAT',
 'AX FSE T2 Propeller',
 'AX GRE T2',
 'AX GRE T2*',
 'AX GRE T2* (Melanoma, Bleed)',
 'AX GRE T2* EPI',
 'AX IR T2 BLADE',
 'AX T1 FLAIR',
 'AX T1 FLAIR (angled) + C',
 'AX T1 FLAIR +C',
 'AX T1 FLAIR FS POST',
 'AX T1 FLAIR POST FS',
 'AX T1 FLAIR PRE',
 'AX T1 FLAIR PRE GAD',
 'AX T1 FLAIR c+',
 'AX T1 MPRAGE',
 'AX T1 MPRAGE POST-iso voxel',
 'AX T1 MPRAGE PRE-iso voxel',
 'AX T1 POST FS BLADE',
 'AX T2 BLADE',
 'AX T2 FLAIR propeller',
 'AX T2 GRE',
 'AX T2 GRE HEMO',
 'AX T2 GRE RPT',
 'AX T2 GRE_FIL',
 'AX T2 GRE_HEME

Descriptions for which several modalities were detected, with the related DICOM folder paths.

In [210]:
# Reduce to the identifying columns, order by description and renumber the rows.
kept_columns = ["patient_id", "description", "raw_path"]
multiple_mod_images_fields = (
    multiple_mod_images_fields[kept_columns]
    .sort_values("description")
    .reset_index(drop=True)
)
multiple_mod_images_fields

Unnamed: 0,patient_id,description,raw_path
0,376^5915^825^^,*AX GRE T2*,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
1,339^9598^825^^,*head3-pl T2* FGRE S,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
2,373^9598^825^^,*head3-pl T2* FGRE S,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
3,654^9598^825^^,*head3-pl T2* FGRE S,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
4,817^7729^825^^,+Ax T2 FLAIR PROPELLER,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
5,961^7729^825^^,+Ax T2 FLAIR PROPELLER,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
6,961^7729^825^^,+Ax T2 FLAIR PROPELLER,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
7,961^7729^825^^,+Ax T2 PROPELLER,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
8,961^7729^825^^,+Ax T2 PROPELLER,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
9,817^7729^825^^,+Ax T2 PROPELLER,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...


In [211]:
multiple_mod_images_fields.to_csv("images_unclassified_multiple_modalities.csv")

In [212]:
# One row per distinct description (the first row encountered), renumbered from 0.
first_row_per_description = multiple_mod_images_fields.drop_duplicates(subset="description")
multiple_mod_descriptions = first_row_per_description.reset_index(drop=True)
multiple_mod_descriptions

Unnamed: 0,patient_id,description,raw_path
0,376^5915^825^^,*AX GRE T2*,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
1,339^9598^825^^,*head3-pl T2* FGRE S,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
2,817^7729^825^^,+Ax T2 FLAIR PROPELLER,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
3,961^7729^825^^,+Ax T2 PROPELLER,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
4,199^7729^825^^,+Ax T2 Propeller,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
5,199^7729^825^^,+Ax T2Flair Propeller,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
6,344^7766^825^^,+C AX T1 MPRAGE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
7,881^3431^825^^,+C Ax T2* GRE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
8,422^2503^825^^,+C T1Flair Propeller,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
9,332^7729^825^^,+T2 Propeller,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...


In [213]:
multiple_mod_descriptions.to_csv("descriptions_unclassified_multiple_modalities.csv")

##### No modality detected

List of descriptions for which no modality was detected

In [214]:
# Rows for which no modality was detected (modality is the literal string 'N/A').
has_no_modality = images_fields["modality"] == 'N/A'
no_mod_images_fields = images_fields.loc[has_no_modality]

# Report the number of distinct descriptions, then the number of rows.
no_mod_description_values = no_mod_images_fields["description"].unique()
print(len(no_mod_description_values))
print(len(no_mod_images_fields))
sorted(no_mod_description_values)

1216
5963


['',
 '\n',
 '(14760/13/1..24)-(14760/9/1..24)',
 '(16219/10/1)-(16219/8/1)',
 '(16291/9/1..24)-(16291/4/1..24)',
 '* Brain    * DW/SSh',
 '* Brain    *AX DE',
 '* Brain    *SURVEY',
 '**PG AX**',
 '*AXIAL GRADIENT',
 '*Ax PROBE-SV PRESS 35TE',
 '*DIFFUSION QUIK GE',
 '*FAST LOC FOR SPECTRO',
 '*MST SURVEY',
 '+ AX TENSOR 25',
 '+ C AX 3D SPGR',
 '+ PERFUSION AX',
 '+ ax FSPGR BRAVO',
 '+3D SPGR F/U METS/TUMOR',
 '+AX BRAVO FSPGR 3D',
 '+C  Ax STEALTH  bravo',
 '+C  Ax bravo 3mm arc',
 '+C 3D AXIAL,IRSPGR,Fast',
 '+C AX 3D SPGR',
 '+C AX FSPGR FATSAT',
 '+C Ax PERF EPI 2:15min/20slice',
 '+C EPI perf 2:15min (~20slices)',
 '+C IR SPGR 3D VOLUMETRIC',
 '+FSPGR 3D',
 '+c Ax 3D FSPGR BRAVO 1x1x1',
 '----CONTRAST----',
 '..COR 1mm MPR sp GAD (from axial)',
 '.6 SEC HELICAL SCAN',
 '.6 SEC HELICAL W/IV',
 '.AX MPR (from sag)',
 '15CC POST GD* AX',
 '2-Calibration Scan',
 '2.0',
 '20ML GD*AX',
 '20ML POST GD* AX',
 '20ML POST GD*AX',
 '20ML*POST GD*AX',
 '3 PL LOC',
 '3 PLANE',
 '3 PLANE LOC

Descriptions for which no modality was detected, with the related DICOM folder paths.

In [216]:
# Reduce to the identifying columns, order by description and renumber the rows.
kept_columns = ["patient_id", "description", "raw_path"]
no_mod_images_fields = (
    no_mod_images_fields[kept_columns]
    .sort_values("description")
    .reset_index(drop=True)
)
no_mod_images_fields

Unnamed: 0,patient_id,description,raw_path
0,84^7504^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
1,114^7720^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
2,84^7504^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
3,84^7504^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
4,114^7720^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
5,84^7504^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
6,376^5915^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
7,84^7504^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
8,84^7504^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
9,376^5915^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...


In [217]:
no_mod_images_fields.to_csv("images_unclassified_no_modalities.csv")

In [218]:
# One row per distinct description (the first row encountered), renumbered from 0.
first_row_per_description = no_mod_images_fields.drop_duplicates(subset="description")
no_mod_descriptions = first_row_per_description.reset_index(drop=True)
no_mod_descriptions

Unnamed: 0,patient_id,description,raw_path
0,84^7504^825^^,,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
1,392^7504^825^^,\n,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
2,729^5903^825^^,(14760/13/1..24)-(14760/9/1..24),/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
3,729^5903^825^^,(16219/10/1)-(16219/8/1),/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
4,778^3343^825^^,(16291/9/1..24)-(16291/4/1..24),/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
5,84^7504^825^^,* Brain * DW/SSh,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
6,84^7504^825^^,* Brain *AX DE,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
7,84^7504^825^^,* Brain *SURVEY,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
8,696^6001^825^^,**PG AX**,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...
9,654^9598^825^^,*AXIAL GRADIENT,/labs/gevaertlab/users/hackhack/RTOG/RTOG_dupl...


In [219]:
no_mod_descriptions.to_csv("descriptions_unclassified_no_modalities.csv")