**This notebook shows the entire data collection pipeline for this project, using a single collection (TCGA-LGG) from The Cancer Imaging Archive (TCIA).**

In [1]:
# import modules

import requests
import json
import zipfile
import pandas as pd
import io
import pydicom
import tqdm
from pydicom import dcmread
from pydicom.filebase import DicomBytesIO
import multiprocessing as mp
import matplotlib.pyplot as plt
%matplotlib inline

**The Cancer Imaging Archive (TCIA) is a service which de-identifies and hosts a large archive of medical images of cancer accessible for public download. The data are organized as “collections”; typically patients’ imaging related by a common disease (e.g. lung cancer), image modality or type (MRI, CT, digital histopathology, etc) or research focus. DICOM is the primary file format used by TCIA for radiology imaging. Supporting data related to the images such as patient outcomes, treatment details, genomics and expert analyses are also provided when available.**

Here, we have provided a series of methods to make requests to the TCIA API and download medical images of cancer available in their archive

# Get all the collections from the TCIA API 

In [2]:
def getCollections(timeout=60):
    '''Return the names of all collections available in the TCIA archive.

    Sends a GET request to the TCIA v3 ``getCollectionValues`` endpoint asking
    for JSON, and collects the ``'Collection'`` field of every returned entry.

    Parameters
    ----------
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that an unresponsive server cannot
        block the notebook indefinitely. Defaults to 60 seconds.

    Returns
    -------
    list
        The ``'Collection'`` value of each entry in the JSON response.

    Raises
    ------
    ValueError
        If the server answers with any status code other than 200.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    '''
    baseurl = 'https://services.cancerimagingarchive.net/services/v3/TCIA'
    queryEndpoint = '/query/getCollectionValues?'
    queryParams = ''
    form = 'format=json'
    url = baseurl+queryEndpoint+queryParams+form

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError('Bad/No response')
    return [entry['Collection'] for entry in response.json()]

# Fetch the collection names once; later cells reuse this list.
AllCollections = getCollections()

# Map each collection name to its affected body part values

In [3]:
%%time
### multiprocessing domain

import gbp #the function py file

pool = mp.Pool(mp.cpu_count())
results = pool.map(gbp.getBodyPart,[c for c in AllCollections])


  0%|                                                                                           | 0/96 [00:00<?, ?it/s]

Wall time: 2min 49s


In [4]:
# Pair every collection name with the result computed for it and expose the
# mapping under both names.
BodyPartAffected = pooledBodyPartAffected = {
    collection: parts for collection, parts in zip(AllCollections, results)
}

In [5]:
%store BodyPartAffected

Stored 'BodyPartAffected' (dict)


In [2]:
%store -r BodyPartAffected

# Get the collections with only brain images 

In [3]:
def filter_collections(body_parts=None):
    '''Pick out the collections that cover only the brain or only the lung.

    Parameters
    ----------
    body_parts : dict, optional
        Mapping of collection name -> sequence of body part strings. When
        omitted, the notebook-level ``BodyPartAffected`` dict is used, which
        keeps the original zero-argument call working.

    Returns
    -------
    tuple of (list, list)
        ``(Collections_brain, Collections_lung)``: names of the collections
        whose body part sequence has exactly one entry, equal to ``'BRAIN'``
        or ``'LUNG'`` respectively. Collections with zero or several body
        parts are left out of both lists.
    '''
    if body_parts is None:
        # Fall back to the value restored/created earlier in the notebook.
        body_parts = BodyPartAffected

    Collections_brain = []
    Collections_lung = []
    for key, value in body_parts.items():
        if len(value) != 1:
            continue
        if value[0] == 'BRAIN':
            Collections_brain.append(key)
        elif value[0] == 'LUNG':
            Collections_lung.append(key)
    return Collections_brain, Collections_lung

# Unpack the (brain, lung) collection-name lists returned by filter_collections.
brain,lungs = filter_collections()

In [4]:
# NOTE(review): index 3 is positional. The output below shows it was
# 'TCGA-LGG' for this run, but that relies on the order of the entries in
# BodyPartAffected; confirm before reusing.
brain[3]

'TCGA-LGG'

# Out of the brain images collections, get the collections with clinical data and download the clinical data zip file

*To be done by someone.* For now, I am assuming that we already have the TCGA-LGG clinical data zip file.

# Out of the clinical data, get the dataframes with vital status

In [5]:
# Relative path: the archive is looked up in the notebook's working directory.
zfile = 'TCGA-LGG Clinical Data 1516.zip'
def getdfs(file):
    '''Load the clinical tables of a zip archive that carry a vital status.

    Every member of the archive is read as a tab-separated table whose first
    three rows form a three-level column header. Only tables with at least one
    column whose header contains ``'vital_status'`` are returned.

    Parameters
    ----------
    file : str or file-like
        The zip archive, in any form accepted by ``zipfile.ZipFile``.

    Returns
    -------
    list of pandas.DataFrame
        The matching tables in archive order, each appearing exactly once.
    '''
    dataframes = []
    with zipfile.ZipFile(file) as thezip:
        for filename in thezip.namelist():
            print(filename)
            # Close each member handle as soon as it has been parsed.
            with thezip.open(filename) as member:
                dataframes.append(pd.read_csv(member, sep='\t', header=[0, 1, 2]))

    # Each column label is a 3-tuple (one entry per header row), so `in`
    # checks whether any header level equals 'vital_status'. any() keeps a
    # table once even when several of its columns match; appending inside the
    # column loop would add the same table once per matching column.
    return [
        frame for frame in dataframes
        if any('vital_status' in column for column in frame.columns)
    ]

# Keep only the clinical tables that have a vital_status column.
dfs = getdfs(zfile)

nationwidechildrens.org_clinical_nte_lgg.txt
nationwidechildrens.org_clinical_omf_v4.0_lgg.txt
nationwidechildrens.org_clinical_patient_lgg.txt
nationwidechildrens.org_clinical_radiation_lgg.txt
nationwidechildrens.org_clinical_drug_lgg.txt
nationwidechildrens.org_clinical_follow_up_v1.0_lgg.txt


# From the dataframes, get the patient ids and their outcome

In [6]:
def getPatientID(df):
    '''Extract patient barcodes and vital-status labels from a clinical table.

    For every row, the first value found under the ``'bcr_patient_barcode'``
    column and the first value found under the ``'vital_status'`` column are
    taken.

    Returns a tuple ``(PIDS, labels)`` of two lists, both in row order.
    '''
    barcode_rows = df['bcr_patient_barcode'].values
    PIDS = [row[0] for row in barcode_rows]

    status_rows = df.loc[:, 'vital_status'].values
    labels = [row[0] for row in status_rows]

    return PIDS, labels

In [7]:
# Only the first table returned by getdfs is used for IDs and labels.
PID,labels = getPatientID(dfs[0])

In [8]:
# Sanity check: there should be exactly one label per patient ID.
len(PID),len(labels)

(197, 197)

# From the patient IDs and collections, get the series id

In [10]:
%%time
### multiprocessing domain

import gSID #the function py file

pool = mp.Pool(mp.cpu_count())
results = pool.starmap(gSID.getSeriesID,[(pid,brain[3]) for pid in PID])

  0%|                                                                                          | 0/197 [00:00<?, ?it/s]

Wall time: 6min 13s


In [11]:
# One record per patient: its outcome label plus the series IDs fetched for it.
pooledpdict = {
    patient: {'label': outcome, 'SerialIDs': series}
    for patient, outcome, series in zip(PID, labels, results)
}

In [12]:
%store pooledpdict

Stored 'pooledpdict' (dict)


In [9]:
%store -r pooledpdict

In [10]:
# Plain alias, not a copy: keys later added through pdict also appear in pooledpdict.
pdict = pooledpdict

# From the series ids, try to get the first five images in that series file

*TODO: picking 5 random images might be better.*

In [12]:
# Worker pool with one process per CPU core, created in its own cell; the
# download cell below uses it and closes it.
pool = mp.Pool(mp.cpu_count())

In [13]:
%%time
### multiprocessing domain

import gI #the function py file

results = pool.map(gI.getImages,[pdict[pid]['SerialIDs'] for pid in PID])
pool.close() 

Wall time: 1h 16min 3s


In [14]:
%%time
for pid,imgdata in zip(PID,results):
    imagedataset = []
    for f in imgdata:
        try:
            img = dcmread(DicomBytesIO(f))
            imagedataset.append(img.pixel_array)
        except:
            continue
        
    pdict[pid]['images'] = imagedataset 
    pdict[pid]['image_count'] = len(imagedataset)



Wall time: 39.9 s


In [15]:
%store pdict

Stored 'pdict' (dict)
