# Data Acquisition: Cancer Digital Slide Archive

This notebook illustrates the steps used to retrieve data from http://cancer.digitalslidearchive.net/. This process was mostly automated, and primarily relied on web scraping methods.

In [3]:
from bs4 import BeautifulSoup as BS
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
# import urllib.request
import re
import joblib
num_cores = 8

Initialize empty dataframe:

In [4]:
pdf = pd.DataFrame(columns = ['url', 'collection'])

Read text files from directory. These text files were created by manually browsing each collection and copying the data into the respective .txt files.

In [5]:
mypath = '/home/dagutman/devel/MSDA-Capstone/1_Data_Acquisition/cdsa_scraping/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [8]:
onlyfiles

['prad.txt',
 'acc.txt',
 'sarc.txt',
 'ov.txt',
 'chol.txt',
 'uvm.txt',
 'dlbc.txt',
 'coad.txt',
 'thca.txt',
 'lusc.txt',
 'thym.txt',
 'hnsc.txt',
 'lgg.txt',
 'read.txt',
 'tgct.txt',
 'kich.txt',
 'paad.txt',
 'brca.txt',
 'luad.txt',
 'esca.txt',
 'ucec.txt',
 'pcpg.txt',
 'ucs.txt',
 'meso.txt',
 'blca.txt',
 'gbm.txt',
 'lihc.txt',
 'kirp.txt',
 'skcm.txt',
 'stad.txt',
 'cesc.txt']

Retrieve img tags from the html documents. These tags correspond to thumbnails displayed for each slide:

In [10]:
for file in onlyfiles:
    print(file)
    with open(mypath+'/'+file, 'r') as myfile:
        data=myfile.read().replace('\n', '')

#    soup = BS(data, "lxml")
    soup = BS(data)

    for imgtag in soup.find_all('img'):
        pdf = pdf.append({"url": imgtag, "collection":file}, ignore_index=True)
pdf_bk = pdf

prad.txt




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


acc.txt
sarc.txt
ov.txt
chol.txt
uvm.txt
dlbc.txt
coad.txt
thca.txt
lusc.txt
thym.txt
hnsc.txt
lgg.txt
read.txt
tgct.txt
kich.txt
paad.txt
brca.txt
luad.txt
esca.txt
ucec.txt
pcpg.txt
ucs.txt
meso.txt
blca.txt
gbm.txt
lihc.txt
kirp.txt
skcm.txt
stad.txt
cesc.txt


Data Cleaning: Convert BS4 objects to strings, and change thumbnail width from 200 to 500:

In [11]:
pdf['collection'] = pdf['collection'].astype(str)
pdf['collection'] = pdf['collection'].str.replace('.txt', '')

In [12]:
pdf['url'] = pdf['url'].astype(str)
pdf = pdf[pdf['url'].str.contains("emory")]
pdf['url'] = pdf['url'].str.replace('WID=200', 'WID=500')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Gather urls from between quotation marks, and construct slide names:

In [13]:
pdf['url'] = pdf['url'].str.extract('"([^"]*)"')
pdf_bk = pdf
pdf['url'] = pdf['url'].str.replace('&amp;', '&')
pdf['name'] = 'TCGA'+pdf['url'].str.extract('TCGA([^\.]*)\.')

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
  after removing the cwd from sys.path.


#### Download Thumbnails:

In [23]:
folder_names = pdf['collection'].unique()
img_path = '/home/dagutman/Documents/tcga_imgs/thumbnails/'

def process_download(url_idx, df_slice, category):
    img_filename = img_path + category + '/' + df_slice['name'].iloc[url_idx] + ".jpg"
    img_url = df_slice.iloc[url_idx, 0]
    try:
        urllib.request.urlretrieve(img_url, img_filename)
    except:
        pass
    print(category+': '+str(url_idx+1)+'/'+str(len(df_slice['url'])+1))
    
for category in folder_names:
    category_dir = img_path+category
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)
    df_slice = pdf[pdf['collection'].str.contains(category)]
    joblib.Parallel(n_jobs=num_cores)(joblib.delayed(process_download)(i, df_slice, category) for i, image in enumerate(df_slice['url']))
    sys.exit()

prad: 1/1263
prad: 4/1263
prad: 2/1263
prad: 3/1263
prad: 5/1263
prad: 6/1263
prad: 7/1263
prad: 8/1263
prad: 9/1263
prad: 10/1263
prad: 11/1263
prad: 12/1263
prad: 13/1263
prad: 14/1263
prad: 15/1263
prad: 16/1263
prad: 17/1263
prad: 18/1263
prad: 19/1263
prad: 20/1263
prad: 21/1263
prad: 22/1263
prad: 23/1263
prad: 47/1263
prad: 24/1263
prad: 48/1263
prad: 25/1263
prad: 49/1263
prad: 26/1263
prad: 50/1263
prad: 27/1263
prad: 51/1263
prad: 28/1263
prad: 29/1263
prad: 77/1263
prad: 52/1263
prad: 36/1263
prad: 30/1263
prad: 78/1263
prad: 53/1263
prad: 31/1263
prad: 54/1263
prad: 32/1263
prad: 79/1263
prad: 55/1263
prad: 33/1263
prad: 80/1263
prad: 81/1263
prad: 56/1263
prad: 34/1263
prad: 82/1263
prad: 35/1263
prad: 57/1263
prad: 107/1263
prad: 83/1263
prad: 108/1263
prad: 37/1263
prad: 84/1263
prad: 109/1263
prad: 85/1263
prad: 110/1263
prad: 38/1263
prad: 86/1263
prad: 87/1263
prad: 137/1263
prad: 39/1263
prad: 58/1263
prad: 59/1263
prad: 138/1263
prad: 111/1263
prad: 40/1263
prad: 13

prad: 526/1263
prad: 869/1263
prad: 760/1263
prad: 870/1263
prad: 527/1263
prad: 871/1263
prad: 761/1263
prad: 762/1263
prad: 872/1263
prad: 763/1263
prad: 874/1263
prad: 528/1263
prad: 764/1263
prad: 875/1263
prad: 529/1263
prad: 634/1263
prad: 765/1263
prad: 877/1263
prad: 530/1263
prad: 876/1263
prad: 766/1263
prad: 531/1263
prad: 767/1263
prad: 878/1263
prad: 768/1263
prad: 635/1263
prad: 532/1263
prad: 769/1263
prad: 879/1263
prad: 636/1263
prad: 770/1263
prad: 880/1263
prad: 533/1263
prad: 771/1263
prad: 637/1263
prad: 993/1263
prad: 772/1263
prad: 534/1263
prad: 773/1263
prad: 638/1263
prad: 639/1263
prad: 535/1263
prad: 640/1263
prad: 641/1263
prad: 994/1263
prad: 536/1263
prad: 995/1263
prad: 642/1263
prad: 643/1263
prad: 996/1263
prad: 881/1263
prad: 997/1263
prad: 537/1263
prad: 774/1263
prad: 882/1263
prad: 538/1263
prad: 883/1263
prad: 998/1263
prad: 539/1263
prad: 999/1263
prad: 1117/1263
prad: 1120/1263
prad: 1000/1263
prad: 540/1263
prad: 775/1263
prad: 541/1263
prad: 1

prad: 1079/1263
prad: 721/1263
prad: 1181/1263
prad: 618/1263
prad: 1080/1263
prad: 1182/1263
prad: 1081/1263
prad: 722/1263
prad: 974/1263
prad: 619/1263
prad: 723/1263
prad: 1183/1263
prad: 841/1263
prad: 1082/1263
prad: 1184/1263
prad: 620/1263
prad: 724/1263
prad: 975/1263
prad: 842/1263
prad: 1083/1263
prad: 725/1263
prad: 1084/1263
prad: 726/1263
prad: 976/1263
prad: 1185/1263
prad: 1085/1263
prad: 727/1263
prad: 843/1263
prad: 1186/1263
prad: 977/1263
prad: 728/1263
prad: 844/1263
prad: 1086/1263
prad: 1187/1263
prad: 845/1263
prad: 978/1263
prad: 729/1263
prad: 1087/1263
prad: 1188/1263
prad: 979/1263
prad: 730/1263
prad: 1088/1263
prad: 1189/1263
prad: 731/1263
prad: 1089/1263
prad: 1190/1263
prad: 732/1263
prad: 980/1263
prad: 1191/1263
prad: 1090/1263
prad: 846/1263
prad: 733/1263
prad: 1192/1263
prad: 981/1263
prad: 1091/1263
prad: 734/1263
prad: 1193/1263
prad: 847/1263
prad: 735/1263
prad: 1194/1263
prad: 1092/1263
prad: 982/1263
prad: 848/1263
prad: 736/1263
prad: 1195/1

NameError: name 'sys' is not defined

In [41]:
# i, image in enumerate(df_slice['url'])
print 
# import urllib
# print img_url, img_filename
#urllib.request.urlretrieve(df_slice['url'][3],"/home/dagutman/heloo.png")

# myUrl = df_slice['url'][3]
# chunk_size = 512

# myFilePath = '/home/dagutman/heloo.png'
# import urllib3
# http = urllib3.PoolManager()
# r = http.request('GET', myUrl, preload_content=False)

# with open(myFilePath, 'wb') as out:
#     while True:
#         data = r.read(chunk_size)
#         if not data:
#             break
#         out.write(data)

# r.release_conn()


url = df_slice['url'][4]

filename ='/home/dagutman/helo123.jpeg'
connection_pool = urllib3.PoolManager()
resp = connection_pool.request('GET',url )
f = open(filename, 'wb')
f.write(resp.data)
f.close()
resp.release_conn()




### Download Tiles

Make adjustments to urls to allow tiles to be requested:

In [35]:
dz_pdf = pdf
dz_pdf['url'] = dz_pdf['url'].str.replace('FIF=', 'DeepZoom=')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


The following is the code used to download tile data. Small adjustments were made to this code to allow multiple instances of python, and to allow simultaneous download on a Google Compute Engine VM -- modifications that allowed parallelization and significant speed-ups in data acquisition:

In [94]:
img_path = '/Users/aadi/Documents/tcga_imgs/tiles/'


def process_download_tile(url_idx, df_slice, category, save_dir, x, y):
    img_filename = save_dir + df_slice['name'].iloc[url_idx] + '_' + str(x) + '_' + str(y) + ".jpg"
    img_url = df_slice['url'].iloc[url_idx]
    img_url = img_url[:-17]
    img_url = img_url+"_files/15/"+str(x)+'_'+str(y)+'.jpg'
    #print(img_url)
    try:
        urllib.request.urlretrieve(img_url, img_filename)
    except:
        pass
    #print(category+': '+str(url_idx+1)+'/'+str(len(df_slice['url'])+1)+' -- '+str(x)+','+str(y))


In [96]:
for category in folder_names:
    category_dir = img_path+category
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)
    df_slice = dz_pdf[dz_pdf['collection'].str.contains(category)]
    for url_idx in range(0, len(df_slice['url'])):
        print(category+': '+str(url_idx))
        save_dir = '/Users/aadi/Documents/tcga_imgs/tiles/'+category+'/'+df_slice['name'].iloc[url_idx]+'/'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
#        for x in range(0, 100):
        joblib.Parallel(n_jobs=16)(joblib.delayed(process_download_tile)(url_idx, df_slice, category, save_dir, x, y) for y in range(0,30) for x in range(0,100))


brca: 1000
brca: 1001
brca: 1002
brca: 1003
brca: 1004
brca: 1005
brca: 1006
brca: 1007
brca: 1008
brca: 1009
brca: 1010
brca: 1011
brca: 1012
brca: 1013
brca: 1014
brca: 1015
brca: 1016
brca: 1017
brca: 1018
brca: 1019
brca: 1020
brca: 1021
brca: 1022
brca: 1023
brca: 1024
brca: 1025
brca: 1026
brca: 1027
brca: 1028
brca: 1029
brca: 1030
brca: 1031
brca: 1032
brca: 1033
brca: 1034
brca: 1035
brca: 1036
brca: 1037
brca: 1038
brca: 1039
brca: 1040
brca: 1041
brca: 1042
brca: 1043
brca: 1044
brca: 1045
brca: 1046
brca: 1047
brca: 1048
brca: 1049


KeyboardInterrupt: 