In [1]:
import datetime as dt
import os
from io import BytesIO
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from sklearn.model_selection import train_test_split

In [2]:
# Root of the site being scraped; query paths and image paths are appended to it.
base_url = 'https://openi.nlm.nih.gov'

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;We'll scrape up to 5,500 image URLs for each of two queries, chest X-rays and microscope photos, from the Open-i site (`base_url` above), whose results include images from [MedPix](https://medpix.nlm.nih.gov/home).

In [3]:
# Each query is a two-item list: item 0 is the filter fragment placed in the
# retrieve.php query string, item 1 is the short label used for the CSV
# column and file name (and printed when a request fails).
chestxray = ['coll=iu', 'chest']
microscopy = ['it=mc', 'micro']

In [4]:
# Scrape one result page per index for each query and save the image paths
# found on those pages to other_assets/urls_<label>.csv.
start = dt.datetime.now()
print(start)

# Make sure the output folder exists *before* the long scrape, so the results
# are not lost to a FileNotFoundError when the CSV is finally written.
os.makedirs('other_assets', exist_ok=True)

for which in [chestxray, microscopy]:
    lil_list = []
    for i in range(1, 5501):
        # m and n are both set to i, i.e. one result is requested per page.
        url = base_url + '/retrieve.php?{}&m={}&n={}'.format(which[0], i, i)
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, timeout=30)
            response.raise_for_status()  # report HTTP errors instead of parsing an error page
            soup = BeautifulSoup(response.content, 'lxml')
            for child in soup.findChildren('p'):
                if 'imgLarge' in str(child):
                    # Cut the value out of the `"imgLarge": "<path>",` fragment.
                    childlet = str(child).split('\"imgLarge\": \"')[1].split('\",')[0]
                    lil_list.append(childlet)
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
            # stops the loop; the error is printed so failures can be diagnosed.
            print(which[1], url, repr(err))

    (pd.DataFrame(lil_list, columns = [which[1]]))\
    .to_csv('other_assets/urls_{}.csv'.format(which[1]), index = False)

print('done in: ', dt.datetime.now() - start)

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;The following cell loads the output of the previous cell (which can be commented out once its output has been written to file), then concatenates the two data frames side by side and keeps only the rows in which both columns are non-null (5489 paths each).

In [20]:
# Read back the two single-column URL lists written by the scraping cell.
urls_chest = pd.read_csv('other_assets/urls_chest.csv')
urls_micro = pd.read_csv('other_assets/urls_micro.csv')

# Put the lists side by side (micro first), then keep only the rows where
# both columns hold a path.
paired = pd.concat([urls_micro, urls_chest], axis = 1)
has_both = paired['micro'].notnull() & paired['chest'].notnull()
urls = paired[has_both]

print(len(urls))
urls.head()

5489


Unnamed: 0,micro,chest
0,/imgs/512/230/1032/MPX1032_synpic23886.png,/imgs/512/10/10/CXR10_IM-0002-1001.png
1,/imgs/512/230/1032/MPX1032_synpic23887.png,/imgs/512/10/10/CXR10_IM-0002-2001.png
2,/imgs/512/230/1032/MPX1032_synpic23888.png,/imgs/512/1/1/CXR1_1_IM-0001-3001.png
3,/imgs/512/227/1029/MPX1029_synpic17373.png,/imgs/512/1/1/CXR1_1_IM-0001-4001.png
4,/imgs/512/227/1029/MPX1029_synpic17374.png,/imgs/512/199/1001/CXR1001_IM-0004-1001.png


In [6]:
## hold out 10% of each class (~550 paths) as the validation set
micro_big, micro_valid = train_test_split(
    urls['micro'], test_size = 0.1, random_state = 421
)
chest_big, chest_valid = train_test_split(
    urls['chest'], test_size = 0.1, random_state = 365
)

In [7]:
## split what is left of each class 80/20 into training and testing sets
micro_train, micro_test = train_test_split(
    micro_big, test_size = 0.2, random_state = 421
)
chest_train, chest_test = train_test_split(
    chest_big, test_size = 0.2, random_state = 365
)

In [8]:
## give every split a fresh 0..n-1 index so the download loops can look paths up by position
## (in place on purpose: the loop has to modify the Series the names above refer to)
splits = (micro_train, chest_train, micro_test, chest_test, micro_valid, chest_valid)
for split in splits:
    split.reset_index(inplace = True, drop = True)
    print(len(split))
    print(split.head(2))

3952
0    /imgs/512/334/4618651/PMC4618651_prp20003-e001...
1    /imgs/512/22/4372526/PMC4372526_pone.0122753.g...
Name: micro, dtype: object
3952
0    /imgs/512/368/3576/CXR3576_IM-1757-1001.png
1      /imgs/512/314/314/CXR314_IM-1477-1001.png
Name: chest, dtype: object
988
0    /imgs/512/387/4456700/PMC4456700_pma-48-01-229...
1    /imgs/512/277/2174499/PMC2174499_1752-1947-1-1...
Name: micro, dtype: object
988
0    /imgs/512/303/1506/CXR1506_IM-0330-2001.png
1    /imgs/512/230/2636/CXR2636_IM-1121-2001.png
Name: chest, dtype: object
549
0    /imgs/512/104/2192371/PMC2192371_JCB0009027.f1...
1    /imgs/512/305/4546442/PMC4546442_oncotarget-06...
Name: micro, dtype: object
549
0    /imgs/512/248/248/CXR248_IM-1008-85149003.png
1       /imgs/512/46/1650/CXR1650_IM-0427-1001.png
Name: chest, dtype: object


In [9]:
def get_image_size(url, timeout=30):
    """Download the image at ``url`` and return its size.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the caller indefinitely.

    Returns
    -------
    tuple
        The ``size`` attribute of the opened PIL image.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    Exception
        Whatever ``PIL.Image.open`` raises when the payload is not an image.
    """
    response = requests.get(url, timeout=timeout)
    # Fail explicitly on 404 etc. rather than relying on PIL rejecting the error page.
    response.raise_for_status()
    # The context manager releases the image handle once the size is read.
    with Image.open(BytesIO(response.content)) as im:
        return im.size

In [11]:
# Download the microscopy training images to assets/training_set/micro/.
start = dt.datetime.now()
print(start)

# Create the target folder up front; without it every write below would fail
# and each image would be reported as skipped.
out_dir = 'assets/training_set/micro'
os.makedirs(out_dir, exist_ok=True)

fail_count = 0
# enumerate() walks the paths in order, so this does not depend on the Series
# index having been reset first.
for i, rel_path in enumerate(micro_train):
    href = base_url+rel_path
    path_extension = '{}/micro_{}.png'.format(out_dir, i)
    try:
        # Fetch once, then validate and save the very same bytes.
        response = requests.get(href, timeout=30)
        response.raise_for_status()  ## fails on 404 and other HTTP errors
        payload = response.content
        Image.open(BytesIO(payload)).close()  ## raises if the payload is not an image
        # The file is only opened after the download succeeded, so a failed
        # request never leaves an empty .png behind.
        with open(path_extension, 'wb') as f:
            f.write(payload)
    except Exception as err:
        # `Exception` rather than a bare except, so a kernel interrupt still stops the loop.
        print('\nSKIPPED ({}): {} ({!r})'.format(i, href, err), end = '\n')
        fail_count += 1
    if i%100 == 0:
        print('.', end = '')

print('\n\ndone: ', dt.datetime.now() - start)
print('failed: {}'.format(fail_count))

2018-02-04 17:03:46.052292
................
SKIPPED (1592): https://openi.nlm.nih.gov/imgs/512/151/4308495/PMC4308495_pnga-60-192-g001.png
..................
SKIPPED (3338): https://openi.nlm.nih.gov/imgs/512/151/4308495/PMC4308495_pnga-60-192-g003.png
......done:  0:12:07.633115
failed: 2
<function get_image_size at 0x7f95ac390d08>    3950
Name: length, dtype: int64


In [15]:
# Download the microscopy testing images to assets/testing_set/micro/.
start = dt.datetime.now()
print(start)

# Create the target folder up front; without it every write below would fail
# and each image would be reported as skipped.
out_dir = 'assets/testing_set/micro'
os.makedirs(out_dir, exist_ok=True)

fail_count = 0
# enumerate() walks the paths in order, so this does not depend on the Series
# index having been reset first.
for i, rel_path in enumerate(micro_test):
    href = base_url+rel_path
    path_extension = '{}/micro_{}.png'.format(out_dir, i)
    try:
        # Fetch once, then validate and save the very same bytes.
        response = requests.get(href, timeout=30)
        response.raise_for_status()  ## fails on 404 and other HTTP errors
        payload = response.content
        Image.open(BytesIO(payload)).close()  ## raises if the payload is not an image
        # The file is only opened after the download succeeded, so a failed
        # request never leaves an empty .png behind.
        with open(path_extension, 'wb') as f:
            f.write(payload)
    except Exception as err:
        # `Exception` rather than a bare except, so a kernel interrupt still stops the loop.
        print('\nSKIPPED ({}): {} ({!r})'.format(i, href, err), end = '\n')
        fail_count += 1
    if i%100 == 0:
        print('.', end = '')

print('\n\ndone: ', dt.datetime.now() - start)
print('failed: {}'.format(fail_count))

2018-02-04 17:33:51.169611
..........done:  0:03:00.759180
failed: 0


In [16]:
# Download the microscopy validation images to other_assets/validation_set/micro/.
start = dt.datetime.now()
print(start)

# Create the target folder up front; without it every write below would fail
# and each image would be reported as skipped.
out_dir = 'other_assets/validation_set/micro'
os.makedirs(out_dir, exist_ok=True)

fail_count = 0
# enumerate() walks the paths in order, so this does not depend on the Series
# index having been reset first.
for i, rel_path in enumerate(micro_valid):
    href = base_url+rel_path
    path_extension = '{}/micro_{}.png'.format(out_dir, i)
    try:
        # Fetch once, then validate and save the very same bytes.
        response = requests.get(href, timeout=30)
        response.raise_for_status()  ## fails on 404 and other HTTP errors
        payload = response.content
        Image.open(BytesIO(payload)).close()  ## raises if the payload is not an image
        # The file is only opened after the download succeeded, so a failed
        # request never leaves an empty .png behind.
        with open(path_extension, 'wb') as f:
            f.write(payload)
    except Exception as err:
        # `Exception` rather than a bare except, so a kernel interrupt still stops the loop.
        print('\nSKIPPED ({}): {} ({!r})'.format(i, href, err), end = '\n')
        fail_count += 1
    if i%100 == 0:
        print('.', end = '')

print('\n\ndone: ', dt.datetime.now() - start)
print('failed: {}'.format(fail_count))

2018-02-04 17:39:08.786370
......done:  0:01:40.570705
failed: 0


In [17]:
# Download the chest X-ray training images to assets/training_set/chest/.
start = dt.datetime.now()
print(start)

# Create the target folder up front; without it every write below would fail
# and each image would be reported as skipped.
out_dir = 'assets/training_set/chest'
os.makedirs(out_dir, exist_ok=True)

fail_count = 0
# enumerate() walks the paths in order, so this does not depend on the Series
# index having been reset first.
for i, rel_path in enumerate(chest_train):
    href = base_url+rel_path
    path_extension = '{}/chest_{}.png'.format(out_dir, i)
    try:
        # Fetch once, then validate and save the very same bytes.
        response = requests.get(href, timeout=30)
        response.raise_for_status()  # fails on 404 and other HTTP errors
        payload = response.content
        Image.open(BytesIO(payload)).close()  # raises if the payload is not an image
        # The file is only opened after the download succeeded, so a failed
        # request never leaves an empty .png behind.
        with open(path_extension, 'wb') as f:
            f.write(payload)
    except Exception as err:
        # `Exception` rather than a bare except, so a kernel interrupt still stops the loop.
        print('\nSKIPPED ({}): {} ({!r})'.format(i, href, err), end = '\n')
        fail_count += 1
    if i%100 == 0:
        print('.', end = '')

print('\n\ndone: ', dt.datetime.now() - start)
print('failed: {}'.format(fail_count))

2018-02-04 17:40:49.382094
........................................done:  0:11:21.057928
failed: 0


In [18]:
# Download the chest X-ray testing images to assets/testing_set/chest/.
start = dt.datetime.now()
print(start)

# Create the target folder up front; without it every write below would fail
# and each image would be reported as skipped.
out_dir = 'assets/testing_set/chest'
os.makedirs(out_dir, exist_ok=True)

fail_count = 0
# enumerate() walks the paths in order, so this does not depend on the Series
# index having been reset first.
for i, rel_path in enumerate(chest_test):
    href = base_url+rel_path
    path_extension = '{}/chest_{}.png'.format(out_dir, i)
    try:
        # Fetch once, then validate and save the very same bytes.
        response = requests.get(href, timeout=30)
        response.raise_for_status()  ## fails on 404 and other HTTP errors
        payload = response.content
        Image.open(BytesIO(payload)).close()  ## raises if the payload is not an image
        # The file is only opened after the download succeeded, so a failed
        # request never leaves an empty .png behind.
        with open(path_extension, 'wb') as f:
            f.write(payload)
    except Exception as err:
        # `Exception` rather than a bare except, so a kernel interrupt still stops the loop.
        print('\nSKIPPED ({}): {} ({!r})'.format(i, href, err), end = '\n')
        fail_count += 1
    if i%100 == 0:
        print('.', end = '')

print('\n\ndone: ', dt.datetime.now() - start)
print('failed: {}'.format(fail_count))

2018-02-04 17:52:10.467588
..........done:  0:03:02.477866
failed: 0


In [19]:
# Download the chest X-ray validation images to other_assets/validation_set/chest/.
start = dt.datetime.now()
print(start)

# Create the target folder up front; without it every write below would fail
# and each image would be reported as skipped.
out_dir = 'other_assets/validation_set/chest'
os.makedirs(out_dir, exist_ok=True)

fail_count = 0
# enumerate() walks the paths in order, so this does not depend on the Series
# index having been reset first.
for i, rel_path in enumerate(chest_valid):
    href = base_url+rel_path
    path_extension = '{}/chest_{}.png'.format(out_dir, i)
    try:
        # Fetch once, then validate and save the very same bytes.
        response = requests.get(href, timeout=30)
        response.raise_for_status()  ## fails on 404 and other HTTP errors
        payload = response.content
        Image.open(BytesIO(payload)).close()  ## raises if the payload is not an image
        # The file is only opened after the download succeeded, so a failed
        # request never leaves an empty .png behind.
        with open(path_extension, 'wb') as f:
            f.write(payload)
    except Exception as err:
        # `Exception` rather than a bare except, so a kernel interrupt still stops the loop.
        print('\nSKIPPED ({}): {} ({!r})'.format(i, href, err), end = '\n')
        fail_count += 1
    if i%100 == 0:
        print('.', end = '')

print('\n\ndone: ', dt.datetime.now() - start)
print('failed: {}'.format(fail_count))

2018-02-04 17:55:12.971227
......done:  0:01:41.410375
failed: 0
