Download data from ISIC Archive

https://www.isic-archive.com/#!/topWithHeader/onlyHeaderTop/apiDocumentation

## Load the CSV files

In [None]:
import pandas as pd
import os

# Ground-truth CSV from ISIC; replace with the path to your own copy.
# NOTE(review): this path uses '../Data/2017/' while the one below uses
# '../Data/ISIC2017/' -- confirm both directories are intended.
df_train = pd.read_csv(
    filepath_or_buffer='../Data/2017/ISIC-2017_Training_Part3_GroundTruth.csv',
)
# Ground-truth CSV that also lists the additional (extra) images shared in
# the group; replace with the path to your own copy.
df_add = pd.read_csv(
    filepath_or_buffer='../Data/ISIC2017/ISIC-2017_Training_Add_Part3_GroundTruth.csv',
)

In [None]:
# Extra data only: rows of df_add whose image_id is not in df_train
# df_extra = df_add[~ df_add['image_id'].isin(df_train['image_id'])]
# print(df_extra)
# df_extra.to_csv('../Data/ISIC2017/ISIC-2017_Training_Extra_Part3_GroundTruth.csv', index=False)

## ISIC API

In [None]:
import requests


class ISICApi(object):
    """Small client for the ISIC Archive REST API rooted at ``<hostname>/api/v1``.

    Requests are anonymous unless a ``username`` is given, in which case the
    token obtained at login is sent in a ``Girder-Token`` header on every call.
    """

    def __init__(self, hostname='https://isic-archive.com',
                 username=None, password=None):
        """Set the API base URL and optionally log in.

        Args:
            hostname: Scheme and host of the archive, without a trailing slash.
            username: Account to authenticate as; ``None`` skips the login.
            password: Password for ``username``; prompted for interactively
                when omitted.

        Raises:
            Exception: If the login request is rejected.
        """
        self.baseUrl = f'{hostname}/api/v1'
        self.authToken = None

        if username is not None:
            if password is None:
                # getpass does not echo the typed password, unlike input().
                import getpass
                password = getpass.getpass(f'Password for user "{username}":')
            self.authToken = self._login(username, password)

    def _makeUrl(self, endpoint):
        """Return the absolute URL for ``endpoint`` (given without a leading slash)."""
        return f'{self.baseUrl}/{endpoint}'

    def _login(self, username, password):
        """Authenticate with HTTP basic auth and return the session token.

        Raises:
            Exception: If the server does not answer with a success status.
        """
        authResponse = requests.get(
            self._makeUrl('user/authentication'),
            auth=(username, password)
        )
        if not authResponse.ok:
            # The error body is not guaranteed to be JSON with a "message"
            # key; fall back to the raw text so the real failure is reported
            # instead of a decoding error.
            try:
                reason = authResponse.json()['message']
            except (ValueError, KeyError, TypeError):
                reason = authResponse.text
            raise Exception(f'Login error: {reason}')

        return authResponse.json()['authToken']['token']

    def get(self, endpoint):
        """GET ``endpoint`` and return the raw ``requests`` response.

        The status code is not checked here; callers decide how to handle
        failures.
        """
        url = self._makeUrl(endpoint)
        headers = {'Girder-Token': self.authToken} if self.authToken else None
        return requests.get(url, headers=headers)

    def getJson(self, endpoint):
        """GET ``endpoint`` and return its decoded JSON body."""
        return self.get(endpoint).json()

    def getJsonList(self, endpoint):
        """Yield every element of a paginated list endpoint.

        Pages of 50 elements are requested with ``limit``/``offset`` query
        parameters until an empty page is returned.

        Raises:
            requests.HTTPError: If a page request returns an error status.
            ValueError: If a page body is not a JSON list.
        """
        endpoint += '&' if '?' in endpoint else '?'
        LIMIT = 50
        offset = 0
        while True:
            response = self.get(
                f'{endpoint}limit={LIMIT:d}&offset={offset:d}'
            )
            # Without these checks an error body (e.g. a dict carrying a
            # "message", as _login reads) is non-empty, so its keys would be
            # yielded and the same failing page requested forever.
            response.raise_for_status()
            page = response.json()
            if not isinstance(page, list):
                raise ValueError(
                    f'Expected a JSON list from "{endpoint}", '
                    f'got {type(page).__name__}'
                )
            if not page:
                break
            yield from page
            offset += LIMIT


In [None]:
import pandas as pd
import os

# Initialize the API; no login is necessary for public data
api = ISICApi()
# Directory the downloaded images are saved to
savePath = '../Data/ISIC2017/Extra'
# savePath = 'ISICArchive/'

# exist_ok=True creates the directory (and missing parents) when needed and
# does nothing when it already exists, without a separate existence check.
os.makedirs(savePath, exist_ok=True)

# Alternative: compute the extra rows in memory instead of reading the CSV
# df_extra = df_add[~ df_add['image_id'].isin(df_train['image_id'])]
# NOTE(review): the commented-out to_csv call earlier in this notebook writes
# this file under '../Data/ISIC2017/', but it is read from '../Data/' here --
# confirm which location is correct.
df_extra = pd.read_csv("../Data/ISIC-2017_Training_Extra_Part3_GroundTruth.csv")

In [None]:
# Download every image named in df_extra's image_id column into savePath,
# one file per image, saved as <name>.jpg.
print(f'Downloading {len(df_extra)} images')
imageDetails = []  # not populated by this loop
for imageName in df_extra['image_id']:
    # Look the image up by name; the result is expected to be a list holding
    # at most one record (limit=1).
    matches = api.getJson(
        f'image?limit=1&offset=0&sort=name&name={imageName}'
    )
    if not isinstance(matches, list) or not matches:
        # Indexing an empty or non-list result used to abort the whole run on
        # the first unknown name; report it and carry on with the rest.
        print(f'Skipping {imageName}: no matching image returned ({matches!r})')
        continue
    imageInfo = matches[0]
    print(f"Downloading {imageInfo['name']}, id = {imageInfo['_id']}")
    imageFileResp = api.get(f"image/{imageInfo['_id']}/download")
    imageFileResp.raise_for_status()
    imageFileOutputPath = os.path.join(savePath, f"{imageInfo['name']}.jpg")
    with open(imageFileOutputPath, 'wb') as imageFileOutputStream:
        # api.get() does not stream, so the body is already in memory; write
        # it in one call instead of iterating the response in small chunks.
        imageFileOutputStream.write(imageFileResp.content)