# Pulls image data from covid-chestxray-dataset
Pulls image data from https://github.com/ieee8023/covid-chestxray-dataset.git and outputs a folder containing the images, sorted into one sub-folder per finding, and a CSV file containing the image metadata.

In [None]:
import os

# Build-time settings, published through the process environment so that the
# shell commands in the image-build cell can expand $repository and $version.
#
# Optional switches (set them here or in the calling environment to enable):
#   os.environ['create_image'] = 'True'          -> write the Dockerfile, build and push the image
#   os.environ['install_requirements'] = 'True'  -> pip-install the dependencies instead
os.environ.update({
    'repository': 'romeokienzler',
    'version': '0.2',
})

In [None]:
if bool(os.environ.get('create_image',False)):
    docker_file="""
    FROM registry.access.redhat.com/ubi8/python-39
    RUN pip install ipython nbformat install gitpython~=3.1 pandas==1.2.1
    RUN mkdir component-library
    RUN mkdir component-library/input
    ADD input-covid-chestxray.ipynb /component-library/input/
    ENTRYPOINT ["ipython","/component-library/input/input-covid-chestxray.ipynb","> /tmp/component.log","2> /tmp/component.err"]
    """
    with open("Dockerfile", "w") as text_file:
        text_file.write(docker_file)

    !docker build -t claimed-input-covid-chestxray:`echo $version` .
    !docker tag claimed-input-covid-chestxray:`echo $version` `echo $repository`/claimed-input-covid-chestxray:`echo $version`
    !docker push `echo $repository`/claimed-input-covid-chestxray:`echo $version`
elif bool(os.environ.get('install_requirements',False)):
    !pip install install gitpython~=3.1 pandas==1.2.1

In [None]:
import sys
import git
import os
import shutil
import logging
import pandas as pd
from shutil import copyfile

In [None]:
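# @param image_foldername name of the folder the images are written to (default: covid-chestxray-images)
# @param metadata_filename name of the metadata CSV file that is written (default: metadata.csv)
# @param data_dir directory prefix under which both outputs are written (default: .)
# @param skip_if_exists stop early when the image folder already exists
# @returns image folder containing one sub-folder per finding
# @returns metadata.csv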

In [None]:
# Component parameters, read from environment variables with their defaults.
image_foldername = os.environ.get('image_foldername', 'covid-chestxray-images')
metadata_filename = os.environ.get('metadata_filename', 'metadata.csv')
data_dir = os.environ.get('data_dir', '.')
# bool() of any non-empty string -- including the default 'False' -- is True,
# so the flag is parsed by comparing against accepted "true" spellings instead.
skip_if_exists = os.environ.get('skip_if_exists', 'False').strip().lower() in ('1', 'true', 'yes', 'y', 'on')

In [None]:
# Build the path with os.path.join: plain concatenation of the default
# data_dir '.' and the folder name produced '.covid-chestxray-images'
# (a hidden folder) instead of './covid-chestxray-images'.
output_folder = os.path.join(data_dir, image_foldername)
if skip_if_exists and os.path.exists(output_folder):
    # Output is already present: end the run early with a success status.
    sys.exit(0)

In [None]:
#data_dir = '../../data/'

In [None]:
!git clone https://github.com/ieee8023/covid-chestxray-dataset.git

In [None]:
# Load the metadata table shipped inside the cloned dataset repository.
metadata_csv_path = 'covid-chestxray-dataset/metadata.csv'
metadata = pd.read_csv(metadata_csv_path)

In [None]:
# Rows without a finding or a filename cannot be placed in a class folder, and
# missing values would break the string operations below, so drop them first.
# '/' inside a finding would act as a path separator once the finding is used
# as a folder name, hence it is replaced by '_'.
metadata = (
    metadata
    .dropna(subset=['finding', 'filename'])
    .assign(finding=lambda frame: frame['finding'].str.replace('/', '_', regex=False))
)

# Exclude placeholder findings and '.gz' files. regex=False makes every pattern
# a literal substring; as a regular expression the '.' in '.gz' matched any
# character, not just a dot.
excluded_rows = (
    metadata['finding'].str.contains('todo', regex=False)
    | metadata['finding'].str.contains('Unknown', regex=False)
    | metadata['filename'].str.contains('.gz', regex=False)
)
metadata = metadata[~excluded_rows]

In [None]:
# Create the output root together with any missing parent directories, and do
# not fail when it already exists (e.g. when the notebook is re-run).
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Distinct findings; each one becomes the name of a class sub-folder.
finding_column = metadata['finding']
folders = finding_column.unique()

In [None]:
# One sub-folder per distinct finding; exist_ok keeps a re-run from failing on
# folders that were already created.
for folder in folders:
    os.makedirs(os.path.join(output_folder, folder), exist_ok=True)

In [None]:
# Copy every remaining image into the sub-folder named after its finding.
image_source_dir = 'covid-chestxray-dataset/images'
for image_name, finding in zip(metadata['filename'], metadata['finding']):
    source_path = '/'.join([image_source_dir, image_name])
    target_path = '/'.join([output_folder, finding, image_name])
    shutil.copyfile(source_path, target_path)

In [None]:
# Write the filtered metadata under data_dir. os.path.join supplies the path
# separator that plain concatenation left out (the default data_dir '.'
# produced the hidden file '.metadata.csv').
metadata.to_csv(os.path.join(data_dir, metadata_filename), index=False)

In [None]:
# Remove the cloned repository now that the needed files have been copied.
# shutil.rmtree does not depend on an `rm` binary being available; with
# ignore_errors=True a missing directory is not an error, like `rm -f`.
shutil.rmtree('covid-chestxray-dataset', ignore_errors=True)