**Title**: Upload COVID Chest X-Ray Dataset.   
**Description**:    
* https://github.com/ieee8023/covid-chestxray-dataset
* https://arxiv.org/abs/2004.12823
* https://arxiv.org/abs/2004.05405

# Data Use Agreement
Before downloading this data, or any data, make sure you understand the restrictions on the use of data.

# Requirements:
- **Python** (preferably >= 3.6).

- Have administrator permissions to create Flywheel Groups and Projects.

In [None]:
# Load (or reload) the autoreload extension and use mode 2, which re-imports
# all modules before each cell runs, so edits to imported modules are picked
# up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Install and import dependencies

In [None]:
!pip install pandas getpass pydicom flywheel-sdk

In [None]:
import copy
import csv
import datetime
import logging
import os
import time
from getpass import getpass

import flywheel
import pandas as pd

In [None]:
# Set up logging for the notebook: timestamped INFO-level messages, emitted
# through the logger named 'root'.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)
log = logging.getLogger('root')

# Download covid-chestxray dataset
The following will download the entire GitHub repository to the local drive.

In [None]:
!git clone git@github.com:ieee8023/covid-chestxray-dataset.git

# Initialize Constants

In [None]:
# Absolute path to the cloned dataset repository. `git clone` places it in the
# notebook's working directory; edit this value if it was cloned elsewhere.
ROOT_CHESTXRAY_DATA = os.path.abspath('covid-chestxray-dataset')
# Default passed to create_session_label when building session labels.
DEFAULT_SESSION_LABEL = 'offset_NA'
# Default acquisition label.
DEFAULT_ACQ_LABEL = 'Chest XR'

# Flywheel API Key and Client
Get an API_KEY. More on this in the Flywheel SDK doc [here](https://flywheel-io.gitlab.io/product/backend/sdk/branches/master/python/getting_started.html#api-key).

In [None]:
# Prompt for the Flywheel API key; getpass reads it without echoing the input,
# so the key does not end up in the notebook output.
API_KEY = getpass(prompt='Enter API_KEY here: ')

Instantiate the Flywheel API client

In [None]:
# Prefer the key entered in the prompt above. If that cell was skipped, or the
# prompt was left blank, fall back to the FW_KEY environment variable (the
# previous `'API_KEY' in locals()` test never fell back for a blank key).
fw_client = flywheel.Client(globals().get('API_KEY') or os.environ.get('FW_KEY'))

Show Flywheel logging information

In [None]:
# Confirm the login by logging the current user's email and the site API URL.
current_user_email = fw_client.get_current_user()['email']
site_api_url = fw_client.get_config()['site']['api_url']
log.info('You are now logged in as %s to %s', current_user_email, site_api_url)

# Container helpers
Import container helper functions to find existing or create new containers.

In [None]:
from container_helpers import (
    find_or_create_group, 
    find_or_create_project, 
    find_or_create_subject, 
    find_or_create_session, 
    find_or_create_acquisition,
)


# Find or Create Group and Project:
Create a group with id "public_data" and label "public_data".

Create a project with label 'covid-chestxray-dataset'.

Replace with the id and labels of the group and project you want to create.

If group and project are already created, the group and project with specified labels will be returned.

In [None]:
# Find (or create) the destination group and project
project_label = 'covid-chestxray-dataset'
public_data_group = find_or_create_group(fw_client, 'public_data', 'public_data')
chestxray_project = find_or_create_project(project_label, public_data_group)

# The repository README is used as the project description
readme_path = os.path.join(ROOT_CHESTXRAY_DATA, 'README.md')
with open(readme_path, 'r', encoding='utf-8') as fdata:
    readme = fdata.read()

# Rewrite the README's repository-relative link targets as absolute GitHub
# URLs pinned to a specific commit
github_blob_root = (
    'https://github.com/ieee8023/covid-chestxray-dataset/blob/'
    '47685d20a1b77f0664ac8b7740ef6d4646be962d'
)
for repo_path in ('CONTRIBUTING.md', 'metadata.csv', 'images'):
    readme = readme.replace(f'({repo_path})', f'({github_blob_root}/{repo_path})')

# Only update the description when a project was actually returned
if chestxray_project:
    chestxray_project.update(description=readme)

# Prepare Dataframe 
Read the CSV, clean the dataframe, and produce a list of row dictionaries.

### Formatting and Cleaning Helper Functions

In [None]:
from dataframe_helpers import (
    convert_time_to_seconds, 
    format_sex_string, 
    create_session_label, 
    cleanup_row_dict
)


### Apply Helper Functions to Dataframe

In [None]:
# Load the dataset metadata from the cloned repository
df = pd.read_csv(os.path.join(ROOT_CHESTXRAY_DATA, 'metadata.csv'))

# Subject label: 'sub_' followed by the patient id, zero-padded to four characters
df['subject_label'] = df['patientid'].apply(lambda x: f'sub_{str(x).zfill(4)}')

# Session age: 'age' converted with scale='Y' plus 'offset' converted with
# scale='D'.
# NOTE(review): .astype('int64') binds to the offset term only, not to the
# sum -- confirm that is the intended cast.
df['session_age'] = (
    df['age'].apply(convert_time_to_seconds, scale='Y') +
    df['offset'].apply(convert_time_to_seconds, scale='D').astype('int64')
)

# Format subject sex
df['subject_sex'] = df['sex'].apply(format_sex_string)

# Session label derived from 'offset', with a default label passed to the helper
df['session_label'] = df['offset'].apply(
    create_session_label,
    default_session_label=DEFAULT_SESSION_LABEL
)

# Acquisition label: the filename with its last extension removed
df['acquisition_label'] = df['filename'].apply(lambda x: x.rsplit('.', maxsplit=1)[0])

# Replace remaining NaNs with empty strings (rows are kept, not dropped)
df.fillna('', inplace=True)

# Build the cleaned list of row dictionaries. This is done once, after every
# derived column exists; an earlier snapshot taken before the columns were
# added was dead code and has been removed.
row_dict_list = [cleanup_row_dict(row_dict) for row_dict in df.to_dict(orient='records')]

In [None]:
# Scratch cell: display the 'offset' column after conversion with scale='D'
# and a cast to int64, to inspect the values that feed into 'session_age'.
df['offset'].apply(convert_time_to_seconds, scale='D').astype('int64')


# Create containers


In [None]:

# Walk the cleaned metadata rows. For each row, find or create the
# subject > session > acquisition chain, then upload the row's image file.
# A row is skipped as soon as one of its containers is not returned.
for row_dict in row_dict_list:
    subject_label = row_dict.get('subject_label')
    log.info('Processing Subject %s.', subject_label)
    subject = find_or_create_subject(
        subject_label, chestxray_project, sex=row_dict.get('subject_sex')
    )
    if not subject:
        continue

    session_label = row_dict.get('session_label')
    log.info('Processing Session %s.', session_label)
    session = find_or_create_session(
        session_label, subject, age=row_dict.get('session_age')
    )
    if not session:
        continue

    aqc_label = row_dict.get('acquisition_label')
    log.info('Processing Acquisition %s.', aqc_label)
    # The full metadata row is stored as the acquisition's info
    acq = find_or_create_acquisition(aqc_label, session, info=row_dict)
    if not acq:
        continue

    filepath = os.path.join(ROOT_CHESTXRAY_DATA, 'images', row_dict.get('filename'))
    if not os.path.isfile(filepath):
        # Skip rows whose file is not present under images/ instead of
        # aborting the whole import part-way through.
        log.warning('File %s not found locally; skipping upload.', filepath)
        continue

    log.info('Uploading file, %s, to acquisition, %s', filepath, acq.label)
    # Upload through the SDK container method. The previous call used a helper
    # and a kwargs name that were never defined in this notebook (NameError).
    acq.upload_file(filepath)

In [None]:
# Show the distinct subject labels present in the metadata
df['subject_label'].unique()