**Title**: Upload COVID Chest X-Ray Dataset.   
**Description**:    
* https://github.com/ieee8023/covid-chestxray-dataset
* https://arxiv.org/abs/2004.12823
* https://arxiv.org/abs/2004.05405

# Data Use Agreement
Before downloading this data, or any data, make sure you understand the restrictions on the use of data.

In [None]:
# Load (or reload) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import all modules before each cell is executed.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Install a few dependencies

In [None]:
# pandas and flywheel-sdk back the `pandas` / `flywheel` imports in the next cell.
# NOTE(review): pydicom is installed here but not imported in the cells shown — confirm it is needed.
!pip install pandas pydicom flywheel-sdk

In [None]:
import pandas as pd
import flywheel
import os
import csv
import time
import datetime
import copy

# Download covid-chestxray dataset
The following will download the entire github repository to the local drive.

In [None]:
!git clone git@github.com:ieee8023/covid-chestxray-dataset.git

# Initialize Variables

In [None]:
# Base directory of the downloaded data; joined with relative paths such as
# 'covid-chestxray-dataset/README.md' when dataset files are read.
ROOT_CHESTXRAY_DATA = '/path/to/downloaded/repository/'
# Session label used by create_session_label when a row has no usable offset.
DEFAULT_SESSION_LABEL = 'offset_NA'
# Default acquisition label.
# NOTE(review): not referenced by the cells shown — confirm it is still needed.
DEFAULT_ACQ_LABEL = 'Chest XR'

# Initialize client
`COVID19_FW_KEY` is the Flywheel API key for the Flywheel instance you want to upload this dataset to. You can find or create your own API key using the instructions found [here](https://docs.flywheel.io/hc/en-us/articles/360015135654-User-Profile). Either set the key as the `COVID19_FW_KEY` environment variable in your operating system (which the cell below reads), or assign it directly in the notebook.

In [None]:
# Pull the API key from the COVID19_FW_KEY environment variable (None when unset)
# and build the Flywheel client from it.
COVID19_FW_KEY = os.environ.get('COVID19_FW_KEY')
fw_client = flywheel.Client(COVID19_FW_KEY)

# Containers helpers
Documented container helper functions to find existing or create new containers.

In [None]:
def find_or_create_group(fw_client, id, label):
    """
    Look up a Flywheel group by label, creating it when no match exists.

    Args:
        fw_client (flywheel.Client): Client used to search for and add groups.
        id (str): Id given to the group if it has to be created.
        label (str): Label to search for, and to assign on creation.

    Returns:
        flywheel.Group: The reloaded group, or None when "label" is empty.
    """
    if not label:
        return None

    existing = fw_client.groups.find_first(f"label={label}")
    if existing:
        return existing.reload()

    # No group carries this label: add a new one and fetch it back by id.
    new_group_id = fw_client.add_group(flywheel.Group(id=id, label=label))
    created = fw_client.get_group(new_group_id)
    return created.reload() if created else created


def find_or_create_project(label, group):
    """
    Find or create Flywheel project with "label" under "group".

    Args:
        label (str): The label of the project to find or create.
        group (flywheel.Group): The Flywheel group object to create this project under.

    Returns:
        flywheel.Project: The found or created Flywheel Project object, or None
            when "label" is empty or "group" is None.
    """
    # find_or_create_group returns None for an empty label; do not dereference it.
    if not label or group is None:
        return None

    project = group.projects.find_first(f"label={label}")

    if not project:
        # No project with this label exists yet, so create it under the group.
        project = group.add_project(label=label)

    if project:
        project = project.reload()

    return project


def find_or_create_subject(label, sex, project):
    """
    Look up a subject by label in "project", adding it when it is missing.

    "sex" is only applied to a newly created subject; an existing subject
    is returned without modification.

    Args:
        label (str): Label (also used as the code) of the subject.
        sex (str): Sex to set on a newly created subject. May be empty or `None`.
        project (flywheel.Project): Project that owns the subject.

    Returns:
        flywheel.Subject: The reloaded subject, or None when "label" is empty.
    """
    if not label:
        return None

    existing = project.subjects.find_first(f"label={label}")
    if existing:
        return existing.reload()

    # Not found: create the subject and record the sex when one was given.
    created = project.add_subject(code=label, label=label)
    if sex:
        created.update(sex=sex)
    return created.reload() if created else created


def find_or_create_session(label, age, subject):
    """
    Look up a session by label under "subject", adding it when it is missing.

    "age" is only applied to a newly created session, and only when it is
    truthy; an existing session is returned without modification.

    Args:
        label (str): Label of the session.
        age (int): Age of the subject at this session, set on creation.
        subject (flywheel.Subject): Subject that owns the session.

    Returns:
        flywheel.Session: The reloaded session, or None when "label" is empty.
    """
    if not label:
        return None

    existing = subject.sessions.find_first(f"label={label}")
    if existing:
        return existing.reload()

    # Not found: create the session and record the age when one was given.
    created = subject.add_session(label=label)
    if age:
        created.update(age=age)
    return created.reload() if created else created


def find_or_create_acquisition(label, info_dict, fp, session, update_info=True):
    """
    Find or create an acquisition with "label" under "session".

    A newly created acquisition receives "info_dict" as its info. If "fp" is an
    existing local file that the acquisition does not hold yet, it is uploaded.
    When `update_info` is True and the file is present on the acquisition, the
    file's type/modality and info are updated.

    Args:
        label (str): The label to display for this acquisition.
        info_dict (dict): Info to store on a new acquisition and on the file.
        fp (str): Full path to a file to upload into the found/created acquisition.
        session (flywheel.Session): Session to find or create acquisition under.
        update_info (bool, optional): Flag to update the metadata of the file.
            Defaults to True.

    Returns:
        flywheel.Acquisition: The found or created Flywheel Acquisition object,
            or None when "label" is empty.
    """
    if not label:
        return None

    acq = session.acquisitions.find_first(f"label={label}")

    if not acq:
        acq = session.add_acquisition(label=label)
        if info_dict:
            acq.update(info=info_dict)

    # Resolve the file name once so it is defined on every path below.
    basename = os.path.basename(fp) if fp else ''

    if basename and os.path.isfile(fp) and not acq.get_file(basename):
        print(f"Uploading {fp} to acquisition {acq.id}")
        acq.upload_file(fp)

        # Poll until the uploaded file is listed on the acquisition.
        # NOTE(review): this wait has no upper bound — consider a timeout.
        while not acq.get_file(basename):
            time.sleep(1)
            acq = acq.reload()

    if update_info and basename:
        f = acq.get_file(basename)
        # The file is absent when "fp" does not exist locally and was never
        # uploaded; skip the update instead of failing on None.
        if f:
            f.update({"type": "dicom", "modality": "X-ray"})
            if info_dict:
                f.update_info(info_dict)

    return acq.reload()


# Find or Create Group and Project:
Create a group with id "public_data" and label "public_data".

Create a project with label 'covid-chestxray-dataset'.

Replace with the id and labels of the group and project you want to create.

If group and project are already created, the group and project with specified labels will be returned.

In [None]:
# Initialize the group
public_data_group = find_or_create_group(fw_client, 'public_data', 'public_data')

# Initialize the project
project_label = 'covid-chestxray-dataset'
chestxray_project = find_or_create_project(project_label, public_data_group)

# Parse the project description from the README markdown.
# Decode explicitly as UTF-8 so the read does not depend on the platform default.
readme_path = os.path.join(ROOT_CHESTXRAY_DATA, 'covid-chestxray-dataset/README.md')
with open(readme_path, 'r', encoding='utf-8') as fdata:
    readme = fdata.read()

# Rewrite repository-relative markdown links to absolute links at a pinned commit.
for local_file in ['(CONTRIBUTING.md)', '(metadata.csv)', '(images)']:
    readme = readme.replace(
        local_file,
        '(https://github.com/ieee8023/covid-chestxray-dataset/blob/'
        f'47685d20a1b77f0664ac8b7740ef6d4646be962d/{local_file[1:-1]})'
    )

if chestxray_project:
    chestxray_project.update(description=readme)

# Prepare Dataframe 
Read the CSV, clean the dataframe, and produce a list of row dictionaries.

### Formatting and Cleaning Helper Functions

In [None]:
def convert_time_to_seconds(time_span, scale):
    """
    Convert a time span expressed in "scale" units to seconds.

    "time_span" is truncated to a whole number of units before conversion.
    On failure (missing, non-numeric or non-finite "time_span", or an
    unknown "scale"), returns 0 seconds.

    Args:
        time_span (str): The length of time specified by units "scale".
        scale (str): The units of the length of time specified in "time_span".
            Valid Entries: 'Y', 'M', 'W', 'D'
    Returns:
        float: Total seconds in time_span, or 0 on failure.
    """

    # Number of days represented by one unit of each scale.
    days_per_unit = {
        'Y': 365.25,
        'M': 30,
        'W': 7,
        'D': 1,
    }
    try:
        days = int(time_span) * days_per_unit[scale]
        seconds = datetime.timedelta(days=days).total_seconds()
    except (KeyError, TypeError, ValueError, OverflowError):
        # KeyError: unknown scale; TypeError: None/unsupported input;
        # ValueError: NaN or non-numeric text; OverflowError: infinite/huge span.
        seconds = 0
    return seconds


def format_sex_string(sex_str):
    """
    Map the single-letter sex codes 'M' and 'F' to 'male' and 'female'.

    Args:
        sex_str (str): Sex code; only 'M' and 'F' are recognised.

    Returns:
        str: 'male', 'female', or '' for any other input.
    """
    if sex_str == 'M':
        return 'male'
    return 'female' if sex_str == 'F' else ''


def create_session_label(offset):
    """
    Format session label

    A missing offset (None, empty string or NaN) yields DEFAULT_SESSION_LABEL.
    Any other offset, including 0, is truncated to an integer and zero-padded
    to three characters.

    Args:
        offset (str): Number of days since the start of symptoms or hospitalization
            See SCHEMA.md.

    Returns:
        str: Label of session
    """
    # Test for "missing" explicitly: a plain truthiness check would also
    # discard a legitimate offset of 0 days.
    if offset is None or offset == '' or pd.isna(offset):
        return DEFAULT_SESSION_LABEL
    return f'offset_{str(int(offset)).zfill(3)}'


def cleanup_row_dict(row_dict):
    """
    Cleanup session age, clinical notes, other notes, and empty values.

    The input dictionary is not modified; a cleaned copy is returned.

    Args:
        row_dict (dict): Raw dictionary representation of dataframe row.

    Returns:
        dict: Cleaned version of row_dict, without keys whose value is '' or None.
    """
    # Work on a copy so the caller's dictionary is left untouched.
    cleaned = copy.deepcopy(row_dict)

    # fix session age
    cleaned['session_age'] = int(cleaned['session_age'])

    # fix notes: when 'Unnamed: 16' holds a value, fold 'other notes' into
    # 'clinical notes' (space separated) and move the 'Unnamed: 16' value
    # into 'other notes'.
    if cleaned.get('Unnamed: 16'):
        cleaned['clinical notes'] = ' '.join(
            [cleaned['clinical notes'], cleaned['other notes']]
        )
        cleaned['other notes'] = cleaned['Unnamed: 16']
        cleaned['Unnamed: 16'] = ''

    # Remove empty values
    return {
        key: value for key, value in cleaned.items() if value not in ('', None)
    }


### Apply Helper Functions to Dataframe

In [None]:
# Load dataframe from downloaded csv file.
# metadata.csv sits in the cloned repository folder, alongside the README
# that is read from 'covid-chestxray-dataset/README.md'.
df = pd.read_csv(
    os.path.join(ROOT_CHESTXRAY_DATA, 'covid-chestxray-dataset/metadata.csv')
)

# format subject label
df['subject_label'] = df['Patientid'].apply(lambda x: f'sub_{str(x).zfill(4)}')

# Apply age conversion: age (years) plus offset (days), stored as whole seconds.
# The integer cast covers the whole sum, not just the offset term.
df['session_age'] = (
    df['age'].apply(convert_time_to_seconds, scale='Y') +
    df['offset'].apply(convert_time_to_seconds, scale='D')
).astype('int64')

# Format subject sex
df['subject_sex'] = df['sex'].apply(format_sex_string)

# Apply to session labels with default
df['session_label'] = df['offset'].apply(create_session_label)

# format acquisition label (file name without its extension)
df['acquisition_label'] = df['filename'].apply(lambda x: x.rsplit('.', maxsplit=1)[0])

# throw out nans
df.fillna('', inplace=True)

# Produced cleaned row dictionary list
row_dict_list = [cleanup_row_dict(row_dict) for row_dict in df.to_dict(orient='records')]

# Create containers


In [None]:

# iterate through rows of dataframe, creating subject -> session -> acquisition
for row_dict in row_dict_list:
    subject_label = row_dict.get('subject_label')
    subject_sex = row_dict.get('subject_sex')
    subject = find_or_create_subject(subject_label, subject_sex, chestxray_project)
    if not subject:
        continue

    session_label = row_dict.get('session_label')
    age_at_session = row_dict.get('session_age')
    session = find_or_create_session(session_label, age_at_session, subject)
    if not session:
        continue

    filename = row_dict.get('filename')
    if not filename:
        # cleanup_row_dict drops empty values, so the key may be absent;
        # without a file name there is nothing to upload.
        continue

    # Resolve the image under the same root the README is read from,
    # rather than relative to the current working directory.
    filepath = os.path.join(
        ROOT_CHESTXRAY_DATA, 'covid-chestxray-dataset/images', filename
    )
    acq_label = row_dict.get('acquisition_label')
    acq = find_or_create_acquisition(acq_label, row_dict, filepath, session)

In [None]:
# Show the distinct subject labels present in the dataframe
df['subject_label'].unique()