**Title**: Upload Kaggle chest X-ray dataset.   
**Description**:    
* https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/data  

# Data Use Agreement
Before downloading this data, or any data, from Kaggle, you must agree to the rules of this competition: 

* https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/rules

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Install a few dependencies

In [None]:
!pip install pandas pydicom flywheel-sdk kaggle

# Download kaggle dataset

This requires that you have stored your Kaggle credentials in ~/.kaggle/kaggle.json. These can be acquired by creating a kaggle account at kaggle.com and using "Create New API Token" on the user account page. 

This dataset is currently 3.7 GB and may change in the future. Depending on the bandwidth of your internet connection, this may take some time to download.

In [1]:
!kaggle competitions download -c rsna-pneumonia-detection-challenge

/bin/sh: kaggle: command not found


In [72]:
import os
from pathlib import Path
import pandas as pd
import flywheel
import logging
from tqdm.notebook import tqdm
import pydicom
import re
import json
import time
import zipfile

# Initialize variables
Initialize the path to the download directory, the default session label, and the default acquisition label.

In [63]:
# Directory holding the extracted Kaggle download. The labels CSV and the
# 'stage_2_train_images' folder are read from here; replace the placeholder.
ROOT_KAGGLE_DATA = '/path/to/downloaded/dataset/'
# Label passed for the session that is found or created under each subject.
DEFAULT_SESSION_LABEL = 'NA'
# Acquisition label used when a DICOM header has no 'SeriesDescription'.
DEFAULT_ACQ_LABEL = 'Chest XR'

# Initialize client
The `COVID19_FW_KEY` variable holds the Flywheel API key for the Flywheel instance you want to upload this dataset to. You can find or create your own API key using the instructions found [here](https://docs.flywheel.io/hc/en-us/articles/360015135654-User-Profile). Either assign the API key to `COVID19_FW_KEY` directly in your notebook, or set the `FW_COVID19_KEY` environment variable in your operating system (this is the name the cell below reads).

In [64]:
# The API key is looked up in the FW_COVID19_KEY environment variable; the
# lookup yields None when that variable is not set.
COVID19_FW_KEY = os.environ.get('FW_COVID19_KEY')
# Client object used by the rest of the notebook.
fw = flywheel.Client(COVID19_FW_KEY)

# Containers helpers
Documented container helper functions to find existing or create new containers.

In [66]:
def find_or_create_group(fw_client, id, label):
    """
    Return the group labelled "label", creating it when no such group exists.

    Args:
        fw_client: Flywheel client used to look up and, if needed, add the group.
        id (str): Instance-unique, lower-case alphabetic with "_" as id for group.
            Only used when a new group has to be created.
        label (str): The label for the group. Less restricted than 'id'.

    Returns:
        flywheel.Group: The found or created group, or None if "label" is empty.
    """
    if not label:
        return None

    # An existing group wins; hand back a freshly reloaded copy of it.
    existing = fw_client.groups.find_first(f"label={label}")
    if existing:
        return existing.reload()

    # Nothing matched: add the group, then fetch it back by the returned id.
    new_group_id = fw_client.add_group(flywheel.Group(id=id, label=label))
    created = fw_client.get_group(new_group_id)
    return created.reload() if created else created


def find_or_create_project(label, group):
    """
    Find or create Flywheel project with "label" under "group".

    Args:
        label (str): The label of the project to look up or create.
        group (flywheel.Group): The Flywheel group object to create this project under.

    Returns:
        flywheel.Project: The found or created Flywheel Project object, or
            None if "label" is empty.
    """
    if not label:
        return None
    project = group.projects.find_first(f"label={label}")

    # Create the project when the lookup comes back empty, so the function
    # does what its name says and matches the other find_or_create_* helpers.
    if not project:
        project = group.add_project(label=label)

    if project:
        project = project.reload()

    return project


def find_or_create_subject(label, sex, project):
    """
    Return the subject labelled "label" in "project", adding it if absent.

    "sex" is only applied to a newly added subject; it is ignored when the
    subject already exists.

    Args:
        label (str): The label to use for the subject name (also used as its code).
        sex (str): The sex of the subject. Can be `None`.
        project (flywheel.Project): The project object to create this subject under.

    Returns:
        flywheel.Subject: The found or created Flywheel Subject object, or
            None if "label" is empty.
    """
    if not label:
        return None

    match = project.subjects.find_first(f"label={label}")
    if match:
        return match.reload()

    # No existing subject: add one, using the label for both code and label.
    created = project.add_subject(code=label, label=label)
    if sex:
        created.update(sex=sex)

    return created.reload() if created else created


def find_or_create_session(label, age, subject):
    """
    Find or create a session with "label" under "subject".

    If session is found, "age" is disregarded.

    Args:
        label (str): The label to display for this session.
        age (int): The age of the subject at the event of this session.
            Can be `None`, in which case no age is set.
        subject (flywheel.Subject): The Flywheel subject to create the session under.

    Returns:
        flywheel.session: The found or created Flywheel session, or None if
            "label" is empty.
    """
    if not label:
        return None
    session = subject.sessions.find_first(f"label={label}")

    if not session:
        session = subject.add_session(label=label)
        # Compare against None so that an age of 0 is still recorded.
        if age is not None:
            session.update(age=age)

    if session:
        session = session.reload()

    return session


def find_or_create_acquisition(label, info_dict, fp, session, update_info=True):
    """
    Find or create an acquisition with "label" under "session" and attach a file.

    The file at "fp" is uploaded only if it exists on disk and the acquisition
    does not already hold a file with the same basename. After uploading, the
    function polls once per second until the file is visible on the acquisition.

    Args:
        label (str): The label to display for this acquisition
        info_dict (dict): Dictionary to update or add to instance of acquisition.
        fp (str): Fullpath to a file to upload into the found/created acquisition.
        session (flywheel.Session): Session to find or create acquisition under.
        update_info (bool, optional): Flag to update the type, modality and info
            of the attached file. Defaults to True.

    Returns:
        flywheel.Acquisition: The found or created Flywheel Acquisition object,
            or None if "label" is empty.
    """
    if not label:
        return None
    acq = session.acquisitions.find_first(f"label={label}")

    if not acq:
        acq = session.add_acquisition(label=label)

        if info_dict:
            acq.update(info=info_dict)

    basename = os.path.basename(fp)
    if os.path.isfile(fp) and not acq.get_file(basename):
        # Announce the upload before starting it, not after it has finished.
        print(f"Uploading {fp} to acquisition {acq.id}")
        acq.upload_file(fp)

        # Wait until the uploaded file shows up on the acquisition.
        while not acq.get_file(basename):
            time.sleep(1)
            acq = acq.reload()

    if update_info:
        f = acq.get_file(basename)
        # The file is absent when "fp" did not exist and nothing was attached
        # earlier; skip the metadata update instead of failing on None.
        if f:
            f.update({"type": "dicom", "modality": "X-ray"})
            if info_dict:
                f.update_info(info_dict)

    acq = acq.reload()

    return acq


# Create the project

In [65]:
# Initialize the group
# find_or_create_group takes the Flywheel client first, then the group id and
# the group label; the client created earlier in the notebook is `fw`.
public_data_group = find_or_create_group(fw, 'public_data', 'public_data')
# Initialize the project
project_label = 'kaggle-rsna-pneumonia-detection-challenge'
readme = 'https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/data'
chestxray_project = find_or_create_project(project_label, public_data_group)
# Point the project description at the Kaggle data page.
if chestxray_project:
    chestxray_project.update(description=readme)

# Read the csv

In [69]:
# Load the training labels. The upload loop reads the 'patientId', 'Target',
# 'x', 'y', 'width' and 'height' columns of each row.
df = pd.read_csv(Path(ROOT_KAGGLE_DATA) / 'stage_2_train_labels.csv')

# Upload
Iterate through the training data csv to create the container hierarchy for this project:
* find or create each subject encountered
* find or create each session (with `DEFAULT_SESSION_LABEL`) encountered
* find or create each acquisition (with 'SeriesDescription' or `DEFAULT_ACQ_LABEL`) and add enclosed files.

In [None]:
# Imported here so this cell is self-contained; used for the scratch directory.
import tempfile

# Fail early with a clear message instead of an AttributeError on the first row.
if not chestxray_project:
    raise RuntimeError(
        "No Flywheel project available; run the 'Create the project' cell first."
    )

images_dir = Path(ROOT_KAGGLE_DATA) / 'stage_2_train_images'

# Zips are staged in a scratch directory that is removed when the loop ends,
# even if an upload raises part-way through.
with tempfile.TemporaryDirectory() as tmp_dir:
    for _, row in tqdm(df.iterrows(), total=len(df)):
        patient_id = row['patientId']

        subject = find_or_create_subject(patient_id, None, chestxray_project)
        if not subject:
            continue
        session = find_or_create_session(DEFAULT_SESSION_LABEL, None, subject)
        if not session:
            continue

        # Convert pandas/numpy scalars to plain Python numbers before they are
        # sent as metadata. A bounding box is only recorded for positive rows.
        # NOTE(review): a patient with several rows gets the file info rewritten
        # once per row, so only the last box is kept — confirm this is intended.
        target = int(row['Target'])
        row_dict = {'Target': target}
        if target:
            row_dict['box'] = {
                'x': float(row['x']),
                'y': float(row['y']),
                'width': float(row['width']),
                'height': float(row['height']),
            }

        dicom_path = images_dir / f"{patient_id}.dcm"
        # Header only: the pixel data is not needed to pick the label.
        dcm = pydicom.dcmread(str(dicom_path), stop_before_pixels=True, force=True)
        # Fall back to the default when the tag is missing or empty.
        acq_label = dcm.get('SeriesDescription') or DEFAULT_ACQ_LABEL

        zip_path = os.path.join(tmp_dir, f"{patient_id}.zip")
        with zipfile.ZipFile(zip_path, 'w') as myzip:
            # Store the DICOM under its own file name, not its full source path.
            myzip.write(str(dicom_path), arcname=dicom_path.name)
        try:
            acq = find_or_create_acquisition(acq_label, row_dict, zip_path, session)
        finally:
            # Remove each zip as soon as it has been handled to keep disk use flat.
            os.remove(zip_path)