# Download an Expression Matrix Using the HCA's Matrix Service

The HCA Matrix Service consumes data from the [HCA](https://prod.data.humancellatlas.org/)
[Data Store](https://github.com/HumanCellAtlas/data-store) to dynamically generate cell by gene expression matrices.

This notebook lets you search for and retrieve data from the HCA Data Coordination Platform (DCP), then send that data to the HCA's Matrix Service. The service produces a file in one of several matrix formats.

## 1. Retrieve data from the DCP

<div class="alert alert-info">
<h3 style="margin-top: 0;"> Instructions <i class="fa fa-info-circle"></i></h3>
    <p>Use the widget below to search for <i>Homo sapiens</i> and <i>immune system</i>. Note the name of the matching project. Then scroll down to the second step below.</p>
    <ul><li><strong>Note:</strong> As of this notebook's publication, the Matrix Service has yet to implement expression matrix support for all data types. Projects lacking matrix file support will note this in their descriptions.</li></ul>
</div>

In [1]:
from IPython.display import display, HTML

import hca.dss
import json
import requests
import nbtools
import IPython.display
import time
import datetime
import urllib

# Maps each project title shown in the search results to that hit's entry ID
project_map = {}

@nbtools.build_ui(name="Search the DCP for Data", parameters={
    "species": {
        "type": "choice",
        "default": "",
        "choices": {
            "": "",
            "Homo sapiens": "Homo sapiens",
            "Mus musculus": "Mus musculus",
        }
    },
    "organ": {
        "type": "choice",
        "default": "",
        "choices": {
            "": "",
            "blood": "blood",
            "bone": "bone",
            "brain": "brain",
            "decidua": "decidua",
            "embryo": "embryo",
            "esophagus": "esophagus",
            "hemopoietic organ": "hemopoietic organ",
            "immune system": "immune system",
            "kidney": "kidney",
            "lymph node": "lymph node",
            "pancreas": "pancreas",
            "placenta": "placenta",
            "skin of body": "skin of body",
            "spleen": "spleen",
            "tumor": "tumor",
        }
    },
    "output_var": {
        "hide": True
    }
})
def search_data(species="", organ=""):
    """Search the DCP for projects and display one summary block per hit.

    Every hit is recorded in the global ``project_map`` (project title ->
    entry ID), and the raw JSON reply is kept in the global
    ``response_json``. Projects without matrix files are still listed, but
    only with a "Matrix Not Available" note.

    Args:
        species: Value to match against ``genusSpecies``; empty means no
            species filter.
        organ: Value to match against ``organ``; empty means no organ filter.
    """
    global response_json, project_map
    import html  # local import: only used to escape text placed into HTML

    # Create the search filter
    search_filter = {}
    if species:
        search_filter["genusSpecies"] = {"is": [species]}
    if organ:
        search_filter["organ"] = {"is": [organ]}

    # Query the server; passing the filter through `params` lets requests
    # URL-encode the JSON instead of pasting it raw into the query string
    response = requests.get(
        "https://service.explore.data.humancellatlas.org/repository/projects",
        params={"filters": json.dumps(search_filter), "size": 15, "order": "desc"},
    )
    response_json = response.json()
    hits = response_json['hits']

    # Handle the empty result case
    if not hits:
        display(HTML('<div class="alert alert-danger">No search results found</div>'))
        return

    # Produce the output
    for hit in hits:
        project = hit['projects'][0]
        title = project['projectTitle']
        # Titles come from the service, so escape them before embedding in HTML
        safe_title = html.escape(title)

        # Add to the project map (keyed by the unescaped title)
        project_map[title] = hit['entryId']

        # Create the display block
        block = "<div class='well'>"
        if not matrix_available(hit):
            block += '<h3>' + safe_title + '</h3>'
            block += '<ul><li>Matrix Not Available</li></ul>'
        else:
            fields = [
                ('Short Name', project['projectShortname']),
                ('Lab', ', '.join(project['laboratory'])),
                ('Species', ', '.join(hit['donorOrganisms'][0]['genusSpecies'])),
                ('Organ', ', '.join(hit['specimens'][0]['organ'])),
                ('Donor Count', len(hit['donorOrganisms'][0]['id'])),
                # Treat a missing or null cell count as zero so the sum cannot fail
                ('Cell Count', sum(cs.get('totalCells') or 0 for cs in hit['cellSuspensions'])),
                ('Data', ', '.join(hit['protocols'][0]['libraryConstructionApproach'])),
            ]
            block += '<h3 class="nbtools-text-option">' + safe_title + '</h3><ul>'
            block += ''.join(
                f'<li><strong>{label}:</strong> {html.escape(str(value))}</li>'
                for label, value in fields
            )
            block += '</ul>'
        # Close the wrapper in both branches so the markup stays balanced
        block += '</div>'
        display(HTML(block))
        
def matrix_available(project):
    """Tell whether a project hit lists a file type summary of type 'matrix'.

    Args:
        project: A hit dictionary; its optional ``fileTypeSummaries`` entry
            is a sequence of dictionaries that each carry a ``fileType``.

    Returns:
        True if any summary has ``fileType == 'matrix'``; False when none
        does or when the hit has no ``fileTypeSummaries`` at all.
    """
    # No summaries at all means there is nothing to inspect
    if 'fileTypeSummaries' not in project:
        return False
    return any(
        summary['fileType'] == 'matrix'
        for summary in project['fileTypeSummaries']
    )

UIBuilder(function_import='nbtools.tool(id="Search the DCP for Data", origin="Notebook").function_or_method', …

## 2. Launch the Matrix Service job

<div class="alert alert-info">
<h3 style="margin-top: 0;"> Instructions <i class="fa fa-info-circle"></i></h3>
    <ul><li>Click the <i>project</i> parameter below to see a list of all available projects (produced from the search in Step 1). Select <i>Census of Immune Cells</i>.</li>
    <li>For the <i>matrix format</i> parameter, select <i>mtx</i>. You can <a href="https://broadinstitute.github.io/wot/file_formats/#mtx" target="_blank">read more about the mtx format here</a>.</li>
        <li>Click <i>Run</i>. This will launch a Matrix Service job which will dynamically generate the mtx file. Wait a few moments for the job to finish.</li></ul>
</div>

In [2]:
#######################################
# CSS hack: clip long output file     #
# names onto a single line            #
#######################################

# Rule applied to the job output file entries: one line, overflow hidden
style = "\n".join([
    ".nbtools-widget-job-output-file {",
    "    overflow: hidden;",
    "    white-space: nowrap;",
    "    display: block;",
    "}",
])
# Inject the rule into the page by displaying a <style> element
display(HTML('<style type="text/css">' + style + '</style>'))

#######################################
# Matrix Service server to use        #
#######################################

# Staging alternative: "https://matrix.staging.data.humancellatlas.org/v0/"
matrix_host = "https://matrix.data.humancellatlas.org/v0/"

#######################################
# UI Builder spec                     #
#######################################

# Keyword arguments handed to nbtools.build_ui for the launch widget:
# a free-text project field, a matrix format chooser, and a hidden output_var.
spec = dict(
    name="Launch the Matrix Service",
    description="Launch an HCA Matrix Service job, which will produce the data in the desired format",
    parameters={
        "project": dict(
            name="project",
            type="text",
            description="Select a project from the results above, either choosing it from the dropdown menu or pasting in a URL.",
        ),
        "output_format": dict(
            name="matrix format",
            type="choice",
            default="loom",
            # Each format is offered under its own name
            choices={fmt: fmt for fmt in ("csv", "loom", "mtx", "zarr")},
        ),
        "output_var": dict(hide=True),
    },
)

#######################################
# Display widget and launch job       #
#######################################

@nbtools.build_ui(**spec)
def launch_matrix_job(project, output_format="loom"):
    """Start polling for a project's matrix file and show a job widget.

    Args:
        project: Either a URL, which is polled as given, or a project title
            previously recorded in ``project_map`` by the search step.
        output_format: Format chosen in the widget. NOTE(review): this value
            is accepted but never read by this function, so it does not
            influence the request - confirm whether it should.

    An unknown or empty project name shows an error alert instead of
    raising ``KeyError``.
    """
    # Handle project URLs
    if is_url(project):
        # Shortened label for the widget.
        # NOTE(review): offsets 47:98 presumably pick the identifying part of
        # an expected URL layout - confirm against the URLs users paste in.
        project_id = project[47:98] + '...'
        manifest_url = project

    # Get URL from project name by looking up the project ID
    elif project in project_map:
        # Create a manifest URL from the project's entry ID
        project_id = urllib.parse.quote_plus(project_map[project])
        manifest_url = f'https://service.explore.data.humancellatlas.org/fetch/manifest/files?filters=%7B%22projectId%22%3A%7B%22is%22%3A%5B%22{project_id}%22%5D%7D%2C%22fileFormat%22%3A%7B%22is%22%3A%5B%22matrix%22%5D%7D%7D&format=tsv'

    # Neither a URL nor a known project: nothing sensible to launch
    else:
        display(HTML('<div class="alert alert-danger">Unknown project. Run the search in Step 1 and select one of the listed projects, or paste in a URL.</div>'))
        return

    # Get the start time
    start_time = time.time()

    # Create the UIOutput widget
    job_widget = nbtools.UIOutput(name="Matrix Service Job: " + project_id,
                                  description="Your HCA Matrix Service job has been launched. It may take a few minutes before your results are ready.",
                                  status="Pending",
                                  files=[project_id])

    # Display the output widget
    display(job_widget)

    # Wait for the results
    wait_and_get_results(widget=job_widget, start_time=start_time, manifest_url=manifest_url, wait_max=30)
    
#######################################
# Poll for the Matrix Service results #
#######################################

# Location of the finished matrix file; set once a job succeeds
url_to_download = None

def wait_and_get_results(widget, start_time, manifest_url, wait_max=30):
    """Poll ``manifest_url`` until the service reports the file is ready.

    The JSON reply is expected to carry a ``Status`` field: 301 means "not
    ready, poll ``Location`` again after a pause", 302 means "ready at
    ``Location``". Anything else is reported as an error.

    Args:
        widget: Job widget whose ``status`` attribute is updated as polling
            progresses.
        start_time: ``time.time()`` value taken when the job was launched;
            used to show the elapsed running time.
        manifest_url: First URL to poll.
        wait_max: Upper bound, in seconds, on the pause between two polls.

    On success the global ``url_to_download`` is set to the file location
    and a results widget is displayed.
    """
    global url_to_download

    # Poll in a loop rather than recursively so a long-running job cannot
    # exhaust the interpreter's recursion limit
    while True:
        # Make the request for the matrix file
        try:
            manifest_request = requests.get(manifest_url).json()
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not valid JSON
            widget.status = 'Error'
            display(HTML('<div class="alert alert-danger">Encountered an error parsing the Matrix Service response</div>'))
            return

        status = manifest_request.get('Status') if isinstance(manifest_request, dict) else None

        # If Status == 301 the service wants us to wait
        if status == 301:
            elapsed_time = round(time.time() - start_time)                              # Get time elapsed
            widget.status = "Running " + str(datetime.timedelta(seconds=elapsed_time))  # Set status
            manifest_url = manifest_request['Location']                                 # Set the new URL to query
            time.sleep(min(manifest_request['Retry-After'] * 5, wait_max))              # Sleep
            continue

        # If Status == 302 the matrix file is ready
        if status == 302:
            widget.status = 'Success'
            url_to_download = manifest_request['Location']
            display(nbtools.UIOutput(name='Matrix Service Results', files=[url_to_download]))
            return

        # An unknown status is probably an error; report the status that was
        # actually received (str() because it may be a number or missing)
        widget.status = 'Error ' + str(status)
        display(HTML('<div class="alert alert-danger">An error was encountered when generating the matrix file</div>'))
        return
        
#######################################
# Is the given string a URL?          #
#######################################
        
def is_url(project):
    """Check whether ``project`` parses as a URL with a scheme and a host.

    Args:
        project: Text entered in the project field.

    Returns:
        A truthy value (the network location string) when both a scheme and
        a network location are present; otherwise an empty, falsy string.
    """
    pieces = urllib.parse.urlparse(project)
    # Without a scheme the answer is the (empty) scheme itself
    if not pieces.scheme:
        return pieces.scheme
    # With a scheme, the network location decides the outcome
    return pieces.netloc

UIBuilder(description='Launch an HCA Matrix Service job, which will produce the data in the desired format', f…

## 3. Save matrix to workspace

<div class="alert alert-info">
<h3 style="margin-top: 0;"> Instructions <i class="fa fa-info-circle"></i></h3>
    <p>Click the <i>Save to Workspace</i> button below to save a copy of the matrix to your notebook workspace.</p>
</div>

In [4]:
import ipywidgets
import urllib.request 

def download(event):
    """Save the finished matrix file to ``expression_matrix.mtx``.

    Used as the click handler of the "Save to Workspace" button.

    Args:
        event: Value passed in by the button click; not used.

    If no download location is available yet, or the transfer fails, a
    message is printed and nothing is raised.
    """
    # Look the name up defensively: the cell that defines it may not have
    # been run yet, and it stays None until a job has succeeded
    target = globals().get('url_to_download')
    if not target:
        print('No matrix is available yet. Run the Matrix Service job in Step 2 and wait for it to finish.')
        return

    print('Saving the matrix to the notebook workspace... Please wait...')
    try:
        urllib.request.urlretrieve(target, 'expression_matrix.mtx')
    except (OSError, ValueError) as error:
        # OSError covers network/file failures (URLError derives from it);
        # ValueError covers a location that is not a usable URL
        print(f'Unable to save the matrix: {error}')
        return
    print('Complete')

# Button that runs download() when clicked
save_to_workspace = ipywidgets.Button(_dom_classes=['btn-primary'], description="Save to Workspace")
save_to_workspace.on_click(download)
display(save_to_workspace)

# Hide code hack: emit a script that hides the elements matching `.cell > .input`
display(HTML("\n    <script type=\"text/javascript\">$('.cell > .input').hide();</script>\n"))

Button(description='Save to Workspace', style=ButtonStyle(), _dom_classes=('btn-primary',))