[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crunchdao/quickstarters/blob/master/competitions/broad-3/quickstarters/random-submission/random-submission.ipynb)

![Cover](https://raw.githubusercontent.com/crunchdao/competitions/refs/heads/master/competitions/broad-3/assets/cover.png)

In [None]:
%pip install --upgrade crunch-cli

Get a new token: https://hub.crunchdao.com/competitions/broad-3/submit/via/notebook

In [None]:
# Replace the placeholder token below with your own (see the link above).
# The `--size large` argument retrieves the larger dataset:
!crunch setup --notebook --size large broad-3 hello --token aaaabbbbccccddddeeeeffff

In [None]:
!pip install spatialdata

In [None]:
import os

import numpy
import pandas
import scanpy
import skimage.io
import spatialdata

In [None]:
# Import the crunch package, then rebind the name `crunch` to the object
# returned by `load_notebook()`. From here on, `crunch` refers to that object
# (it is the one used by the `crunch.test(...)` call later in this notebook),
# not to the module itself.
import crunch
crunch = crunch.load_notebook()

In [12]:
# In the training function, users build and train the model to make inferences on the test data.
# Your model must be stored in the `model_directory_path`.
def train(
    data_directory_path: str,
    model_directory_path: str
):
    """Load every training input; the actual model fitting is left as a TODO.

    Args:
        data_directory_path: Directory that contains the competition files
            read below (scRNA-seq table, spatial zarr store, dysplasia TIFFs).
        model_directory_path: Directory where the trained model must be
            written. It is not used yet because no model is trained.

    Returns:
        None. The loaded objects are only bound to local names so that the
        training code added later can use them.
    """

    def in_data_directory(file_name):
        # Resolve a file name relative to the competition data directory.
        return os.path.join(data_directory_path, file_name)

    # scRNA-seq reference: gene expression for 18,615 protein-coding genes
    # measured in colon tissue samples with and without dysplasia.
    scrnaseq_adata = scanpy.read_h5ad(in_data_directory('Crunch3_scRNAseq.h5ad'))

    # Spatial dataset: UC9_I.zarr holds the H&E image of noncancerous mucosa
    # (the same store that was already provided in Crunch 1 and Crunch 2).
    sdata = spatialdata.read_zarr(in_data_directory('UC9_I.zarr'))

    # Dysplasia-related images. They can be used to extract additional spatial
    # features and labels for training or evaluating a model:
    # - 'HE':     H&E image of tissue regions exhibiting dysplasia
    # - 'HE_nuc': nucleus segmentation mask of that H&E image
    # - 'region': ROI mask separating dysplastic from non-dysplastic regions
    dysplasia_paths = {
        'HE': in_data_directory('UC9_I-crunch3-HE.tif'),
        'HE_nuc': in_data_directory('UC9_I-crunch3-HE-label-stardist.tif'),
        'region': in_data_directory('UC9_I-crunch3-HE-dysplasia-ROI.tif'),
    }

    # Read each image from disk, keyed by the same short name as its path.
    dysplasia_images = {
        image_name: skimage.io.imread(image_path)
        for image_name, image_path in dysplasia_paths.items()
    }

    # TODO Add your training code here and save the trained model to the specified model_directory_path.

In [13]:
# In the inference function, the trained model is loaded and used to make inferences on a
# sample of data that matches the characteristics of the training data.
def infer(
    data_file_path: str,
    data_directory_path: str,
    model_directory_path: str
):
    """Return a placeholder gene ranking: a random permutation of the gene list.

    The intended goal is to rank all 18,615 protein-coding genes by their
    ability to distinguish dysplasia from noncancerous mucosa regions, from
    rank 1 (best discriminator) to rank 18,615 (worst).

    To turn this placeholder into a real submission:
    1. Load the trained model from the model directory.
    2. Use the model to score and rank the genes accordingly.
    3. Return the predicted ranking as a DataFrame.

    Args:
        data_file_path: Not used by the placeholder.
        data_directory_path: Directory containing 'Crunch3_gene_list.csv'.
        model_directory_path: Not used by the placeholder.

    Returns:
        A DataFrame with a single 'Gene Name' column, indexed by rank
        (1 to the number of genes in the list).
    """
    # Genes to rank, read from the 'gene_symbols' column of the gene list file.
    gene_list_path = os.path.join(data_directory_path, 'Crunch3_gene_list.csv')
    gene_symbols = pandas.read_csv(gene_list_path)['gene_symbols']

    # Placeholder ranking: shuffle the gene names with NumPy's global RNG.
    shuffled_genes = numpy.random.permutation(gene_symbols)

    # Ranks are 1-based and become the index of the returned frame.
    ranks = numpy.arange(1, len(gene_symbols) + 1)

    return pandas.DataFrame(
        shuffled_genes,
        index=ranks,
        columns=['Gene Name'],
    )

In [None]:
# Run a local test of your submission to make sure
# that it can be accepted by the system.
# NOTE(review): `no_determinism_check=True` presumably skips the determinism
# check, which the random placeholder returned by `infer` could not pass;
# confirm against the crunch-cli documentation before changing it.
crunch.test(
    no_determinism_check=True,
)

The final step is to write your report as specified in the [Justification Report](https://docs.crunchdao.com/competitions/competitions/broad-institute-autoimmune-disease/crunch-3#justification-report) section of the documentation.

You must:
1. Explain how your method works. (5-10 sentences)
2. Describe the reasoning behind your gene panel design. (5-10 sentences)
3. Specify the datasets and any other resources utilized. (5-10 sentences)

Once a notebook is submitted on the platform, the report can no longer be changed. <br />
Therefore, it is important to take your time and explain what you have done. <br />
The better your report is, the better the mark you could get during the Peer Review phase. <br />

The limit is about one page.

---
file: REPORT.md
---

<!-- Don't forget to change me -->

# Method Description

Explain how your method works. (5-10 sentences)

# Rationale

Describe the reasoning behind your gene panel design. (5-10 sentences)

# Data and Resources Used

Specify the datasets and any other resources utilized. (5-10 sentences)

---
<!-- this cell is not part of the paper, but just serves as a separator -->

Now remember to download this notebook and then submit it at https://hub.crunchdao.com/competitions/broad-3/submit/