# Gene Expression Values as a function of Total Gene Count for raw counts, quantile and log CPM normalizations

Note that throughout the notebook, we use *'rankit'* and *'quantile normalization'* interchangeably. A brief overview can be found <a href = 'https://en.wikipedia.org/wiki/Quantile_normalization'>here</a>.

In [1]:
!pip install cellxgene-census==1.6.0
!pip install tiledbsoma==1.4.4





In [2]:
import json
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
import numpy as np
import statsmodels.api as sm
lowess = sm.nonparametric.lowess
import warnings
warnings.filterwarnings('ignore')

## Marker + Housekeeping Genes

In [3]:
# Cells expressing fewer genes than this are dropped before normalization.
NUM_MIN_EXPRESSED_GENES_PER_CELL = 500
# Raw counts less than or equal to this value are treated as noise and removed.
NUM_MIN_ULTRA_LOW_EXPRESSED_GENES = 2
CENSUS_VERSION = "2023-10-18"
# Apostrophes are pre-escaped (backslash + quote) because these names are
# spliced verbatim between single quotes when the census value filter is built.
ASSAYS = [
    'sci-RNA-seq',
    'Seq-Well',
    'Drop-seq',
    'CEL-seq2',
    "10x 3\\' v1",
    "10x 5\\' v1",
    "10x 3\\' v2",
    "10x 5\\' v2",
    "10x 3\\' v3",
    "10x 3\\' transcription profiling",
    "10x 5\\' transcription profiling",
    "10x technology",
]
HOUSEKEEPING_GENES = ['MALAT1', 'ACTB', 'GAPDH', 'UBC', 'SDHA', 'YWHAZ', 'PGK1', 'PPIA', 'RPL13A', 'RPLP0', 'B2M']
# Row index (NUM_MARKER_GENES - 1) is the one whose trendlines carry the legend in get_fig.
NUM_MARKER_GENES = 5

# Read the JSON inputs inside context managers so the file handles are closed
# deterministically instead of being left to the garbage collector.
with open('../data/data_tidy/overlapping_categories.json', encoding='utf-8') as cell_types_file:
    contents_cell_types = cell_types_file.read()
with open('../data/data_raw/hubmap_marker_genes.json', encoding='utf-8') as marker_genes_file:
    contents_marker_genes = marker_genes_file.read()
tissue2cell_types = json.loads(contents_cell_types)
marker_genes = json.loads(contents_marker_genes)

# Cell Ontology ID -> cell type label for the cell types analysed in this notebook.
cell_type_mapping = {
    'CL:4028006' : 'alveolar type 2 fibroblast cell',
    'CL:0000525' : 'syncytiotrophoblast cell',
    'CL:0000786' : 'plasma cell',
    'CL:0000895' : 'naive thymus-derived CD4-positive, alpha-beta T cell',
    'CL:0000909' : 'CD8-positive, alpha-beta memory T cell',
    'CL:0000071' : 'blood vessel endothelial cell',
    'CL:0000899' : 'T-helper 17 cell',
    'CL:0000084' : 'T cell',
    'CL:1001106' : 'kidney loop of Henle thick ascending limb epithelial cell'
}
# Reverse lookup: cell type label -> Cell Ontology ID.
mapping2cell_type = {v:k for k, v in cell_type_mapping.items()}

def get_marker_genes(tissue, cell_type):
    """
    Looks up the marker gene symbols recorded for a cell type in a given tissue

    param tissue: tissue the marker gene entries must belong to
    param cell_type: cell type label (a key of mapping2cell_type)

    :returns list of gene symbols, in the order they appear in marker_genes
    """
    cell_type_id = mapping2cell_type[cell_type]
    symbols = []
    for record in marker_genes[cell_type_id]:
        if record['tissue'] == tissue:
            symbols.append(record['symbol'])
    return symbols

## Rankit Implementation

In [None]:
import cellxgene_census
import numpy as np
import numba as nb
from scipy import stats
import scipy 
import pandas as pd
import scanpy as sc


@nb.jit
def quantiles(max_rank: int, ranks: np.ndarray) -> np.ndarray:
    """
    Converts ranks into probability levels.

    Each rank ``r`` is mapped to ``(r - 0.5) / max_rank`` and rounded to 5 decimals, so ranks in
    ``1..max_rank`` land strictly between 0 and 1 before rounding. The output has one value per
    input rank, in the same order; it is only evenly spaced if the ranks themselves are.

    param max_rank: the largest rank, used as the denominator
    param ranks: ranks to convert

    :returns an array with one probability level per entry of `ranks`
    """
    # NOTE(review): because of the 5-decimal rounding, a max_rank above ~100000 would round the
    # lowest level to 0.0 (and the normal quantile of 0 is -inf) -- confirm this cannot occur.
    return np.array([np.round((i - 0.5) / max_rank, 5) for i in ranks])


def rankit(Xraw: scipy.sparse.spmatrix, offset: float = 3.0) -> scipy.sparse.csr_matrix:
    """
    Row-wise normalizes values of a matrix using the rankit method. The target distribution is a normal distribution
    with variance of 1 and mean as set in `offset`
    https://en.wikipedia.org/wiki/Rankit
    In statistics, rankits of a set of data are the expected values of the order statistics of
    a sample from the standard normal distribution the same size as the data

    Only the stored (explicit) entries of each row are ranked. Ties receive the same rank
    (dense ranking), so equal values within a row map to the same normalized value.
    The input is not modified; a copy is normalized and returned. Integer-typed input is
    promoted to float64 so the normalized values are not truncated on assignment.

    param Xraw: query matrix to be normalized
    param offset: mean for the resulting row-wise values that will follow a normal distribution with variance 1. This
    helps to shift values to a positive scale.
    :returns row-wise normalized matrix using rankit
    """
    X = Xraw.tocsr(copy=True)  # get Compressed Sparse Row format of raw expression values matrix
    if not np.issubdtype(X.dtype, np.floating):
        # Writing float quantiles into an integer data buffer would silently truncate them.
        X = X.astype(np.float64)
    indptr = X.indptr  # row pointers: row r owns X.data[indptr[r]:indptr[r + 1]]
    warning_raised = False
    for row in range(indptr.shape[0] - 1):
        start, stop = indptr[row], indptr[row + 1]
        if stop > start:
            # Assign ranks to data, assigning the same value to ties
            ranks = stats.rankdata(X.data[start:stop], method="dense")
            prob_level = quantiles(ranks.max(), ranks)
            # Map probability levels onto a normal distribution centred at `offset`
            X.data[start:stop] = stats.norm.ppf(prob_level, loc=offset)
        elif not warning_raised:
            # Report empty rows once rather than once per row
            print("This dataset has at least one row of all zero expressions")
            warning_raised = True
    return X

## Analyses

### Get metadata

In [None]:
def get_census_query(cell_types, tissues, assays):
    """
    Returns a value filter query for the census to retrieve data for 
    a given array of cell types, tissues and assays

    Apostrophes in cell type names are escaped here. Tissue and assay names are
    inserted verbatim, so callers must pass them already escaped (as ASSAYS does).

    param cell_types: the cell_types to retrieve from the census
    param tissues: the tissues the cell_types are found in
    param assays: the assays corresponding to the cell_types and tissues

    :returns the value_filter query
    :raises ValueError if any of the three collections is empty, since the
        resulting filter would be malformed
    """
    cell_types = list(cell_types)
    tissues = list(tissues)
    assays = list(assays)
    for label, values in (("cell_types", cell_types), ("tissues", tissues), ("assays", assays)):
        if not values:
            raise ValueError(label + " must contain at least one value")

    def quoted_list(values):
        # Render values as 'a', 'b', 'c' for use inside an `in [...]` clause
        return ", ".join("'" + value + "'" for value in values)

    escaped_cell_types = [cell_type.replace("'", "\\'") for cell_type in cell_types]
    return (
        "is_primary_data == True and cell_type in [" + quoted_list(escaped_cell_types)
        + "] and tissue_general in [" + quoted_list(tissues)
        + "] and assay in [" + quoted_list(assays)
        + "]"
    )

In [None]:
def get_census_metadata(cell_types, tissues, assays, census):
    """
    Fetches cell (obs) metadata and the gene (var) table from an open census
    for the requested cell_types, tissues and assays

    param cell_types: the cell_types to retrieve metadata for
    param tissues: the tissues the cell_types are found in
    param assays: the assays corresponding to the cell_types and tissues
    param census: an open census handle to read from

    :returns
        metadata: pandas.DataFrame of the obs rows matching the query
        census_query: the value filter used to retrieve the metadata
        genes: pandas.DataFrame of the RNA measurement's var table
    """
    obs_columns = [
        "soma_joinid", "assay", "dataset_id", "cell_type", "tissue",
        "tissue_general", "suspension_type", "disease", "donor_id", "raw_sum",
    ]
    census_query = get_census_query(cell_types, tissues, assays)
    human = census["census_data"]["homo_sapiens"]

    # Slice the obs SOMADataFrame down to the matching cells and wanted columns
    obs_slice = human.obs.read(value_filter = census_query, column_names = obs_columns)

    # The gene table is read whole; it does not depend on the query
    genes = human.ms["RNA"].var.read().concat().to_pandas()

    # Concatenate the obs results into one pyarrow.Table, then convert to pandas
    metadata = obs_slice.concat().to_pandas()

    print('There are', len(metadata), 'observations')
    return metadata, census_query, genes

### Get X iteratively and perform normalizations
**1. Pre-processing**
- *Remove cells with <500 expressed genes*

**2. Normalizations**
1. Rankit
2. log CPM

**3. Post-processing** <br>
- *Removal of Noisy Ultra-low Expression Values* <br>
After applying normalization, any gene/cell combination whose raw count is less than or equal to 2 is set to missing data. This removes noise from ultra-lowly expressed genes and provides a cleaner visualization.


In [None]:
from cellxgene_census.experimental.util import X_sparse_iter
import tiledbsoma as soma

def get_raw_count_data_and_normalize(cell_type, tissue, assays):
    """
    Retrieves raw counts data from the census for a given cell_type, tissue and set of assays 
    and performs quantile normalization and log (CPM) normalizations. Data is retrieved in batches.

    Per batch: cells with fewer than NUM_MIN_EXPRESSED_GENES_PER_CELL expressed genes are dropped,
    rankit is computed on the remaining raw counts, and then every entry whose raw count is
    <= NUM_MIN_ULTRA_LOW_EXPRESSED_GENES is removed from the raw, rankit and log (CPM) matrices.
    CPM uses the `raw_sum` metadata column as the library size, and the log is the natural log of (CPM + 1).

    param cell_type: cell_type to get data for
    param tissue: corresponding tissue for the cell_type
    param assays: assays corresponding to the cell_type and tissue

    :returns
        all_X_raw_counts: raw counts data (None if the query produced no batches)
        all_X_rankit: quantile normalized data (None if the query produced no batches)
        all_X_log_cpm: log (CPM) normalized data (None if the query produced no batches)
        genes: the set of corresponding genes from the census
    """
    raw_chunks = []
    rankit_chunks = []
    log_cpm_chunks = []

    with cellxgene_census.open_soma(census_version=CENSUS_VERSION) as census:
        metadata, value_filter, genes = get_census_metadata([cell_type], [tissue], assays, census)
        # Index library sizes by soma_joinid so each matrix row is paired with its own cell's
        # raw_sum regardless of the row order of `metadata`.
        library_size_by_joinid = metadata.set_index('soma_joinid')['raw_sum']

        exp = census["census_data"]["homo_sapiens"]
        query = exp.axis_query(
            measurement_name = "RNA",
            obs_query = soma.AxisQuery(value_filter = value_filter),
        )

        # For each X_chunk[i, j], the associated soma_joinid is
        # obs_soma_joinids[i] and var_soma_joinids[j]
        chunk_iter = X_sparse_iter(query, X_name="raw", stride=10000)
        for batch, ((obs_soma_joinids, _var_soma_joinids), X_chunk) in enumerate(chunk_iter):
            print('Parsing chunk Batch:', batch)

            # Remove cells with < 500 expressed genes
            X_chunk = X_chunk.tocsr()
            genes_per_cell = np.asarray((X_chunk != 0).sum(axis=1)).ravel()
            keep = genes_per_cell >= NUM_MIN_EXPRESSED_GENES_PER_CELL
            obs_soma_joinids = np.asarray(obs_soma_joinids)[keep]
            X_chunk = X_chunk[keep, :]

            print('\tCompute rankit values Batch:', batch)
            # Compute rankit values (on the counts before ultra-low values are removed)
            X_rankit = rankit(X_chunk)

            # Removal of ultra-low expressed genes: keep only the entries whose raw count
            # exceeds the threshold, in both the rankit and the raw matrices.
            well_expressed = X_chunk > NUM_MIN_ULTRA_LOW_EXPRESSED_GENES
            X_rankit = X_rankit.multiply(well_expressed).tocsr()
            X_rankit.eliminate_zeros()
            X_chunk = X_chunk.multiply(well_expressed).tocsr()
            X_chunk.eliminate_zeros()

            # Compute log CPM values on the stored entries only; the matrix stays sparse
            # instead of being expanded to a dense cells x genes array.
            print('\tCompute log CPM values Batch:', batch)
            library_sizes = library_size_by_joinid.loc[obs_soma_joinids].to_numpy(dtype=np.float64)
            # One scale factor per stored entry: each row's factor repeated by that row's entry count
            per_entry_scale = np.repeat(1e6 / library_sizes, np.diff(X_chunk.indptr))
            X_log_cpm = X_chunk.astype(np.float64)
            X_log_cpm.data = np.log1p(X_log_cpm.data * per_entry_scale)
            X_log_cpm.eliminate_zeros()

            raw_chunks.append(X_chunk)
            rankit_chunks.append(X_rankit)
            log_cpm_chunks.append(X_log_cpm)

    if not raw_chunks:
        # No batch was produced by the query
        return None, None, None, genes

    # Stack once at the end rather than re-copying the accumulated matrix on every batch
    all_X_raw_counts = scipy.sparse.vstack(raw_chunks, format="csr")
    all_X_rankit = scipy.sparse.vstack(rankit_chunks, format="csr")
    all_X_log_cpm = scipy.sparse.vstack(log_cpm_chunks, format="csr")
    return all_X_raw_counts, all_X_rankit, all_X_log_cpm, genes

## Generate figures

In [None]:
def get_scatter_plot(x, y, gene_name, gene_type, color, showlegend):
    """
    Builds the markers-only scatter trace for a single gene

    param x: x-axis values (callers pass the number of genes expressed per cell)
    param y: y-axis values, i.e. gene expression (raw counts or normalized)
    param gene_name: trace name shown in the legend
    param gene_type: legend group and legend group title (marker or housekeeping gene)
    param color: marker color
    param showlegend: True if the trace should appear in the legend

    :returns the plotly Scatter trace
    """
    trace_options = {
        'x': x,
        'y': y,
        'mode': 'markers',
        'marker': dict(color=color),
        'name': gene_name,
        'legendgroup': gene_type,
        'legendgrouptitle_text': gene_type,
        'showlegend': showlegend,
    }
    return go.Scatter(**trace_options)

def get_trendline(x, y, color, line_name, gene_type, showlegend):
    """
    Builds the trendline trace for a single gene

    param x: x-axis values (callers pass the number of genes expressed per cell)
    param y: fitted gene expression values, from either LOESS or Linear Regression
    param color: color of the trendline trace
    param line_name: trace name shown in the legend
    param gene_type: legend group and legend group title
    param showlegend: True if the trace should appear in the legend

    :returns the plotly Scatter trace (no explicit mode is set)
    """
    trace_options = {
        'x': x,
        'y': y,
        'marker': dict(color=color),
        'name': line_name,
        'legendgroup': gene_type,
        'legendgrouptitle_text': gene_type,
        'showlegend': showlegend,
    }
    return go.Scatter(**trace_options)

In [None]:
# Upper bound on the number of points drawn per scatter trace.
_MAX_SCATTER_POINTS = 50000


def _fit_gene_trends(X, genes, gene_name, gene_counts):
    """
    Computes the trend statistics of one gene over the cells in which it has a non-zero value.

    :returns dict with the cells' gene counts ('x'), expression values ('y'), linear regression
        predictions ('lr_preds'), R2 score ('r2'), LOESS fit as an (n, 2) array ('loess') and
        Pearson correlation ('pearson')
    """
    # NOTE(review): the index label of `genes` is used as the column position in X -- confirm
    gene_index = genes[genes['feature_name'] == gene_name].index.values[0]
    expression = X[:, [gene_index]].toarray().ravel()
    expressed = expression != 0
    x = gene_counts[expressed]
    y = expression[expressed]

    features = x.reshape(-1, 1)
    regression = LinearRegression().fit(features, y)
    return {
        'x': x,
        'y': y,
        'lr_preds': regression.predict(features),
        'r2': regression.score(features, y),
        'loess': np.array(lowess(y, x)),
        # Correlate the two numeric vectors directly; DataFrame.corr on a frame that also
        # holds string columns raises on pandas >= 2.
        'pearson': pd.Series(x).corr(pd.Series(y)),
    }


def _subsample_points(x, y, max_points):
    """Returns (x, y) unchanged, or a random subset of max_points pairs when there are more."""
    if len(x) <= max_points:
        return x, y
    chosen = np.random.choice(len(x), max_points, replace=False)
    return x[chosen], y[chosen]


def get_fig(cell_type, X, marker_gene, housekeeping_gene, num_genes_expressed, method_type, row_num, col_num, genes, showlegend = False):
    """
    Returns one row in the final figure, containing gene expression values (raw or normalized) as a function of number of genes expressed in a cell
    for one marker gene and one housekeeping gene in a given cell_type. Also computes Pearson and R2 correlation and LOESS and Linear Regression trendlines.
    Only cells with a non-zero value for the gene are used. Statistics and trendlines use all such cells;
    the scatter traces are randomly subsampled to at most 50000 points each.

    param cell_type: cell_type to generate figure for (currently unused)
    param X: raw counts or normalized gene expression values
    param marker_gene: one marker gene
    param housekeeping_gene: one housekeeping gene
    param num_genes_expressed: vector containing the counts for number of genes expressed in the given cell_type
    param method_type: one of 'raw counts', 'quantile normalization' or 'log (CPM)'
    param row_num: row number in the final figure
    param col_num: 0 for raw_counts, 1 for quantile normalization, 2 for log (CPM)
    param genes: the set of corresponding genes from the census
    param showlegend: accepted for compatibility; the value passed in is ignored. Scatter traces always
        show their legend entry and trendlines show theirs only for row NUM_MARKER_GENES - 1, column 2

    :returns dictionary containing mappings between fig names and fig objects
    """
    gene_counts = np.asarray(num_genes_expressed)
    mg = _fit_gene_trends(X, genes, marker_gene, gene_counts)
    hg = _fit_gene_trends(X, genes, housekeeping_gene, gene_counts)

    num_expressed_mg = len(mg['x'])
    num_expressed_hg = len(hg['x'])

    # Each gene is subsampled on its own point count, and only for the scatter trace, so the
    # trendlines below always pair x values with predictions of the same length.
    mg_scatter_x, mg_scatter_y = _subsample_points(mg['x'], mg['y'], _MAX_SCATTER_POINTS)
    hg_scatter_x, hg_scatter_y = _subsample_points(hg['x'], hg['y'], _MAX_SCATTER_POINTS)

    fig_mg = get_scatter_plot(mg_scatter_x.tolist(), mg_scatter_y.tolist(), marker_gene, 'Marker', "#b19fc9", True)
    fig_hg = get_scatter_plot(hg_scatter_x.tolist(), hg_scatter_y.tolist(), housekeeping_gene, 'Housekeeping', "#88d5d4", True)

    # NOTE(review): this compares against the global NUM_MARKER_GENES, not the number of rows
    # actually plotted, so figures with fewer rows get no trendline legend -- confirm intent.
    showlegend = (row_num == NUM_MARKER_GENES - 1 and col_num == 2)

    # One name and colour per trendline type, identical for marker and housekeeping genes
    loess_color, loess_name = "#619c80", "LOESS"
    lr_color, lr_name = "#f3a5ac", "Linear Regression"

    trendline_loess_mg = get_trendline(mg['loess'][:, 0], mg['loess'][:, 1], loess_color, loess_name, 'Marker Genes', showlegend)
    trendline_loess_hg = get_trendline(hg['loess'][:, 0], hg['loess'][:, 1], loess_color, loess_name, 'Housekeeping Genes', showlegend)

    trendline_lr_mg = get_trendline(mg['x'].tolist(), mg['lr_preds'], lr_color, lr_name, 'Marker Genes', showlegend)
    trendline_lr_hg = get_trendline(hg['x'].tolist(), hg['lr_preds'], lr_color, lr_name, 'Housekeeping Genes', showlegend)

    # Subplot titles: the method name heads the first row only, the gene name and N are
    # written in the middle column of each group of three.
    if row_num == 0:
        title_mg = "<b><span style='text-decoration:underline;'>" + method_type + "</span></b>"
        title_hg = "<b><span style='text-decoration:underline;'>" + method_type + "</span></b>"
    else:
        title_mg = "<br><br><br>"
        title_hg = "<br><br><br>"
    if col_num % 3 == 1:
        title_mg += "<br><b>" + marker_gene + "</b><br>N = " + str(num_expressed_mg)
        title_hg += "<br><b>" + housekeeping_gene + "</b><br>N = " + str(num_expressed_hg)
    else:
        title_mg += '<br><br>'
        title_hg += '<br><br>'
    title_mg += "<br>Pearson r:" + f"{mg['pearson']:.4f}" + "<br>R2 Score:" + f"{mg['r2']:.4f}"
    title_hg += "<br>Pearson r:" + f"{hg['pearson']:.4f}" + "<br>R2 Score:" + f"{hg['r2']:.4f}"

    fig_info = {'fig_mg' : fig_mg, 
                'fig_hg' : fig_hg, 
                'title_mg' : title_mg, 
                'title_hg' : title_hg, 
                'pearson_corr_mg' : mg['pearson'], 
                'pearson_corr_hg' : hg['pearson'], 
                'r2_mg' : mg['r2'], 
                'r2_hg' : hg['r2'], 
                'trendline_loess_mg' : trendline_loess_mg, 
                'trendline_loess_hg' : trendline_loess_hg, 
                'trendline_lr_mg' : trendline_lr_mg, 
                'trendline_lr_hg' : trendline_lr_hg}
    return fig_info

In [None]:
def get_mg_hg_information(fig_info_raw_counts, fig_info_rankit, fig_info_log_cpm, metadata_field):
    """
    Collects one field from the raw_counts, quantile normalization and log (CPM) figure dictionaries into a flat list
    mg stands for marker_gene; hg stands for housekeeping genes

    param fig_info_raw_counts: dictionary containing mappings between fig names and fig objects for raw counts data
    param fig_info_rankit: dictionary containing mappings between fig names and fig objects for quantile normalization data
    param fig_info_log_cpm: dictionary containing mappings between fig names and fig objects for log (CPM) normalized data
    param metadata_field: key prefix to look up; '_mg' and '_hg' are appended to it

    :returns list of six values: marker gene entries for raw counts, quantile normalization and log (CPM),
        followed by the housekeeping gene entries in the same order
    """
    sources = (fig_info_raw_counts, fig_info_rankit, fig_info_log_cpm)
    collected = []
    for suffix in ('_mg', '_hg'):
        key = metadata_field + suffix
        collected.extend(source[key] for source in sources)
    return collected

In [None]:
def plot_genes(cell_type, tissue, X_raw, X_rankit, X_log_cpm, num_genes_expressed, housekeeping_genes, num_marker_genes, genes):
    """
    Generates the plots for a given cell_type - gene expression values (raw or normalized) as a function of number of genes expressed in a cell
    for a set of marker and housekeeping genes. Also computes Pearson and R2 correlation and LOESS and Linear Regression trendlines.

    Each figure row pairs one randomly chosen marker gene with one housekeeping gene. The number of rows is the
    smallest of num_marker_genes, the number of marker genes available and the number of housekeeping genes given.

    param cell_type: cell_type to generate the plots for
    param tissue: tissue used to look up the marker genes of the cell_type
    param X_raw: raw counts data
    param X_rankit: quantile normalized data
    param X_log_cpm: log (CPM) normalized data
    param num_genes_expressed: vector containing the counts for number of genes expressed in the given cell_type
    param housekeeping_genes: housekeeping genes to plot
    param num_marker_genes: indicates how many marker genes to select.
    param genes: the set of corresponding genes from the census

    returns:
        corr_df_mg: df containing correlation values for marker genes
        corr_df_hg: df containing correlation values for housekeeping genes
        r2_df_mg: df containing R2 coeff of determination values for marker genes
        r2_df_hg: df containing R2 coeff of determination values for housekeeping genes
        fig: final figure 

    raises ValueError if there is no marker/housekeeping gene pair to plot
    """
    gene_expression_markers = get_marker_genes(tissue, cell_type)
    # Use the same row count for both gene lists so the per-gene statistics and the gene
    # name columns of the returned DataFrames always have equal length.
    num_rows = min(num_marker_genes, len(gene_expression_markers), len(housekeeping_genes))
    if num_rows <= 0:
        raise ValueError("No marker/housekeeping gene pairs to plot for " + cell_type + ", " + tissue)

    gene_markers = np.random.choice(gene_expression_markers, num_rows, replace = False)
    housekeeping_genes_small = list(housekeeping_genes[:num_rows])

    figures = []
    trendlines_loess = []
    trendlines_lr = []
    titles = []
    pearson_correlations_mg = []
    pearson_correlations_hg = []
    r2s_mg = []
    r2s_hg = []

    for row_num, (gene_marker, housekeeping_gene) in enumerate(zip(gene_markers, housekeeping_genes_small)):
        fig_info_raw_counts = get_fig(cell_type, X_raw, gene_marker, housekeeping_gene, num_genes_expressed, 'Raw Counts', row_num, 0, genes, showlegend = True)
        fig_info_rankit = get_fig(cell_type, X_rankit, gene_marker, housekeeping_gene, num_genes_expressed, 'Quantile Normalization', row_num, 1, genes)
        fig_info_log_cpm = get_fig(cell_type, X_log_cpm, gene_marker, housekeeping_gene, num_genes_expressed, 'log CPM', row_num, 2, genes)
        fig_infos = (fig_info_raw_counts, fig_info_rankit, fig_info_log_cpm)

        # Each call yields six values: three marker gene entries, then three housekeeping gene entries
        figures.append(get_mg_hg_information(*fig_infos, 'fig'))
        trendlines_loess.append(get_mg_hg_information(*fig_infos, 'trendline_loess'))
        trendlines_lr.append(get_mg_hg_information(*fig_infos, 'trendline_lr'))
        titles += get_mg_hg_information(*fig_infos, 'title')

        pearson_correlations = get_mg_hg_information(*fig_infos, 'pearson_corr')
        pearson_correlations_mg.append(pearson_correlations[:3])
        pearson_correlations_hg.append(pearson_correlations[3:])

        r2s = get_mg_hg_information(*fig_infos, 'r2')
        r2s_mg.append(r2s[:3])
        r2s_hg.append(r2s[3:])

    pearson_correlations_mg = np.array(pearson_correlations_mg)
    pearson_correlations_hg = np.array(pearson_correlations_hg)
    r2s_mg = np.array(r2s_mg)
    r2s_hg = np.array(r2s_hg)

    corr_df_mg = pd.DataFrame({'raw_counts_corr' : pearson_correlations_mg[:, 0], 'rankit_corr' : pearson_correlations_mg[:, 1], 'log_cpm_corr' : pearson_correlations_mg[:, 2], 'gene' : gene_markers})
    corr_df_hg = pd.DataFrame({'raw_counts_corr' : pearson_correlations_hg[:, 0], 'rankit_corr' : pearson_correlations_hg[:, 1], 'log_cpm_corr' : pearson_correlations_hg[:, 2], 'gene' : housekeeping_genes_small})
    r2_df_mg = pd.DataFrame({'raw_counts_r2' : r2s_mg[:, 0], 'rankit_r2' : r2s_mg[:, 1], 'log_cpm_r2' : r2s_mg[:, 2], 'gene' : gene_markers})
    r2_df_hg = pd.DataFrame({'raw_counts_r2' : r2s_hg[:, 0], 'rankit_r2' : r2s_hg[:, 1], 'log_cpm_r2' : r2s_hg[:, 2], 'gene' : housekeeping_genes_small})

    # Six columns per row: marker gene under the three methods, then housekeeping gene under the three methods
    fig = make_subplots(rows=num_rows, cols=6, subplot_titles = titles, y_title='Y: Expression Value', x_title='X: Total Gene Count',) 

    for row, row_figures in enumerate(figures):
        for col, figure in enumerate(row_figures):
            fig.add_trace(figure, row=row + 1, col=col + 1)
            fig.add_trace(trendlines_loess[row][col], row=row + 1, col=col + 1)
            fig.add_trace(trendlines_lr[row][col], row=row + 1, col=col + 1)
    return corr_df_mg, corr_df_hg, r2_df_mg, r2_df_hg, fig

In [None]:
def generate_analyses(cell_type_id, tissue, num_marker_genes):
    """
    Generate analyses for a given cell_type and tissue: retrieves and normalizes the data,
    builds the marker/housekeeping gene figure and displays it

    param cell_type_id: CL ontology ID of desired cell_type
    param tissue: desired tissue
    param num_marker_genes: how many marker genes (figure rows) to plot

    raises ValueError if no cells are left to plot for the cell_type and tissue
    """
    cell_type = cell_type_mapping[cell_type_id]
    X_raw, X_rankit, X_log_cpm, genes = get_raw_count_data_and_normalize(cell_type, tissue, ASSAYS)
    if X_raw is None or X_raw.shape[0] == 0:
        raise ValueError("No cells available for " + cell_type + ", " + tissue)

    num_cells = X_raw.shape[0]
    # Count the non-zero genes of every cell in one pass instead of slicing row by row.
    # Kept as a plain list because get_fig concatenates it with itself.
    num_genes_expressed = np.asarray((X_raw != 0).sum(axis=1)).ravel().tolist()
    corr_df_mg, corr_df_hg, r2_df_mg, r2_df_hg, fig = plot_genes(cell_type, tissue, X_raw, X_rankit, X_log_cpm, num_genes_expressed, HOUSEKEEPING_GENES, num_marker_genes, genes)
    fig.update_layout(autosize=False,width=1500,height=1500, title_text =  cell_type + ", " + tissue + ", N = " + str(num_cells), title_x=0.5, title_y = 0.98, margin = {'t' : 200}, title_font=dict(size=30))
    fig.update_traces(marker=dict(size=2))
    display(fig)

### alveolar type 2 fibroblast cell, lung

In [None]:
generate_analyses('CL:4028006', 'lung', 5)

### syncytiotrophoblast cell, placenta

In [None]:
generate_analyses('CL:0000525', 'placenta', 4)

### plasma cell, lymph node

In [None]:
generate_analyses('CL:0000786', 'lymph node', 4)

### plasma cell, bone marrow

In [None]:
generate_analyses('CL:0000786', 'bone marrow', 5)

### T cell, kidney

In [None]:
generate_analyses('CL:0000084', 'kidney', 5)