# ANOVA on average normalized gene expression values
Results of one-way repeated measures ANOVA conducted on marker and housekeeping genes for five (cell type, tissue) pairs <br>
Note that throughout the notebook, we use *'rankit'* and *'quantile normalization'* interchangeably. A brief overview can be found <a href = 'https://en.wikipedia.org/wiki/Quantile_normalization'>here</a>.

In [18]:
import json
import plotly.figure_factory as ff
import plotly.express as px
import pingouin as pg
import plotly.graph_objects as go
import numpy as np
np.random.seed(123)
import plotly.offline as pyo

## Marker + Housekeeping Genes

In [2]:
# Cells with fewer expressed (non-zero) genes than this are dropped before normalization
NUM_MIN_EXPRESSED_GENES_PER_CELL = 500
# Raw counts less than or equal to this value are treated as noise and zeroed out after normalization
NUM_MIN_ULTRA_LOW_EXPRESSED_GENES = 2
# Census release passed to cellxgene_census.open_soma
CENSUS_VERSION = "2023-10-18"
# Assay names used in the census value filter; single quotes are already backslash-escaped here
# because the query builder inserts assay names verbatim between single quotes
ASSAYS = ['sci-RNA-seq', 'Seq-Well', 'Drop-seq', 'CEL-seq2', "10x 3\\' v1", "10x 5\\' v1", "10x 3\\' v2", "10x 5\\' v2", "10x 3\\' v3", "10x 3\\' transcription profiling", "10x 5\\' transcription profiling", "10x technology"]
# Gene symbols analysed as the 'Housekeeping Gene' group
HOUSEKEEPING_GENES = ['MALAT1', 'ACTB', 'GAPDH', 'UBC', 'SDHA', 'YWHAZ', 'PGK1', 'PPIA', 'RPL13A', 'RPLP0', 'B2M']
# NOTE(review): not referenced by the code visible in this notebook - confirm whether it is still needed
NUM_MARKER_GENES = 5

# Read both JSON inputs through context managers so the file handles are closed
# as soon as the contents have been read (the bare open(...).read() form left them open).
with open('../data/data_tidy/overlapping_categories.json') as cell_types_file:
    contents_cell_types = cell_types_file.read()
with open('../data/data_raw/hubmap_marker_genes.json') as marker_genes_file:
    contents_marker_genes = marker_genes_file.read()
tissue2cell_types = json.loads(contents_cell_types)
# Indexed below by cell type ID; each entry is read for its 'tissue' and 'symbol' fields
marker_genes = json.loads(contents_marker_genes)

# Cell type ID -> human-readable cell type label
# (the 'CL:' prefix presumably denotes Cell Ontology identifiers - confirm)
cell_type_mapping = {
    'CL:4028006': 'alveolar type 2 fibroblast cell',
    'CL:0000525': 'syncytiotrophoblast cell',
    'CL:0000786': 'plasma cell',
    'CL:0000895': 'naive thymus-derived CD4-positive, alpha-beta T cell',
    'CL:0000909': 'CD8-positive, alpha-beta memory T cell',
    'CL:0000071': 'blood vessel endothelial cell',
    'CL:0000899': 'T-helper 17 cell',
    'CL:0000084': 'T cell',
    'CL:1001106': 'kidney loop of Henle thick ascending limb epithelial cell',
}
# Reverse lookup: cell type label -> cell type ID
mapping2cell_type = dict(zip(cell_type_mapping.values(), cell_type_mapping.keys()))

def get_marker_genes(tissue, cell_type):
    """
    Looks up the marker gene symbols listed for a cell type in a given tissue

    :param tissue: tissue that a marker gene entry must be annotated with
    :param cell_type: cell type label; must be a key of `mapping2cell_type`

    :return: list of gene symbols, in the order in which they appear in `marker_genes`
    """
    cell_type_id = mapping2cell_type[cell_type]
    symbols = []
    for entry in marker_genes[cell_type_id]:
        if entry['tissue'] == tissue:
            symbols.append(entry['symbol'])
    return symbols

## Rankit Implementation

In [3]:
import cellxgene_census
import numpy as np
import numba as nb
from scipy import stats
import scipy 
import pandas as pd
import scanpy as sc


@nb.jit
def quantiles(max_rank: int, ranks: np.ndarray) -> np.ndarray:
    """
    Converts ranks to probability levels using (rank - 0.5) / max_rank, rounded to 5 decimals.
    The levels are only evenly spaced when the ranks themselves are.

    :param max_rank: value the shifted ranks are divided by (the caller passes the largest rank)
    :param ranks: ranks to convert
    :returns an array holding one probability level per entry of `ranks`
    """
    # The 0.5 shift keeps the level of the largest rank below 1 (and that of rank 1 above 0)
    return np.array([np.round((i - 0.5) / max_rank, 5) for i in ranks])


def rankit(Xraw: scipy.sparse.spmatrix, offset: float = 3.0) -> scipy.sparse.csr_matrix:
    """
    Row-wise normalizes the stored values of a sparse matrix using the rankit method.
    https://en.wikipedia.org/wiki/Rankit

    For every row, the stored values are dense-ranked, the ranks are converted to probability
    levels with `quantiles`, and the levels are mapped through the quantile function of a normal
    distribution with standard deviation 1 centred on `offset`.

    Ties: equal values receive the same rank (dense ranking) and therefore the same normalized value.
    Only stored entries take part in the ranking; entries absent from the sparse structure stay absent.

    param Xraw: query matrix to be normalized (it is copied, not modified)
    param offset: centre of the target normal distribution. This helps to shift values to a positive scale.
    :returns row-wise normalized matrix using rankit, in CSR format
    """
    X = Xraw.tocsr(copy=True)  # get Compressed Sparse Row format of raw expression values matrix
    if not np.issubdtype(X.dtype, np.floating):
        # The normalized values are written back into X.data; with an integer buffer they would be
        # silently truncated to whole numbers, so switch to a floating point buffer first.
        X = X.astype(np.float64)
    indptr = X.indptr  # row boundaries inside X.data
    warning_raised = False
    for row in range(indptr.shape[0] - 1):
        start, stop = indptr[row], indptr[row + 1]
        data = X.data[start:stop]
        if len(data) > 0:
            # Assign ranks to data, assigning the same value to ties
            ranks = stats.rankdata(data, method="dense")

            max_rank = ranks.max()
            prob_level = quantiles(max_rank, ranks)

            normal_quantiles = stats.norm.ppf(prob_level, loc=offset)
            X.data[start:stop] = normal_quantiles
        elif not warning_raised:
            # Reported only once per call, however many empty rows there are
            print("This dataset has at least one row of all zero expressions")
            warning_raised = True
    return X

## Analyses

In [4]:
def get_census_query(cell_types, tissues, assays):
    """
    Returns a value filter query for the census to retrieve data for 
    a given array of cell types, tissues and assays

    Single quotes inside cell type names are backslash-escaped. Tissue and assay names are inserted
    verbatim, so callers must pass them already escaped (as the ASSAYS constant does).

    An empty array yields an empty list literal (`in []`) instead of a query with part of its text
    cut off. NOTE(review): whether the census accepts an empty list in a value filter was not verified.

    :param cell_types: the cell_types to retrieve from the census
    :param tissues: the tissues the cell_types are found in
    :param assays: the assays corresponding to the cell_types and tissues

    :return: the value_filter query
    """
    def quoted_list(values):
        # Renders ['a', 'b'] exactly as the census filter expects it
        return "[" + ", ".join("'" + value + "'" for value in values) + "]"

    escaped_cell_types = [cell_type.replace("'", "\\'") for cell_type in cell_types]
    return (
        "is_primary_data == True and cell_type in " + quoted_list(escaped_cell_types)
        + " and tissue_general in " + quoted_list(tissues)
        + " and assay in " + quoted_list(assays)
    )

In [5]:
def get_census_metadata(cell_types, tissues, assays, census):
    """
    Reads the cell (obs) metadata matching the given cell_types, tissues and assays from the
    human experiment of the census, together with the full gene (var) table

    :param cell_types: the cell_types to retrieve metadata for
    :param tissues: the tissues the cell_types are found in
    :param assays: the assays corresponding to the cell_types and tissues
    :param census: an open census handle

    :return:
        metadata: pandas dataframe with one row per matching cell
        census_query: the value filter used to retrieve the metadata from cellxgene_census
        genes: pandas dataframe of the RNA measurement's var table
    """
    census_query = get_census_query(cell_types, tissues, assays)
    human = census["census_data"]["homo_sapiens"]

    obs_columns = [
        "soma_joinid",
        "assay",
        "dataset_id",
        "cell_type",
        "tissue",
        "tissue_general",
        "suspension_type",
        "disease",
        "donor_id",
        "raw_sum",
    ]
    obs_reader = human.obs.read(value_filter=census_query, column_names=obs_columns)

    # The gene table is read in full; no filter is applied to it
    genes = human.ms["RNA"].var.read().concat().to_pandas()

    # Concatenate the obs slices into one table and convert it to pandas
    metadata = obs_reader.concat().to_pandas()

    print('There are', len(metadata), 'observations')
    return metadata, census_query, genes

### Get X iteratively and perform normalizations
**1. Pre-processing**
- *Remove cells with <500 expressed genes*

**2. Normalizations**
1. Rankit
2. log CPM

**3. Post-processing** <br>
- *Removal of Noisy Ultra-low Expression Values* <br>
After applying normalization, any gene/cell combination whose raw count is less than or equal to 2 is set to missing data. This removes noise from ultra-lowly expressed genes and provides a cleaner visualization.


In [6]:
from cellxgene_census.experimental.util import X_sparse_iter
import tiledbsoma as soma

def get_raw_count_data_and_normalize(cell_type, tissue, assays):
    """
    Retrieves raw counts data from the census for a given cell_type, tissue and set of assays 
    and performs quantile normalization and log (CPM) normalizations. Data is retrieved in batches.

    Per batch: cells with fewer than NUM_MIN_EXPRESSED_GENES_PER_CELL expressed genes are dropped,
    rankit is computed on the remaining raw counts, and every entry whose raw count is
    <= NUM_MIN_ULTRA_LOW_EXPRESSED_GENES is then removed from the raw, rankit and log (CPM) matrices.

    :param cell_type: cell_type to get data for
    :param tissue: corresponding tissue for the cell_type
    :param assays: assays corresponding to the cell_type and tissue

    :return:
        all_X_raw_counts: raw counts data (None if the query produced no batch)
        all_X_rankit: quantile normalized data (None if the query produced no batch)
        all_X_log_cpm: log (CPM) normalized data (None if the query produced no batch)
        metadata: metadata of the cells that were kept, with an added positional column 'm_idx'
        genes: the set of corresponding genes from the census
        obs_soma_joinid2idx: mapping from a cell's soma_joinid to its row in the returned matrices
        var_idx2soma_joinid: mapping from a column of the returned matrices to the gene's soma_joinid
    """
    with cellxgene_census.open_soma(census_version=CENSUS_VERSION) as census:
        tissues = [tissue]
        cell_types = [cell_type]

        metadata, value_filter, genes = get_census_metadata(cell_types, tissues, assays, census)
        # Library sizes keyed by soma_joinid, so that each batch can fetch them in its own row order
        # instead of relying on the metadata rows being ordered like the batch rows
        raw_sum_by_joinid = metadata.set_index('soma_joinid')['raw_sum']

        exp = census["census_data"]["homo_sapiens"]
        query = exp.axis_query(
            measurement_name = "RNA",
            obs_query = soma.AxisQuery(
                value_filter = value_filter
            ))
        all_obs_soma_joinids = []
        all_var_soma_joinids = []
        all_X_raw_counts = None
        all_X_rankit = None
        all_X_log_cpm = None

        i = 0
        for (obs_soma_joinids, var_soma_joinids), X_chunk in X_sparse_iter(query, X_name="raw", stride=10000):
            print('Parsing chunk Batch:', i)

            # Remove cells with < 500 expressed genes
            n_obs = X_chunk.shape[0]
            non_zero = [X_chunk[row, :].count_nonzero() for row in range(n_obs)]
            mask = [count >= NUM_MIN_EXPRESSED_GENES_PER_CELL for count in non_zero]
            obs_soma_joinids = obs_soma_joinids[mask]

            all_obs_soma_joinids.extend(obs_soma_joinids)
            if not all_var_soma_joinids:
                # Record the gene ids only once: appending them for every batch would make the
                # column index map longer than the matrices have columns
                all_var_soma_joinids.extend(var_soma_joinids)

            X_chunk = X_chunk[mask, :]

            print('\tCompute rankit values Batch:', i)
            # Compute rankit values (on the counts before the ultra-low entries are removed)
            X_rankit = rankit(X_chunk)

            # Removal of ultra-low expressed genes
            nonzero_mask = X_chunk.nonzero()
            X_chunk_nonzero = X_chunk[nonzero_mask]
            lowly_expressed_mask = np.array(X_chunk_nonzero <= NUM_MIN_ULTRA_LOW_EXPRESSED_GENES)[0]

            nonzero_rows_indices = nonzero_mask[0][lowly_expressed_mask]
            nonzero_cols_indices = nonzero_mask[1][lowly_expressed_mask]

            X_rankit[nonzero_rows_indices, nonzero_cols_indices] = 0
            X_rankit.eliminate_zeros()

            X_chunk[nonzero_rows_indices, nonzero_cols_indices] = 0
            X_chunk.eliminate_zeros()

            # Compute log CPM values
            print('\tCompute log CPM values Batch:', i)
            library_sizes = raw_sum_by_joinid.loc[obs_soma_joinids].to_numpy().reshape(-1, 1)
            X_cpm = X_chunk / library_sizes * 1e6
            X_log_cpm = scipy.sparse.csr_matrix(np.log(X_cpm + 1))

            # X_chunk was already zeroed at these coordinates above; kept as a safeguard so the
            # three matrices are guaranteed to drop the same entries
            X_log_cpm[nonzero_rows_indices, nonzero_cols_indices] = 0
            X_log_cpm.eliminate_zeros()

            if i == 0:
                all_X_raw_counts = X_chunk
                all_X_rankit = X_rankit
                all_X_log_cpm = X_log_cpm
            else:
                all_X_raw_counts = scipy.sparse.vstack((all_X_raw_counts, X_chunk))
                all_X_rankit = scipy.sparse.vstack((all_X_rankit, X_rankit))
                all_X_log_cpm = scipy.sparse.vstack((all_X_log_cpm, X_log_cpm))
            i += 1
            # For each X_chunk[i, j], the associated soma_joinid is
            # obs_soma_joinids[i] and var_soma_joinids[j]

    # Work on an explicit copy: adding a column to a filtered slice is chained assignment
    metadata = metadata[metadata['soma_joinid'].isin(all_obs_soma_joinids)].copy()
    metadata['m_idx'] = list(range(len(metadata)))

    obs_soma_joinid2idx = {x : i for i, x in enumerate(all_obs_soma_joinids)}
    var_idx2soma_joinid = {i : x for i, x in enumerate(all_var_soma_joinids)}
    return all_X_raw_counts, all_X_rankit, all_X_log_cpm, metadata, genes, obs_soma_joinid2idx, var_idx2soma_joinid

In [7]:
def get_average_ge(X):
    """
    Get average gene expression values for raw or normalized matrix X

    The average of a gene (column) is taken over the cells (rows) that have a stored entry for it,
    not over all cells; genes without any stored entry get an average of 0. Explicitly stored
    zeros count as entries, so call `eliminate_zeros()` beforehand if they should be ignored.

    :param X: raw or normalized (quantile, log CPM) sparse matrix of gene expression values, cells x genes

    :return: array containing average gene expression values, where each entry corresponds to the average for a specific gene
    """
    # In CSR layout `indices` holds column numbers; converting first keeps the result correct
    # for other sparse layouts (for a CSC matrix `indices` would hold row numbers instead)
    X = X.tocsr()
    num_genes = X.shape[1]
    gene_sums = np.asarray(X.sum(axis=0)).ravel()
    entries_per_gene = np.bincount(X.indices, minlength=num_genes)
    averages = np.zeros(num_genes)
    # Divide only where a gene has at least one entry; the remaining averages stay 0
    np.divide(gene_sums, entries_per_gene, out=averages, where=entries_per_gene > 0)
    return averages

In [8]:
def get_covariate_df(metadata, genes, X_raw, X_rankit, X_log_cpm, obs_soma_joinid2idx, covariate, genes_of_interest):
    """
    Computes average gene expression values per covariate

    The X matrices must already be restricted to the columns of `genes_of_interest`, in the same order,
    because the resulting dataframes are indexed by `genes_of_interest`.

    :param metadata: dataframe containing metadata information about the cell type considered
    :param genes: the set of corresponding genes from the census (kept for interface compatibility; not used)
    :param X_raw: raw counts data
    :param X_rankit: quantile normalized data
    :param X_log_cpm: log (CPM) normalized data
    :param obs_soma_joinid2idx: mapping from obs_soma_join_id to idx in the X matrices
    :param covariate: covariate to consider (dataset_id or assay)
    :param genes_of_interest: genes to subset from the census

    :returns:
        covariate_raw_df: average gene expression values per covariate for raw counts
        covariate_rankit_df: average gene expression values per covariate for quantile normalized (rankit) data
        covariate_log_cpm_df: average gene expression values per covariate for log CPM normalized data
        covariate_df: average gene expression values per covariate aggregated over raw, quantile normalized and log CPM data
    """
    covariates_grouped = metadata.groupby(covariate).aggregate(list)
    all_covariate_indices = covariates_grouped['soma_joinid'].to_list()
    covariates_list = covariates_grouped.index.to_list()
    num_covariates = len(all_covariate_indices)
    print('There are', num_covariates, covariate)

    gene_expression_averages_raw = []
    gene_expression_averages_rankit = []
    gene_expression_averages_log_cpm = []

    all_indices = []
    for soma_join_ids in all_covariate_indices:
        # Rows of the X matrices that belong to this covariate value
        covariate_indices = [obs_soma_joinid2idx[x] for x in soma_join_ids]
        all_indices.extend(covariate_indices)

        # Only the averages are needed; no dense copy of the per-covariate matrices is materialized
        gene_expression_averages_raw.append(get_average_ge(X_raw[covariate_indices]))
        gene_expression_averages_rankit.append(get_average_ge(X_rankit[covariate_indices]))
        gene_expression_averages_log_cpm.append(get_average_ge(X_log_cpm[covariate_indices]))

    # Column layout: the three cell-type-wide averages first, then one block per normalization
    columns = (
        ['cell_type_raw_count_avg', 'cell_type_rankit_avg', 'cell_type_log_cpm_avg']
        + [x + '_raw_count_avg' for x in covariates_list]
        + [x + '_rankit_avg' for x in covariates_list]
        + [x + '_log_cpm_avg' for x in covariates_list]
    )
    covariate_df = pd.DataFrame(columns = columns)

    # Averages over all cells of the cell type, regardless of covariate value
    covariate_df['cell_type_raw_count_avg'] = get_average_ge(X_raw[all_indices])
    covariate_df['cell_type_rankit_avg'] = get_average_ge(X_rankit[all_indices])
    covariate_df['cell_type_log_cpm_avg'] = get_average_ge(X_log_cpm[all_indices])

    for cov, cov_raw, cov_rankit, cov_log_cpm in zip(covariates_list, gene_expression_averages_raw, gene_expression_averages_rankit, gene_expression_averages_log_cpm):
        covariate_df[cov + '_raw_count_avg'] = cov_raw
        covariate_df[cov + '_rankit_avg'] = cov_rankit
        covariate_df[cov + '_log_cpm_avg'] = cov_log_cpm

    covariate_df.index = genes_of_interest

    # Split by normalization using the column name suffixes assigned above
    all_columns = covariate_df.columns.to_list()
    covariate_raw_df = covariate_df[[x for x in all_columns if 'raw_count' in x]]
    covariate_rankit_df = covariate_df[[x for x in all_columns if 'rankit' in x]]
    covariate_log_cpm_df = covariate_df[[x for x in all_columns if 'log_cpm' in x]]

    return covariate_raw_df, covariate_rankit_df, covariate_log_cpm_df, covariate_df

In [9]:
from statsmodels.stats.anova import AnovaRM
from collections import Counter
import pingouin

def get_anova_rm_score(covariate_df_mg, genes_of_interest):
    """
    Computes one-way ANOVA with repeated measures on a dataframe containing average gene expression values (raw or normalized)
    by treating each average gene expression value as the dependent variable and the covariate as the independent one.
    Genes act as the repeated-measures subjects.

    :param covariate_df_mg: dataframe (genes x columns) containing average gene expression values (raw or normalized)
        for a given covariate. Its first column is skipped (callers pass the cell-type-wide average there);
        every remaining column is one covariate level.
    :param genes_of_interest: genes we are interested in; each must be a row label of `covariate_df_mg`

    :returns:
        f: F statistic of the one-way ANOVA with repeated measures (-1 if it was not computed)
        p_val: uncorrected p-value (-1 if it was not computed)
        anova_df: long-format dataframe (gene, covariate, gene_avg) the test was run on
    """
    # Transpose so that every row is one covariate level, and drop the first (cell-type-wide) row
    cov_df_for_anova_rm = covariate_df_mg.T[1:].reset_index().rename(columns = {'index' : 'covariate'})
    covariates = cov_df_for_anova_rm['covariate'].to_list()

    # Long format: one row per (gene, covariate level)
    all_genes = []
    all_covariates = []
    gene_averages = []
    for gene in genes_of_interest:
        for covariate_level, gene_average in zip(covariates, cov_df_for_anova_rm[gene].to_list()):
            all_genes.append(gene)
            all_covariates.append(covariate_level)
            gene_averages.append(gene_average)

    anova_df = pd.DataFrame({'gene' : all_genes, 'covariate' : all_covariates, 'gene_avg' : gene_averages})
    if len(anova_df) > 2:
        # gene average is treated as the dependent variable and the covariate as the independent one
        anova_rm = pg.rm_anova(dv='gene_avg', within=['covariate'], subject='gene', data=anova_df)
        f = anova_rm['F'].values[0]
        p_val = anova_rm['p-unc'].values[0]
    else:
        # Too few observations to run the test; -1 is the sentinel for "not computed"
        f = -1
        p_val = -1

    return f, p_val, anova_df

In [10]:
def get_anova_rm_scores(cell_types, covariates = ['dataset_id', 'assay'], all_genes = False):
    """
    Computes one-way repeated measures ANOVA scores for a given set of [(cell_type, tissue)] pairs,
    once per covariate and once per gene group (marker genes, housekeeping genes).
    Computes values for raw counts, quantile normalization and log (CPM)

    :param cell_types: array of [(cell_type, tissue)] pairs
    :param covariates: covariates to consider
    :param all_genes: True if to consider the entire gene expression vector; False if to consider only the marker genes

    :return: dataframe with one row per (cell type and tissue, covariate, method, gene type)
        holding the F score and p-value. Pairs for which no cell was retrieved are skipped.
    """
    anova_rm_f_scores = []
    anova_rm_p_vals = []
    anova_rm_method = []
    anova_rm_covariate = []
    anova_rm_cell_type = []
    anova_rm_types = []

    for cell_type, tissue in cell_types:

        X_raw_counts, X_rankit, X_log_cpm, metadata, genes, obs_soma_joinid2idx, var_idx2soma_joinid = get_raw_count_data_and_normalize(cell_type, tissue, ASSAYS)
        if len(metadata) == 0:
            continue

        # Invert column index -> gene soma_joinid into gene soma_joinid -> column index.
        # (Copying the dict without swapping keys and values only works while every joinid
        # happens to equal its column position.) The first position wins if a joinid is listed
        # more than once.
        var_soma_joinid2idx = {}
        for idx, joinid in var_idx2soma_joinid.items():
            var_soma_joinid2idx.setdefault(joinid, idx)

        # Gene symbol -> soma_joinid, built once instead of scanning the gene table for every gene;
        # the first row wins if a symbol occurs more than once.
        feature_name2joinid = {}
        for feature_name, joinid in zip(genes['feature_name'], genes['soma_joinid']):
            feature_name2joinid.setdefault(feature_name, joinid)

        def column_indices(gene_names):
            # Columns of the expression matrices holding the given gene symbols
            return [var_soma_joinid2idx[feature_name2joinid[name]] for name in gene_names]

        # Named differently from the module-level `marker_genes` dict to avoid shadowing it
        if all_genes:
            selected_marker_genes = genes['feature_name'].to_list()
        else:
            selected_marker_genes = get_marker_genes(tissue, cell_type)
        marker_genes_indices = column_indices(selected_marker_genes)
        hg_indices = column_indices(HOUSEKEEPING_GENES)

        for genes_indices, genes_type, genes_of_interest in zip([marker_genes_indices, hg_indices], ['Marker Gene', 'Housekeeping Gene'], [selected_marker_genes, HOUSEKEEPING_GENES]):
            for covariate in covariates:
                covariate_raw_df, covariate_rankit_df, covariate_log_cpm_df, covariate_df = get_covariate_df(metadata, genes, X_raw_counts[:, genes_indices], X_rankit[:, genes_indices], X_log_cpm[:, genes_indices], obs_soma_joinid2idx, covariate, genes_of_interest)

                f_raw, p_raw, data_anova_raw_df = get_anova_rm_score(covariate_raw_df, genes_of_interest)
                f_rankit, p_rankit, data_anova_rankit_df = get_anova_rm_score(covariate_rankit_df, genes_of_interest)
                f_log_cpm, p_log_cpm, data_anova_log_cpm_df = get_anova_rm_score(covariate_log_cpm_df, genes_of_interest)
                # Three rows per (gene group, covariate): one for each method, in this fixed order
                anova_rm_f_scores.extend([f_raw, f_rankit, f_log_cpm])
                anova_rm_p_vals.extend([p_raw, p_rankit, p_log_cpm])
                anova_rm_method.extend(['raw', 'rankit', 'log (CPM)'])
                anova_rm_types.extend([genes_type] * 3)
                anova_rm_covariate.extend([covariate] * 3)
                anova_rm_cell_type.extend([cell_type + ", " + tissue] * 3)

    anova_df = pd.DataFrame({'cell_type' : anova_rm_cell_type, 'covariate' : anova_rm_covariate, 'method' : anova_rm_method, 'F score' : anova_rm_f_scores, 'p_val' : anova_rm_p_vals, 'gene_type' : anova_rm_types})
    return anova_df

### Repeated Measures ANOVA Scores

In [11]:
# (cell type, tissue) pairs to analyse; 'plasma cell' is studied in two different tissues
cell_types = [
    ('plasma cell', 'lymph node'),
    ('T cell', 'kidney'),
    ('alveolar type 2 fibroblast cell', 'lung'),
    ('syncytiotrophoblast cell', 'placenta'),
    ('plasma cell', 'bone marrow'),
]
anova_df = get_anova_rm_scores(
    cell_types,
    covariates=['dataset_id', 'assay'],
    all_genes=False,
)

There are 4374 observations
Parsing chunk Batch: 0
	Compute rankit values Batch: 0
	Compute log CPM values Batch: 0


  self._set_arrayXarray(i, j, x)


There are 5 dataset_id
There are 5 assay
There are 5 dataset_id
There are 5 assay
There are 3111 observations
Parsing chunk Batch: 0
	Compute rankit values Batch: 0
	Compute log CPM values Batch: 0


  self._set_arrayXarray(i, j, x)


There are 5 dataset_id
There are 3 assay
There are 5 dataset_id
There are 3 assay
There are 13961 observations
Parsing chunk Batch: 0
	Compute rankit values Batch: 0
	Compute log CPM values Batch: 0


  self._set_arrayXarray(i, j, x)


Parsing chunk Batch: 1
	Compute rankit values Batch: 1
	Compute log CPM values Batch: 1
There are 2 dataset_id
There are 4 assay
There are 2 dataset_id
There are 4 assay
There are 38465 observations
Parsing chunk Batch: 0
	Compute rankit values Batch: 0
	Compute log CPM values Batch: 0


  self._set_arrayXarray(i, j, x)


Parsing chunk Batch: 1
	Compute rankit values Batch: 1
	Compute log CPM values Batch: 1


  self._set_arrayXarray(i, j, x)


Parsing chunk Batch: 2
	Compute rankit values Batch: 2
	Compute log CPM values Batch: 2
Parsing chunk Batch: 3
	Compute rankit values Batch: 3
	Compute log CPM values Batch: 3
There are 3 dataset_id
There are 3 assay
There are 3 dataset_id
There are 3 assay
There are 1835 observations
Parsing chunk Batch: 0
	Compute rankit values Batch: 0
	Compute log CPM values Batch: 0


  self._set_arrayXarray(i, j, x)


There are 3 dataset_id
There are 4 assay
There are 3 dataset_id
There are 4 assay


In [19]:
import plotly.express as px
#only genes that are expressed
colors = ["#88d5d4", "#f7cb6c", "#f3a5ac", "#619c80", "#df7670", "#b19fc9"]

# One panel per (covariate, gene type); one point per cell type and normalization method
fig = px.scatter(
    anova_df,
    x='method',
    y="p_val",
    facet_row="covariate",
    facet_col="gene_type",
    color="cell_type",
    color_discrete_sequence=colors,
)
fig.update_layout(
    width=1000,
    height=800,
    title_text="<b>Repeated Measures ANOVA Scores per covariate</b> <br> H0: average normalized gene expression is the same in all covariate entities <br><br>",
    title_x=0.5,
    title_y=0.98,
)
# Dashed line at the 0.05 significance level; the band below it is shaded
fig.add_hline(
    y=0.05,
    line_width=3,
    line_dash="dash",
    line_color="gray",
    annotation_text="p_val = 0.05",
)
fig.add_hrect(y0=0.05, y1=0, line_width=0, fillcolor="red", opacity=0.2)
fig.update_traces(marker=dict(size=7))
pyo.iplot(fig, filename='basic-line')

In [13]:
# Keep only the log (CPM) rows and collect every column into one list per covariate
anova_df_agg = (
    anova_df.loc[anova_df['method'] == 'log (CPM)']
    .groupby('covariate')
    .agg(list)
    .reset_index()
)
# p-values of all (cell type, gene type) combinations, per covariate
anova_df_assay_values = anova_df_agg.loc[anova_df_agg['covariate'] == 'assay', 'p_val'].to_list()[0]
anova_df_dataset_values = anova_df_agg.loc[anova_df_agg['covariate'] == 'dataset_id', 'p_val'].to_list()[0]

In [14]:
def get_sig_p_values(p_val_arr, sig_level = 0.05):
    """
    Counts the p-values that are strictly greater than the significance level,
    i.e. the NON-significant ones (despite what the function name suggests)

    :param p_val_arr: iterable of p-values
    :param sig_level: significance level to compare against

    :return: number of entries of p_val_arr that are > sig_level
    """
    return sum(1 for p_val in p_val_arr if p_val > sig_level)

In [15]:
# get_sig_p_values counts the p-values ABOVE the significance level, so this is the
# number of non-significant results for the assay covariate
num_sig_assay_p_vals = get_sig_p_values(anova_df_assay_values)
print(num_sig_assay_p_vals, '/', len(anova_df_assay_values), 'p-values are not significant for the assay covariate')

6 / 10 p-values are not significant for the assay covariate


In [16]:
# get_sig_p_values counts the p-values ABOVE the significance level, so this is the
# number of non-significant results for the dataset covariate
num_sig_dataset_p_vals = get_sig_p_values(anova_df_dataset_values)
print(num_sig_dataset_p_vals, '/', len(anova_df_dataset_values), 'p-values are not significant for the dataset covariate')

9 / 10 p-values are not significant for the dataset covariate
