### Get downstream genes
1. For each cluster, only include genes that are DE (FDR < 0.1) for at least one cluster member regulator.
2. Rank-based ordering of downstream genes: rank genes by z-score within each cluster member regulator, sum those ranks across cluster members, then rank the rank sums (reported separately for upregulation and downregulation).
3. Simple number of upstream regulators (both positive and negative) within cluster member regulators.
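4. Sign coherence of upstream regulators: the mean z-score sign over the cluster member regulators for which the gene is DE (+1 if all effects are positive, -1 if all are negative, intermediate values when signs are mixed).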

In [1]:
import pandas as pd
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

  from anndata import __version__ as anndata_version
  if Version(anndata.__version__) >= Version("0.11.0rc2"):
  if Version(anndata.__version__) >= Version("0.11.0rc2"):


In [2]:
# Root directory of the processed data; a relative path, so it resolves against the
# notebook's current working directory. All file paths below are built from it.
datadir = '../../../../3_expts/processed_data'

In [4]:
# Load the merged DE results and derive a z-score layer (log fold change / its standard error).
adata_de = sc.read_h5ad(datadir + '/CD4i_final/DE_results_all_confounders/CD4i_final.merged_DE_results_corrected.h5ad')
zscore = adata_de.layers['log_fc'] / adata_de.layers['lfcSE']

# Cap extreme positive z-scores at 50. NaN compares False here and is handled below.
# NOTE(review): only the upper tail is capped; confirm whether values below -50
# should be floored as well.
zscore[zscore > 50] = 50

# Replace NaN with 0. The previous call was `np.nan_to_num(z, 0)`, where the positional 0
# binds to `copy` (not `nan`), so the fill silently happened in place on the layer.
# Downstream cells read layers['zscore'] and need it NaN-free, so keep that effect but
# spell it out with keywords. (Remaining -inf values, if any, become the most negative
# finite float, as before.)
zscore = np.nan_to_num(zscore, copy=False, nan=0.0)

adata_de.layers['zscore'] = zscore
# X is assigned the same cleaned array (no extra copy is made here).
adata_de.X = zscore

In [5]:
# Load the AnnData object that carries the HDBSCAN cluster assignment of the regulators.
clustering_h5ad = datadir + '/analysis_largefiles/nde30ntotal75_varfiltered_clustering_additional_filtering.h5ad'
adata_final = sc.read_h5ad(clustering_h5ad)

In [6]:
# Per-target DE summary statistics; the first CSV column is used as the index.
de_summary_stats = pd.read_csv(datadir + '/CD4i_final/DE_results_all_confounders/DE_summary_stats_per_target_corrected.csv', index_col=0)
# Treat a missing baseMean as 0. Bracket indexing is used for the write because
# attribute-style assignment (`df.baseMean = ...`) is discouraged by pandas.
de_summary_stats['baseMean'] = de_summary_stats['baseMean'].fillna(0)

In [7]:
# Show the summary repr of the clustered object (dimensions, obs/var columns, layers).
adata_final

AnnData object with n_obs × n_vars = 2336 × 13959
    obs: 'target_contrast_gene_name', 'culture_condition', 'target_contrast', 'chunk', 'n_cells_target', 'target_contrast_corrected', 'target_contrast_gene_name_corrected', 'hdbscan'
    var: 'gene_ids', 'gene_name'
    layers: 'adj_p_value', 'baseMean', 'lfcSE', 'log_fc', 'p_value', 'zscore'

In [8]:
# List the distinct cluster labels present in obs['hdbscan']; these are the values
# iterated over when building the per-cluster downstream-gene tables.
adata_final.obs.hdbscan.unique()

[37, 39, 58, 53, 15, ..., 18, 52, 79, 46, 63]
Length: 72
Categories (72, int64): [0, 1, 2, 3, ..., 106, 107, 108, 109]

In [9]:
# Inspect the per-observation metadata of the DE results (one row per target/condition contrast).
adata_de.obs

Unnamed: 0,target_contrast_gene_name,culture_condition,target_contrast,chunk,n_cells_target,target_contrast_corrected,target_contrast_gene_name_corrected
ENSG00000012963_Stim8hr,UBR7,Stim8hr,ENSG00000012963,0,491.0,ENSG00000012963,UBR7
ENSG00000017260_Stim8hr,ATP2C1,Stim8hr,ENSG00000017260,0,469.0,ENSG00000017260,ATP2C1
ENSG00000067606_Stim8hr,PRKCZ,Stim8hr,ENSG00000067606,0,427.0,ENSG00000067606,PRKCZ
ENSG00000092929_Stim8hr,UNC13D,Stim8hr,ENSG00000092929,0,830.0,ENSG00000092929,UNC13D
ENSG00000100504_Stim8hr,PYGL,Stim8hr,ENSG00000100504,0,414.0,ENSG00000100504,PYGL
...,...,...,...,...,...,...,...
ENSG00000198342_Rest,ZNF442,Rest,ENSG00000198342,680,324.0,ENSG00000198342,ZNF442
ENSG00000205572_Rest,SERF1B,Rest,ENSG00000205572,680,44.0,ENSG00000205572,SERF1B
ENSG00000221890_Rest,NPTXR,Rest,ENSG00000221890,680,293.0,ENSG00000221890,NPTXR
ENSG00000236320_Rest,SLFN14,Rest,ENSG00000236320,680,848.0,ENSG00000236320,SLFN14


In [10]:
# Build one table of candidate downstream genes per HDBSCAN cluster, then stack them.
# Frames are collected in a list and concatenated once; growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration.
fdr_cutoff = 0.1  # adjusted p-value threshold for calling a gene DE
cluster_frames = []
for cl in adata_final.obs.hdbscan.unique():
    # Regulators (observations) assigned to this cluster, matched by obs name in adata_de
    member_names = adata_final.obs_names[(adata_final.obs.hdbscan == cl).to_numpy()]
    adata_subset = adata_de[adata_de.obs_names.isin(member_names)].copy()
    # First only consider genes that are DE for at least one cluster member regulator
    de_in_any_member = (adata_subset.layers['adj_p_value'] < fdr_cutoff).sum(0) >= 1
    adata_subset = adata_subset[:, de_in_any_member].copy()

    is_de = adata_subset.layers['adj_p_value'] < fdr_cutoff
    zscore = adata_subset.layers['zscore']

    # Number of member regulators for which each gene is DE (>= 1 after the filter above,
    # so the division below is safe)
    num_of_upstream = is_de.sum(0)
    # Mean z-score sign over those regulators: +1 all positive, -1 all negative
    sign_coherence = (is_de * np.sign(zscore)).sum(0) / num_of_upstream

    # Rank genes by z-score within each regulator (ascending), sum the ranks per gene,
    # then rank the sums. Rank 1 in the "negative" column is the lowest rank sum; the
    # "positive" column ranks the negated sums. astype(int) truncates tied (x.5) ranks.
    rank_sum = stats.rankdata(zscore, axis=1).sum(0)
    zscore_rank_neg = stats.rankdata(rank_sum).astype(int)
    zscore_rank_pos = stats.rankdata(-rank_sum).astype(int)

    # Summarize into a DataFrame for this cluster
    df = pd.DataFrame({'hdbscan_cluster': cl,
                       'downstream_gene': adata_subset.var['gene_name'].tolist(),
                       'downstream_gene_ids': adata_subset.var['gene_ids'].tolist(),
                       'num_of_upstream': num_of_upstream,
                       'sign_coherence': sign_coherence,
                       'zscore_rank_negative_regulation': zscore_rank_neg,
                       'zscore_rank_positive_regulation': zscore_rank_pos,})
    cluster_frames.append(df)

# Keep each cluster's own 0..n-1 index (no ignore_index) so the table, and the CSV
# later written from it, has the same index column as before.
df_all = pd.concat(cluster_frames) if cluster_frames else pd.DataFrame()

In [11]:
# Write the combined per-cluster downstream-gene table to the analysis_largefiles folder.
downstream_genes_csv = datadir + '/analysis_largefiles/nde30ntotal75_varfiltered_clustering_additional_filtering_downstream_genes.csv'
df_all.to_csv(downstream_genes_csv)