### Get downstream genes
1. For each cluster, only include genes that are DE (FDR < 0.1) for at least one regulator in the cluster.
2. Rank-based scoring of downstream genes: rank genes by z-score within each cluster-member regulator, sum these ranks across cluster members, then rank genes by the rank sum (once for upregulation and once for downregulation).
3. Number of upstream regulators (positive or negative effect) among the cluster-member regulators for which the gene is DE.
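4. Sign coherence of upstream regulators: the mean z-score sign over the regulators for which the gene is DE (+1 if all effects are positive, -1 if all are negative, values in between for mixed signs).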

In [11]:
import pandas as pd
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [12]:
# Root directory of the processed data; every input and output path in this
# notebook is built by appending to this prefix.
datadir = '../../../../3_expts/processed_data'

In [None]:
# Merged DE results, opened in read-only backed mode (backed='r'); per-cluster
# slices are loaded into memory later with `.to_memory()`.
adata_de = sc.read_h5ad(datadir + '/CD4i_final/DE_results_all_confounders/CD4i_final.merged_DE_results_corrected.h5ad', backed='r')

In [4]:
# Clustering result, opened in read-only backed mode (backed='r').
# `obs.hdbscan` holds the cluster label used to group regulators below.
adata_final = sc.read_h5ad(datadir + '/analysis_largefiles/nde50ntotal100_varfiltered_clustering_additional_filtering_10232025.h5ad', backed='r')

In [5]:
# Per-target DE summary statistics; the first CSV column is used as the index.
de_summary_stats = pd.read_csv(datadir + '/CD4i_final/DE_results_all_confounders/DE_summary_stats_per_target_corrected.csv', index_col=0)
# Treat a missing baseMean as 0. Bracket indexing is used for the assignment
# because attribute-style column assignment is discouraged by pandas.
de_summary_stats['baseMean'] = de_summary_stats['baseMean'].fillna(0)

In [9]:
# Build one table of candidate downstream genes per HDBSCAN cluster and stack
# them into `df_all`.
#
# Per the notebook description, observations of `adata_de` are regulators and
# variables are downstream genes; the 'adj_p_value' and 'zscore' layers are
# assumed to be dense arrays of that shape -- TODO confirm if the layers can
# be sparse.
fdr_threshold = 0.1  # adjusted-p-value cutoff for calling a gene DE
cluster_labels = adata_final.obs.hdbscan
cluster_frames = []  # one DataFrame per cluster, concatenated once after the loop
for cl in cluster_labels.unique():
    # Names of the regulators assigned to this cluster. Index obs_names
    # directly instead of slicing the backed AnnData just to read its names.
    cluster_members = adata_final.obs_names[(cluster_labels == cl).to_numpy()]
    # `.to_memory()` already returns a new in-memory object, so no extra
    # `.copy()` is needed before the gene filter below (which copies anyway).
    adata_subset = adata_de[adata_de.obs_names.isin(cluster_members)].to_memory()
    # First only consider genes that are DE for at least one cluster member regulator
    adata_subset = adata_subset[:, (adata_subset.layers['adj_p_value'] < fdr_threshold).sum(0) >= 1].copy()
    downstream_gene_list = adata_subset.var['gene_name'].tolist()
    downstream_gene_ids_list = adata_subset.var['gene_ids'].tolist()
    # Boolean (regulator x gene) DE mask on the filtered genes, reused below
    is_de = adata_subset.layers['adj_p_value'] < fdr_threshold
    # Number of cluster-member regulators for which each gene is DE.
    # Always >= 1 after the filter above, so the division below is safe.
    num_of_upstream = is_de.sum(0)
    # Mean z-score sign over the regulators for which the gene is DE:
    # +1 if all positive, -1 if all negative.
    sign_coherence = (is_de * np.sign(adata_subset.layers['zscore'])).sum(0) / num_of_upstream
    # Rank genes by z-score within each regulator (axis=1), then sum the ranks
    # across regulators; this rank sum is shared by both final rankings.
    rank_sum = stats.rankdata(adata_subset.layers['zscore'], axis=1).sum(0)
    # Rank 1 = smallest rank sum (most negative z-scores overall).
    # rankdata assigns average ranks to ties; astype(int) truncates them.
    zscore_rank_neg = stats.rankdata(rank_sum).astype(int)
    # Rank 1 = largest rank sum (most positive z-scores overall).
    zscore_rank_pos = stats.rankdata(-rank_sum).astype(int)
    # Summarize into DataFrame (the scalar cluster label is broadcast to all rows)
    df = pd.DataFrame({'hdbscan_cluster': cl,
                       'downstream_gene': downstream_gene_list,
                       'downstream_gene_ids': downstream_gene_ids_list,
                       'num_of_upstream': num_of_upstream,
                       'sign_coherence': sign_coherence,
                       'zscore_rank_negative_regulation': zscore_rank_neg,
                       'zscore_rank_positive_regulation': zscore_rank_pos,})
    cluster_frames.append(df)

# Concatenate once instead of growing `df_all` inside the loop: avoids
# repeated copying and the deprecated concat-with-empty-frame dtype handling.
# The per-cluster row index (0..n-1 within each cluster) is kept as before.
df_all = pd.concat(cluster_frames) if cluster_frames else pd.DataFrame()

In [10]:
# Write the combined per-cluster downstream-gene table to CSV. The DataFrame
# index is written as the first column because `index=False` is not passed.
df_all.to_csv(datadir + '/analysis_largefiles/nde50ntotal100_varfiltered_clustering_additional_filtering_10232025_downstream_genes.csv')