### Get downstream genes
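1. For each cluster, include only genes that are differentially expressed (FDR < 0.1) for at least one of the cluster's member regulators.
2. Rank-based ordering of downstream genes: rank the genes by z-score within each member regulator, sum those ranks across the cluster's members, then rank the genes by that rank sum — once ascending (rank 1 = most negative z-scores) and once descending (rank 1 = most positive z-scores).
3. Number of upstream regulators: how many of the cluster's member regulators have the gene as a DE gene (FDR < 0.1), counting both positive and negative effects.
4. Sign coherence of upstream regulators: the mean z-score sign over those significant regulators, ranging from +1 (all effects positive) to -1 (all effects negative); mixed signs fall in between.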

In [1]:
import pandas as pd
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

  from anndata import __version__ as anndata_version
  if Version(anndata.__version__) >= Version("0.11.0rc2"):
  if Version(anndata.__version__) >= Version("0.11.0rc2"):


In [2]:
# Root directory of the processed data; the input and output paths used in this notebook are built from it.
datadir = '../../../../3_expts/processed_data'

In [3]:
# Merged DE results, opened in backed (read-only) mode so the file is not loaded eagerly.
de_results_path = datadir + '/CD4i_final/DE_results_all_confounders/CD4i_final.merged_DE_results_corrected.h5ad'
adata_de = sc.read_h5ad(de_results_path, backed='r')

In [4]:
# Clustering result; its `obs.hdbscan` column is what assigns observations to clusters further down.
clustering_path = datadir + '/analysis_largefiles/nde50ntotal50_varfiltered_simple_clustering.h5ad'
adata_final = sc.read_h5ad(clustering_path, backed='r')

In [5]:
# Per-target DE summary statistics; the first CSV column becomes the index.
de_summary_stats = pd.read_csv(
    datadir + '/CD4i_final/DE_results_all_confounders/DE_summary_stats_per_target_corrected.csv',
    index_col=0,
)
# Missing baseMean values are replaced by 0.
de_summary_stats['baseMean'] = de_summary_stats['baseMean'].fillna(0)

In [6]:
# Build one table of candidate downstream genes per HDBSCAN cluster and stack them into `df_all`.
#
# For every cluster the table holds, per retained gene:
#   - num_of_upstream: number of member regulators for which the gene is DE (adj. p < threshold)
#   - sign_coherence:  mean z-score sign over those significant regulators (-1 .. +1)
#   - zscore_rank_negative_regulation: rank of the gene by summed per-regulator z-score rank,
#     ascending (rank 1 = most negative z-scores across the cluster)
#   - zscore_rank_positive_regulation: the same ranking in descending order
#
# Assumes the 'adj_p_value' and 'zscore' layers are dense NumPy arrays — TODO confirm.
# NOTE(review): if `hdbscan` labels noise points (e.g. -1), they are processed as a cluster
# here like any other label — confirm that is intended.
fdr_threshold = 0.1  # adjusted-p-value cutoff for calling a gene DE for a regulator

# Collect per-cluster frames and concatenate once after the loop; concatenating onto a
# growing DataFrame inside the loop re-copies all earlier rows on every iteration.
cluster_frames = []
for cl in adata_final.obs.hdbscan.unique():
    # Pull the DE results of this cluster's member regulators into memory
    member_names = adata_final.obs.index[(adata_final.obs.hdbscan == cl).to_numpy()]
    adata_subset = adata_de[adata_de.obs_names.isin(member_names)].to_memory().copy()

    # First only consider DE gene for at least one cluster member regulator
    is_de = adata_subset.layers['adj_p_value'] < fdr_threshold
    keep_gene = is_de.sum(0) >= 1
    if not keep_gene.any():
        # No gene is DE for any member of this cluster: nothing to report
        continue
    adata_subset = adata_subset[:, keep_gene].copy()
    is_de = is_de[:, keep_gene]
    zscore = adata_subset.layers['zscore']

    downstream_gene_list = adata_subset.var['gene_name'].tolist()
    downstream_gene_ids_list = adata_subset.var['gene_ids'].tolist()

    # Calculate simple number of upstream regulators for each gene (>= 1 by construction)
    num_of_upstream = is_de.sum(0)
    # Mean sign over significant regulators only. np.where (rather than mask * sign) keeps a
    # NaN z-score of a NON-significant regulator from turning the whole gene's value into NaN.
    sign_coherence = np.where(is_de, np.sign(zscore), 0).sum(0) / num_of_upstream

    # For each member regulator, rank the downstream genes by zscore, add the ranks together,
    # then rank on the summed rank (ascending and descending).
    # astype(int) truncates the fractional average ranks that rankdata assigns to ties.
    rank_sum = stats.rankdata(zscore, axis=1).sum(0)
    zscore_rank_neg = stats.rankdata(rank_sum).astype(int)
    zscore_rank_pos = stats.rankdata(-rank_sum).astype(int)

    # Summarize into dataFrame
    df = pd.DataFrame({'hdbscan_cluster': cl,
                       'downstream_gene': downstream_gene_list,
                       'downstream_gene_ids': downstream_gene_ids_list,
                       'num_of_upstream': num_of_upstream,
                       'sign_coherence': sign_coherence,
                       'zscore_rank_negative_regulation': zscore_rank_neg,
                       'zscore_rank_positive_regulation': zscore_rank_pos,})
    cluster_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no cluster qualified.
# Row labels restart at 0 for every cluster, exactly as with the incremental concatenation.
df_all = pd.concat(cluster_frames) if cluster_frames else pd.DataFrame()

In [7]:
# Persist the stacked per-cluster downstream-gene table as CSV (row index included).
downstream_genes_csv = datadir + '/analysis_largefiles/nde50ntotal50_varfiltered_simple_clustering_downstream_genes.csv'
df_all.to_csv(downstream_genes_csv)

In [4]:
# Pseudobulk object, opened in backed (read-only) mode.
pseudobulk_path = datadir + '/CD4i_final/CD4i_final_merged.DE_pseudobulk_corrected.h5ad'
pb = sc.read_h5ad(pseudobulk_path, backed='r')

In [6]:
# Display the observation metadata table (`obs`) of the pseudobulk object.
pb.obs

Unnamed: 0,10xrun_id,cell_sample_id,donor_id,culture_condition,guide_id,perturbed_gene_name,perturbed_gene_id,guide_type,n_cells,total_counts,keep_min_cells,keep_effective_guides,keep_total_counts,keep_for_DE,keep_test_genes,log10_n_cells,perturbed_gene_id_corrected,perturbed_gene_name_corrected
CD4i_R2_D1_Stim48hr_A1BG-1,CD4i_R2,CD4i_R2_D1_Stim48hr,CE0008162,Stim48hr,A1BG-1,A1BG,ENSG00000121410,targeting,26.0,326500.0,True,True,True,True,True,1.414973,ENSG00000121410,A1BG
CD4i_R2_D1_Stim48hr_A1BG-2,CD4i_R2,CD4i_R2_D1_Stim48hr,CE0008162,Stim48hr,A1BG-2,A1BG,ENSG00000121410,targeting,160.0,2313880.0,True,True,True,True,True,2.204120,ENSG00000121410,A1BG
CD4i_R2_D1_Stim48hr_A2M-1,CD4i_R2,CD4i_R2_D1_Stim48hr,CE0008162,Stim48hr,A2M-1,A2M,ENSG00000175899,targeting,1.0,23782.0,False,True,True,False,True,0.000000,ENSG00000175899,A2M
CD4i_R2_D1_Stim48hr_A2M-2,CD4i_R2,CD4i_R2_D1_Stim48hr,CE0008162,Stim48hr,A2M-2,A2M,ENSG00000175899,targeting,22.0,330204.0,True,True,True,True,True,1.342423,ENSG00000175899,A2M
CD4i_R2_D1_Stim48hr_AAAS-1,CD4i_R2,CD4i_R2_D1_Stim48hr,CE0008162,Stim48hr,AAAS-1,AAAS,ENSG00000094914,targeting,95.0,1389867.0,True,True,True,True,True,1.977724,ENSG00000094914,AAAS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CD4i_R2_D4_Rest_ZYX-1,CD4i_R2,CD4i_R2_D4_Rest,CE0006864,Rest,ZYX-1,ZYX,ENSG00000159840,targeting,1.0,11009.0,False,True,True,False,True,0.000000,ENSG00000159840,ZYX
CD4i_R2_D4_Rest_ZYX-2,CD4i_R2,CD4i_R2_D4_Rest,CE0006864,Rest,ZYX-2,ZYX,ENSG00000159840,targeting,39.0,318693.0,True,True,True,True,True,1.591065,ENSG00000159840,ZYX
CD4i_R2_D4_Rest_ZYX-3,CD4i_R2,CD4i_R2_D4_Rest,CE0006864,Rest,ZYX-3,ZYX,ENSG00000159840,targeting,31.0,247297.0,True,True,True,True,True,1.491362,ENSG00000159840,ZYX
CD4i_R2_D4_Rest_ZZEF1-1,CD4i_R2,CD4i_R2_D4_Rest,CE0006864,Rest,ZZEF1-1,ZZEF1,ENSG00000074755,targeting,100.0,1050023.0,True,True,True,True,True,2.000000,ENSG00000074755,ZZEF1


In [16]:
# Rows: IL2RA-perturbed observations in the 'Rest' condition; column: the IL2RA gene itself.
il2ra_rest_rows = (pb.obs['perturbed_gene_name'] == 'IL2RA') & (pb.obs['culture_condition'] == 'Rest')
il2ra_gene_col = pb.var['gene_name'] == 'IL2RA'
pb[il2ra_rest_rows, il2ra_gene_col]

View of AnnData object with n_obs × n_vars = 8 × 1 backed at '../../../../3_expts/processed_data/CD4i_final/CD4i_final_merged.DE_pseudobulk_corrected.h5ad'
    obs: '10xrun_id', 'cell_sample_id', 'donor_id', 'culture_condition', 'guide_id', 'perturbed_gene_name', 'perturbed_gene_id', 'guide_type', 'n_cells', 'total_counts', 'keep_min_cells', 'keep_effective_guides', 'keep_total_counts', 'keep_for_DE', 'keep_test_genes', 'log10_n_cells', 'perturbed_gene_id_corrected', 'perturbed_gene_name_corrected'
    var: 'gene_ids', 'gene_name'

In [15]:
# Rows: observations labelled 'NTC' in perturbed_gene_name; column: the IL2RA gene.
ntc_rows = pb.obs['perturbed_gene_name'] == 'NTC'
il2ra_gene_col = pb.var['gene_name'] == 'IL2RA'
pb[ntc_rows, il2ra_gene_col]

View of AnnData object with n_obs × n_vars = 11018 × 1 backed at '../../../../3_expts/processed_data/CD4i_final/CD4i_final_merged.DE_pseudobulk_corrected.h5ad'
    obs: '10xrun_id', 'cell_sample_id', 'donor_id', 'culture_condition', 'guide_id', 'perturbed_gene_name', 'perturbed_gene_id', 'guide_type', 'n_cells', 'total_counts', 'keep_min_cells', 'keep_effective_guides', 'keep_total_counts', 'keep_for_DE', 'keep_test_genes', 'log10_n_cells', 'perturbed_gene_id_corrected', 'perturbed_gene_name_corrected'
    var: 'gene_ids', 'gene_name'

In [17]:
# Scratch arithmetic. NOTE(review): the meaning of the three factors is not stated anywhere
# in the notebook — label them or remove this cell.
2*6*2

24