In [None]:
import pandas as pd
import numpy as np

def calculate_fdr_log10fdr(data, p_value_col='p.value'):
    """
    Add Benjamini-Hochberg adjusted p-values (FDR) and -log10(FDR) to a DataFrame.

    The input is not modified; a copy sorted by ascending p-value is returned.
    All rows of ``data`` are treated as a single family of tests.

    Args:
        data (pandas.DataFrame): DataFrame containing the data.
        p_value_col (str, optional): Name of the column containing p-values. Default is 'p.value'.

    Returns:
        pandas.DataFrame: Copy of the input, sorted by ``p_value_col`` ascending, with two
        new columns 'FDR' and '-log10(FDR)'. Rows whose p-value is missing are sorted last
        and get NaN in both new columns.
    """
    # sort_values returns a new frame (NaN p-values go last), so the caller's frame is untouched.
    ranked = data.sort_values(by=p_value_col)
    p_sorted = ranked[p_value_col]

    # Only non-missing p-values count as tests; because NaNs are sorted last,
    # the valid p-values occupy ranks 1..n_tests.
    n_tests = int(p_sorted.notna().sum())
    ranks = np.arange(1, len(ranked) + 1)

    # Raw Benjamini-Hochberg ratio p * m / rank.
    raw = p_sorted * n_tests / ranks

    # BH step-up: the adjusted value at rank i is the minimum raw ratio over ranks >= i.
    # Without this running minimum the values are not monotone in the p-value.
    # cummin skips NaN, so missing p-values stay NaN. Clip guards against inputs > 1.
    fdr = raw.iloc[::-1].cummin().iloc[::-1].clip(upper=1.0)

    # An FDR of exactly 0 maps to +inf; silence the divide-by-zero warning for that case.
    with np.errstate(divide='ignore'):
        neg_log10_fdr = -np.log10(fdr)

    return ranked.assign(**{'FDR': fdr, '-log10(FDR)': neg_log10_fdr})

def filter_and_aggregate_genes(df, pc_gene_list, pvalue_threshold, coefficients, t_condition, annolevel='subclass', celltype='Micro'):
    """
    Find genes that are significant for every requested coefficient and summarise them.

    A row is eligible when its 'ID' is in ``pc_gene_list``, 'AnnoLevel' equals ``annolevel``,
    'method' equals 'FE' and 'assay' equals ``celltype``. For each coefficient, the eligible
    rows with 'p.value' below the threshold (and matching the sign condition) give a gene set;
    the genes present in all of those sets are kept.

    Parameters:
    - df (DataFrame): Must contain the columns 'ID', 'p.value', 'coef', 'statistic',
      'AnnoLevel', 'method' and 'assay'.
    - pc_gene_list (list): Gene IDs to consider.
    - pvalue_threshold (float): Rows with 'p.value' strictly below this value are kept.
    - coefficients (list): Values of 'coef'; a gene must pass the filter for every one of them.
    - t_condition (str): Sign filter on 'statistic': '<=0', '>=0', or 'all' (no sign filter).
    - annolevel (str, optional): Required value of 'AnnoLevel'; defaults to 'subclass'.
    - celltype (str, optional): Required value of 'assay'; defaults to 'Micro'.

    Returns:
    DataFrame: One row per common gene with columns 'ID', 'statistic' and 'p.value', where the
    last two are means over that gene's eligible rows for the requested coefficients,
    sorted by ascending mean p-value.

    Raises:
    ValueError: If ``t_condition`` is not one of the accepted values or ``coefficients`` is empty.
    """
    # Validate up front so a bad condition is reported even before any filtering happens.
    if t_condition == '<=0':
        direction_mask = df['statistic'] <= 0
    elif t_condition == '>=0':
        direction_mask = df['statistic'] >= 0
    elif t_condition == 'all':
        direction_mask = None
    else:
        raise ValueError("Invalid t condition.")

    coefficients = list(coefficients)
    if not coefficients:
        # An intersection over zero gene sets is undefined; fail with a clear message.
        raise ValueError("coefficients must contain at least one coefficient.")

    # Conditions that do not depend on the coefficient are evaluated once.
    context_mask = (
        df['ID'].isin(pc_gene_list)
        & (df['AnnoLevel'] == annolevel)
        & (df['method'] == 'FE')
        & (df['assay'] == celltype)
    )
    significant_mask = context_mask & (df['p.value'] < pvalue_threshold)
    if direction_mask is not None:
        significant_mask = significant_mask & direction_mask

    gene_sets = [
        set(df.loc[significant_mask & (df['coef'] == coef), 'ID'])
        for coef in coefficients
    ]
    common_genes = set.intersection(*gene_sets)

    # Aggregate only rows from the same cell type / annotation level / method and the
    # requested coefficients; averaging over every row of df for these genes would mix
    # in statistics from unrelated cell types and coefficients.
    selected = df[context_mask & df['coef'].isin(coefficients) & df['ID'].isin(common_genes)]
    result_df = (
        selected
        .groupby('ID', as_index=False)
        .agg({'statistic': 'mean', 'p.value': 'mean'})
        .sort_values(by='p.value', ascending=True)
    )
    return result_df


In [None]:
# Load the dream meta-analysis results table.
META_FNAME = '/sc/arion/projects/psychAD/NPS-AD/freeze2_rc/analysis/results/meta/res_meta.parquet'
res_meta = pd.read_parquet(META_FNAME)

# Build the protein-coding gene list. The CSV is read without a header row, so its
# columns are addressed by position: rows whose column 3 is 'X' or 'Y' (sex-specific
# genes) are dropped, and column 1 supplies the gene names kept in the list.
pc_genes = pd.read_csv('/sc/arion/projects/CommonMind/collin/PsychAD/data/protein_coding_genes/psychad_protein_coding_genes.csv', header=None)
pc_genes = pc_genes[~pc_genes[3].isin(['X', 'Y'])]
pc_genes = pc_genes[1].tolist()

# Append the 'FDR' and '-log10(FDR)' columns; the returned table is sorted by p-value.
res_meta = calculate_fdr_log10fdr(res_meta)



In [None]:
# Build a {cell type: significant gene IDs} dictionary at the 'class' annotation level.
# A gene counts as significant when its p-value for coefficient 'm01x' is below 0.05;
# 'all' means no filter on the sign of the statistic.
class_types = ['EN', 'IN', 'Oligo', 'OPC', 'Astro', 'Immune']
celltype_DEGs = {}

for celltype in class_types:
    sig_table = filter_and_aggregate_genes(
        res_meta, pc_genes, 0.05, ['m01x'], 'all',
        annolevel='class', celltype=celltype,
    )
    celltype_DEGs[celltype] = sig_table['ID'].tolist()
    print(celltype, 'SIGNIFICANT_GENES:', len(celltype_DEGs[celltype]))

In [None]:
import pandas as pd

def significant_targets(GRN, TF_list, celltype_DEGs, quantile=0.0):
    """
    Intersect each TF's regulon with the significant genes of each cell type.

    Args:
        GRN (pandas.DataFrame): Network edges with columns 'TF', 'target' and 'importance'.
        TF_list (iterable): Transcription factors to report. TFs absent from ``GRN`` get
            an empty target list.
        celltype_DEGs (dict): Maps cell type -> iterable of significant gene IDs.
        quantile (float, optional): Per-TF quantile of 'importance' used as the cutoff.
            Targets with importance at or above the cutoff form the regulon. The default
            0.0 places the cutoff at the TF's minimum importance, i.e. keeps every target.

    Returns:
        dict: ``{celltype: {TF: {'SIGNIFICANT_TARGETS': [gene, ...]}}}``. The gene lists
        come from a set intersection, so their order is not meaningful.
    """
    tf_names = list(TF_list)

    # The regulon of a TF does not depend on the cell type, so build each one once
    # instead of re-filtering the whole network for every (cell type, TF) pair.
    regulons = {}
    requested_edges = GRN[GRN['TF'].isin(tf_names)]
    for tf, edges in requested_edges.groupby('TF', sort=False):
        cutoff = edges['importance'].quantile(quantile)
        # Inclusive comparison: with a strict '>' the lowest-importance edge is always
        # dropped, and a TF with a single target would end up with an empty regulon.
        regulons[tf] = set(edges.loc[edges['importance'] >= cutoff, 'target'])

    no_targets = set()
    TF_targets_dict = {}
    for celltype, DEGs in celltype_DEGs.items():
        deg_set = set(DEGs)
        TF_targets_dict[celltype] = {
            TF: {'SIGNIFICANT_TARGETS': list(deg_set & regulons.get(TF, no_targets))}
            for TF in tf_names
        }

    return TF_targets_dict


In [None]:
# Load the gene regulatory network if it is not already in memory, so this cell
# also works after Restart Kernel -> Run All.
# NOTE(review): the path is the one from the load that was previously commented out
# in this cell — confirm it is the network intended for UNIVERSAL_GRN.
try:
    UNIVERSAL_GRN
except NameError:
    UNIVERSAL_GRN = pd.read_csv('/sc/arion/projects/CommonMind/collin/PsychAD/no_var_pilot/UNIVERSAL/consensus_adj/UNIVERSAL_consensus_adj.tsv', sep='\t')

# Use every TF present in the network.
all_TF = list(UNIVERSAL_GRN.TF.unique())
TF_list = all_TF

# quantile=0.0 places each TF's importance cutoff at its minimum importance value.
TF_targets_dict = significant_targets(UNIVERSAL_GRN, TF_list, celltype_DEGs, quantile=0.0)

In [None]:
import pandas as pd

# Requires TF_targets_dict from the previous cell.
# Count, for every cell type and every TF, how many significant targets were found.
data = {}

for celltype, tfs in TF_targets_dict.items():
    data[celltype] = {}
    for tf, targets in tfs.items():
        n_targets = len(targets['SIGNIFICANT_TARGETS'])
        data[celltype][tf] = n_targets
        # Include the cell type so lines for the same TF can be told apart.
        print(celltype, tf, 'SIGNIFICANT_TARGETS:', n_targets)


# Rows are cell types, columns are TFs; a missing combination counts as 0.
gene_count_df = pd.DataFrame.from_dict(data, orient='index').fillna(0)
gene_count_df = gene_count_df.astype(int)  # Ensure data type is integer

# Heatmap of the number of significant target genes per TF across cell types
import seaborn as sns
import matplotlib.pyplot as plt

# Plotting the heatmap
plt.figure(figsize=(50, 40))  # Adjust the size based on the number of TFs/cell types
# vmax=100 saturates the colour scale at 100 targets.
sns.heatmap(gene_count_df, cmap="YlGnBu", annot=False, cbar_kws={'label': 'Count of Significant Appearances', 'shrink': 0.1, 'aspect': 10}, square=True, vmax=100)
plt.title('Heatmap of Frequency of Significant Target Genes Across Cell Types')
plt.xlabel('Transcription Factors')  # columns of gene_count_df are TFs, not genes
plt.ylabel('Cell Types')
plt.xticks(rotation=90)  # Rotate TF labels for better visibility
plt.yticks(rotation=0)
plt.show()


In [None]:
# Heatmap of Shared Significant Genes Across Cell Types
import pandas as pd

# Requires TF_targets_dict as returned by significant_targets.
# For each cell type, pool the significant targets of all TFs into one set of unique genes.
all_genes_per_celltype = {
    celltype: set().union(*(info['SIGNIFICANT_TARGETS'] for info in tfs.values()))
    for celltype, tfs in TF_targets_dict.items()
}

# Square table of pairwise overlaps: entry (a, b) is the number of genes shared by
# cell types a and b, so the diagonal holds each cell type's own gene count.
celltype_names = list(all_genes_per_celltype)
overlap_df = pd.DataFrame(
    [
        [len(all_genes_per_celltype[row] & all_genes_per_celltype[col]) for col in celltype_names]
        for row in celltype_names
    ],
    index=celltype_names,
    columns=celltype_names,
)

import seaborn as sns
import matplotlib.pyplot as plt

# Draw the annotated heatmap; fmt='d' prints the counts as integers.
plt.figure(figsize=(10, 8))  # Adjust size as needed
sns.heatmap(
    overlap_df,
    annot=True,
    cmap="Blues",
    fmt='d',
    cbar_kws={'label': 'Shared Significant Genes'},
)
plt.title('Heatmap of Shared Significant Genes Across Cell Types')
plt.xlabel('Cell Types')
plt.ylabel('Cell Types')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()

