In [2]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import time
import sys
from statsmodels.stats.multitest import multipletests

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import random
import matplotlib as mpl

sys.path.append('../3_DE_analysis/')
from DE_analysis_utils import *

# Allow up to 150 rows before pandas truncates a displayed DataFrame.
pd.set_option('display.max_rows', 150)
# NOTE(review): fonttype 42 presumably makes matplotlib embed TrueType fonts so
# text in the saved PDFs stays editable — confirm against the matplotlib docs.
plt.rcParams['pdf.fonttype'] = 42
# Use Arial as the font family for all figure text.
mpl.rcParams['font.family'] = 'Arial'

In [3]:
# Load the input tables from relative paths; in each file the first CSV column
# becomes the DataFrame index.
# de_summary_stats and cluster_nde75_ntotal50 are not referenced by the cells
# shown in this notebook section; downstream_gene_df drives the GO enrichment.
de_summary_stats = pd.read_csv('../../metadata/suppl_tables/DE_stats.suppl_table.csv', index_col=0)
downstream_gene_df = pd.read_csv('../../metadata/suppl_tables/clustering_downstream_genes.csv.gz', index_col=0)
cluster_nde75_ntotal50 = pd.read_csv('../../metadata/clustering_results.csv', index_col=0)

In [4]:
import gseapy as gp

# Gene-set library 'GO_Biological_Process_2025' for human; passed as the
# gene_sets argument of gp.enrichr in the enrichment cells below.
# NOTE(review): get_library presumably downloads the library from the Enrichr
# service, so this cell would need network access — confirm.
go_bp_sets = gp.get_library(name='GO_Biological_Process_2025', organism='Human')

In [5]:
# Preview the downstream-gene table (one row per cluster / downstream gene /
# condition); the rendered output is abbreviated by pandas.
downstream_gene_df

Unnamed: 0,hdbscan_cluster,downstream_gene,downstream_gene_ids,num_of_upstream,sign_coherence,zscore_rank_negative_regulation,zscore_rank_positive_regulation,condition
0,21,DPM1,ENSG00000000419,2,0.0,756,7898,Rest
1,21,C1orf112,ENSG00000000460,4,-0.5,1046,7608,Rest
2,21,CFH,ENSG00000000971,3,1.0,6819,1835,Rest
3,21,FUCA2,ENSG00000001036,4,0.5,3624,5030,Rest
4,21,GCLC,ENSG00000001084,1,1.0,2649,6005,Rest
...,...,...,...,...,...,...,...,...
1119,23,ZNF658,ENSG00000274349,1,1.0,480,645,Stim48hr
1120,23,MLLT6,ENSG00000275023,1,1.0,1056,69,Stim48hr
1121,23,PRAG1,ENSG00000275342,1,1.0,838,287,Stim48hr
1122,23,H2BC9,ENSG00000275713,1,1.0,649,476,Stim48hr


In [6]:
def wrap_long_label_at_middle_char(label, max_length=40):
    """
    Break a long label into two lines at the space nearest its midpoint.

    Labels of at most ``max_length`` characters are returned unchanged, and so
    are longer labels in which no usable space exists. Otherwise the chosen
    space is replaced by a single newline character.

    The midpoint is ``len(label) // 2``. Two candidate spaces are considered:
    the nearest one at or before the midpoint (position 0 is never used) and
    the nearest one after it. The closer candidate wins; on a tie the earlier
    one is used.
    """
    if len(label) <= max_length:
        return label

    midpoint = len(label) // 2
    # Nearest space at or before the midpoint, ignoring position 0.
    before = label.rfind(' ', 1, midpoint + 1)
    # Nearest space strictly after the midpoint.
    after = label.find(' ', midpoint + 1)

    if before == -1:
        # Nothing on the left: fall back to the right-hand candidate (may be -1).
        split_at = after
    elif after != -1 and (after - midpoint) < (midpoint - before):
        split_at = after
    else:
        split_at = before

    if split_at == -1:
        return label
    # Drop the space itself and join the two halves with a newline.
    return label[:split_at] + '\n' + label[split_at + 1:]

def remove_duplicated_go(df, overlap_thres=0.1):
    """
    Greedily drop enrichment terms whose gene lists overlap an already-kept term.

    Rows are visited in order of increasing ``P-value``. A row is discarded when
    the number of genes it shares with any previously kept row exceeds
    ``overlap_thres`` times the size of either of the two gene sets; otherwise
    it is kept and later rows are compared against it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``P-value`` column and a ``Genes`` column holding
        ``;``-separated gene names. The frame itself is not modified.
    overlap_thres : float, default 0.1
        Fraction of a gene set that may be shared before a term counts as
        redundant.

    Returns
    -------
    pandas.DataFrame
        The kept rows ordered by ``P-value``, with the same columns as ``df``
        and a fresh 0..n-1 index.
    """
    # sort_values returns a new frame, so the caller's frame is left untouched.
    ranked = df.sort_values('P-value')
    # Keep the gene sets in a plain list instead of a temporary column, so no
    # helper column is ever written into (or leaked from) a DataFrame.
    gene_sets = [set(genes.split(';')) for genes in ranked['Genes']]

    kept_positions = []
    kept_gene_sets = []
    for position, current_genes in enumerate(gene_sets):
        is_duplicate = False
        # Compare only against terms that have already been kept.
        for seen_genes in kept_gene_sets:
            shared = len(current_genes & seen_genes)
            if (shared > overlap_thres * len(current_genes)
                    or shared > overlap_thres * len(seen_genes)):
                is_duplicate = True
                break
        if not is_duplicate:
            kept_positions.append(position)
            kept_gene_sets.append(current_genes)

    # Positional selection stays correct even when index labels repeat.
    return ranked.iloc[kept_positions].reset_index(drop=True)

### Get downstream gene GO enrichment

In [7]:
# Cutoff on the 'Adjusted P-value' column of the enrichment results; also drawn
# as the dashed vertical line on the bar plots.
padj_thres = 0.05
# Rank cutoff for selecting top downstream genes: the larger of a fixed number
# (top_gene_num) and a fraction of the cluster's downstream genes (top_gene_frac).
top_gene_num = 100
top_gene_frac = 0.03

In [8]:
# Each cluster id paired with the conditions it is tested in. Every
# (condition, cluster) pair is expanded into a 'negative' and a 'positive'
# regulation entry, giving (condition, cluster, direction) tuples.
_clusters_and_conditions = [
    (36, ['Stim48hr']),
    (27, ['Rest']),
    (10, ['Rest', 'Stim8hr', 'Stim48hr']),
    (9, ['Rest', 'Stim8hr', 'Stim48hr']),
    (81, ['Rest', 'Stim8hr', 'Stim48hr']),
    (7, ['Rest', 'Stim8hr', 'Stim48hr']),
    (0, ['Rest', 'Stim8hr', 'Stim48hr']),
]
cond_cl_direction = [
    (cond_name, cluster_id, reg_direction)
    for cluster_id, cond_names in _clusters_and_conditions
    for cond_name in cond_names
    for reg_direction in ('negative', 'positive')
]

In [9]:
# For every (condition, cluster, direction) combination: test the top-ranked
# downstream genes for GO Biological Process enrichment, using the cluster's own
# downstream genes as background, then save a horizontal bar plot of the top
# non-redundant terms as a PDF.
# Relies on names from earlier cells: downstream_gene_df, go_bp_sets, padj_thres,
# top_gene_num, top_gene_frac, remove_duplicated_go, wrap_long_label_at_middle_char.
for cond, cl, direction in cond_cl_direction:
    in_cluster = (downstream_gene_df.hdbscan_cluster == cl) & (downstream_gene_df.condition == cond)
    df_rank = downstream_gene_df[in_cluster].copy()

    # Take top_gene_num top downstream genes or top top_gene_frac whichever is larger
    num_downstream = max(top_gene_num, top_gene_frac * len(df_rank))
    rank_column = 'zscore_rank_' + direction + '_regulation'
    # NOTE(review): strict '<' keeps ranks below the cutoff; whether ranks start
    # at 0 or 1 is not visible here — confirm the intended number of genes.
    gene_list = df_rank[df_rank[rank_column] < num_downstream].downstream_gene.tolist()
    bg = df_rank['downstream_gene'].unique().tolist()
    enr = gp.enrichr(gene_list=gene_list,
                     gene_sets=[go_bp_sets],
                     organism='human',
                     outdir=None,
                     background=bg)

    # Filter, then copy: the column assignment below must write to an
    # independent frame rather than to a slice of the results table.
    df = enr.results[enr.results['Adjusted P-value'] < padj_thres].copy()
    df['Neglog_p'] = -np.log10(df['Adjusted P-value'])

    # Remove terms that are duplicated
    df = remove_duplicated_go(df)
    df['Wrapped_Term'] = df['Term'].apply(wrap_long_label_at_middle_char)

    # Show only the top 3 terms
    df = df.sort_values('Neglog_p', ascending=False).head(3)
    # NOTE(review): when no term passes padj_thres the frame is empty and an
    # essentially empty figure is still written — confirm this is intended.

    # 1. Fixed dimensions, in inches
    target_bar_width = 2   # width of the axes area that holds the bars
    right_padding = 0.5    # space to the right of the axes
    fig_height = 0.45 * len(df) + 0.5

    # 2. Start with an arbitrary width; the final width is set once the labels are measured
    fig, ax = plt.subplots(figsize=(10, fig_height))

    # 3. Draw the bars and style the axes through the explicit Axes object
    sns.barplot(data=df, x='Neglog_p', y='Wrapped_Term', color='gray', ax=ax)
    threshold = -np.log10(padj_thres)
    ax.axvline(x=threshold, color='black', linestyle='--', linewidth=1)
    ax.set_xlabel(r'-Log10(FDR)', fontsize=14)
    ax.set_ylabel('')
    # tick_params also covers ticks that are created later, e.g. after the resize below
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=12)
    sns.despine(ax=ax)

    # 4. Render once so the y tick labels have a measurable extent
    fig.canvas.draw()
    renderer = fig.canvas.get_renderer()
    pixels_to_inches = fig.dpi_scale_trans.inverted()
    max_label_width = max(
        (label.get_window_extent(renderer).transformed(pixels_to_inches).width
         for label in ax.get_yticklabels()),
        default=0,
    )

    # Small buffer (0.2 inches) between the label text and the axis line
    left_margin = max_label_width + 0.2

    # 5. Total width = (left margin for text) + (fixed plot area) + (right margin)
    total_fig_width = left_margin + target_bar_width + right_padding

    # 6. Apply the new size; subplots_adjust takes fractions of the figure size
    fig.set_size_inches(total_fig_width, fig_height)
    fig.subplots_adjust(
        left=left_margin / total_fig_width,
        right=1.0 - (right_padding / total_fig_width),
        top=0.9,
        bottom=0.3,  # room for the x-axis label
    )

    # NOTE(review): assumes the ./results directory already exists.
    fig.savefig('./results/cluster'+str(cl)+'_'+cond+'_'+direction+'_downstream_go_enrichment.pdf',
                dpi=600, bbox_inches='tight', pad_inches=0)
    # Close this specific figure so figures do not accumulate across iterations
    plt.close(fig)

### Check individual gene sets

In [4]:
# Repeats the adjusted P-value cutoff already set in the GO-enrichment section
# above (same value).
# NOTE(review): top_gene_num and top_gene_frac, which the next cell also uses,
# are not redefined in this section.
padj_thres = 0.05

In [21]:
# Re-run the enrichment for one (condition, cluster, direction) combination and
# display every significant, non-redundant term instead of only the plotted top 3.
# Uses top_gene_num and top_gene_frac from the GO-enrichment section above.
cond, cl, direction = ('Rest',10,'positive')
df_rank = downstream_gene_df[(downstream_gene_df.hdbscan_cluster==cl)&(downstream_gene_df.condition==cond)].copy()
# Take top_gene_num top downstream genes or top top_gene_frac whichever is larger
num_downstream = max([top_gene_num, top_gene_frac*len(df_rank)])
gene_list = df_rank[df_rank['zscore_rank_'+direction+'_regulation']<num_downstream].downstream_gene.tolist()
# Background = all downstream genes of this cluster in this condition
bg = df_rank['downstream_gene'].unique().tolist()
enr = gp.enrichr(gene_list=gene_list,
                gene_sets=[go_bp_sets],
                organism='human',
                outdir=None,
                background=bg
                )
df = enr.results.copy()
# Copy after filtering so the column assignment below writes to an independent
# frame rather than to a slice of the unfiltered table.
df = df[df['Adjusted P-value'] < padj_thres].copy()
df['Neglog_p'] = -np.log10(df['Adjusted P-value'])

# Remove terms that are duplicated
df = remove_duplicated_go(df)

df['Wrapped_Term'] = df['Term'].apply(wrap_long_label_at_middle_char)

df = df.sort_values('Neglog_p', ascending=False)
df

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Odds Ratio,Combined Score,Genes,Neglog_p,Wrapped_Term
0,gs_ind_0,Homotypic Cell-Cell Adhesion (GO:0034109),6/26,9.4e-05,0.042341,10.463415,96.971284,MYL12A;TJP2;TLN1;ACTN1;PIK3CG;CD99,1.373243,Homotypic Cell-Cell\nAdhesion (GO:0034109)
1,gs_ind_0,Regulation of Cell Killing (GO:0031341),3/4,0.000104,0.042341,76.334522,700.340143,MAPK8;CFH;CD59,1.373243,Regulation of Cell Killing (GO:0031341)
2,gs_ind_0,Establishment of Melanosome Localization (GO:0...,4/10,0.000143,0.042341,22.717352,201.140732,MREG;RAB27A;RAB11A;MYO5A,1.373243,Establishment of Melanosome\nLocalization (GO:...


In [22]:
# Genes behind the term with row label 0. The lookup is label-based:
# remove_duplicated_go resets the index in P-value order, so label 0 is the kept
# term with the smallest raw P-value, wherever the later sort placed it.
df.loc[0, 'Genes']

'MYL12A;TJP2;TLN1;ACTN1;PIK3CG;CD99'

In [None]:
# Unexecuted scratch cell: two ';'-separated gene lists kept as bare string
# literals (only the last one would be displayed if the cell were run).
# NOTE(review): presumably 'Genes' values copied from other enrichment results —
# TODO record which condition / cluster / term each list came from.
'NOL6;NOP2;IMP4;FBL;EIF5B;WDR75;METTL5;RRS1;BOP1;EIF2A;LYAR;WDR43;NOP16;NOP56;NOLC1;RSL24D1'

'EXOSC2;IMP4;WDR18;FBL;DKC1;SNU13;NOL11;MPHOSPH6;DDX49;MAK16;RRP9;LYAR;NOLC1;EXOSC7'