In [1]:
from pathlib import Path

# Locate the project root by walking from the working directory upwards
home_folder = 'evan_home'
current_path = Path.cwd()

# Candidates are the working directory itself followed by each of its ancestors;
# the first one whose folder name matches wins.
home_path = next(
    (folder for folder in (current_path, *current_path.parents) if folder.name == home_folder),
    None,
)
if home_path is None:
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")

print("Home Path:", home_path)

# Sub-directories used by the rest of the notebook
dataset_dir = home_path / 'Dataset'
source_code_dir = home_path / 'Source_code'

Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import json
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

In [3]:
### Read representative cells
# The .h5ad file is resolved relative to dataset_dir so the notebook is machine-independent
repcells_h5ad = dataset_dir / 'PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_L1_repcells_loginv_Harmony_noZ.h5ad'
adata = sc.read_h5ad(repcells_h5ad)
print('Original adata:', adata.shape)

# Per-cell level-1 annotation, and the sorted list of distinct cell types
label = adata.obs['celltype.l1'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)

Original adata: (59897, 27504)
all cell types: ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']


## Read PreLect features

In [3]:
# types = ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']

In [4]:
# Directory holding one '<celltype>_features.txt' table per cell type
feature_dir = source_code_dir / 'PBMC_Hao_batch_noZ/Level1/feature_selection_k3'
# The working directory is still switched so that any cell relying on it keeps working,
# but the files below are read through explicit paths rather than through the cwd.
os.chdir(feature_dir)

features_dict = {}
# Read features for each celltype
for celltype in types:
    try:
        features_dict[celltype] = pd.read_csv(
            feature_dir / f'{celltype}_features.txt',
            names=['Gene', 'Weight', 'Tendency'],
            sep='\t',
        )
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing or empty feature file is skipped; any other problem
        # (permissions, malformed file, keyboard interrupt, ...) is allowed to surface.
        print('skipping:', celltype)

# One row per loaded cell type: number of selected features and how many of
# them have Tendency == 1. Rows are collected first and the frame is built once.
count_rows = {}
for celltype, feature_df_k3 in features_dict.items():
    feature_count = feature_df_k3.shape[0]
    positive_count = int((feature_df_k3['Tendency'] == 1).sum())
    count_rows[celltype] = [feature_count, positive_count]

count_df_k3 = pd.DataFrame.from_dict(
    count_rows,
    orient='index',
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df_k3

Unnamed: 0,Feature_count,Positive_feature_count
B,19,10
CD4_T,201,95
CD8_T,23,9
DC,50,23
Mono,50,20
NK,33,17
other,5,3
other_T,247,112


## Read ACT markers

In [5]:
# ACT marker table for human blood, resolved relative to source_code_dir
marker_csv = source_code_dir / 'PBMC_Hao_batch_noZ/Level1/ACT_annotation/Human_blood_marker_processed.csv'
marker_df = pd.read_csv(marker_csv)
print(marker_df.shape)
marker_df.head()

(147, 6)


Unnamed: 0,Species,Tissue,super_type,CellType,Marker,Resource
0,Human,Blood,B,Activated B cell,"CD5, MIR155HG, TAGLN2","35025971, 28910360, 2474446"
1,Human,Blood,T_CD4,"Activated CD4-positive, alpha-beta T cell","CD4, CCR6, CD14, CD19, CD38, CD3D, CD3G, CD69,...","35381424, 32783921, 35831277, 34529726, 103413..."
2,Human,Blood,T_CD4,"Activated CD4-positive, CD25-positive, CCR4-po...","CD3D, CD3E, CD3G, CD4, FOXP3",30977974
3,Human,Blood,,Adult endothelial progenitor cell,"KDR, PROM1, CD34, PTPRC, PECAM1, CD14","27561827, 29391882, 24641802, 29147957, 267700..."
4,Human,Blood,T,Alpha-beta T cell,"CD3D, CD3E, CD3G, IL7R","34284174, 28777444"


In [6]:
def get_celltype_marker(name, marker_df):
    """Return the marker genes listed for one cell type as a set.

    Parameters
    ----------
    name : str
        Value matched exactly against the ``CellType`` column.
    marker_df : pandas.DataFrame
        Table with a ``CellType`` column and a ``Marker`` column holding a
        comma-separated list of gene symbols.

    Returns
    -------
    set of str
        Gene symbols from the first matching row, with surrounding whitespace
        removed and empty entries dropped. An empty set is returned when the
        ``Marker`` cell of that row is missing (not a string).

    Raises
    ------
    IndexError
        If no row has ``CellType == name``.
    """
    matches = marker_df.loc[marker_df['CellType'] == name, 'Marker']
    if matches.empty:
        # Same exception type as indexing an empty list, but with a usable message
        raise IndexError(f"No row with CellType == {name!r} in marker_df")

    marker_string = matches.iloc[0]
    if not isinstance(marker_string, str):
        # Missing Marker value (e.g. NaN): there is nothing to split
        return set()

    # Split on bare commas and strip, so 'A,B', 'A, B' and 'A ,  B' all parse alike
    return {gene.strip() for gene in marker_string.split(',') if gene.strip()}

In [7]:
# Explicit cluster -> ACT cell-type name mapping. Each pair is spelled out so the
# mapping cannot shift if the order or number of entries in `types` changes.
# 'other' and 'other_T' have no entry here and are therefore not compared against ACT.
cluster_2_ACT_name = {
    'B': 'B cell',
    'CD4_T': 'CD4-positive, alpha-beta T cell',
    'CD8_T': 'CD8-positive, alpha-beta T cell',
    'DC': 'Dendritic cell',
    'Mono': 'Monocyte',
    'NK': 'Natural killer cell',
}
# Kept as a list for any cell that still refers to ACT_name
ACT_name = list(cluster_2_ACT_name.values())
cluster_2_ACT_name

{'B': 'B cell',
 'CD4_T': 'CD4-positive, alpha-beta T cell',
 'CD8_T': 'CD8-positive, alpha-beta T cell',
 'DC': 'Dendritic cell',
 'Mono': 'Monocyte',
 'NK': 'Natural killer cell'}

## Fisher exact test

In [None]:
import scipy.stats as stats

def perform_fisher_test(PreLect_genes, ACT_genes, Hao_genes, alternative='two-sided'):
    """
    Performs Fisher's exact test on the overlap of two gene sets within a background.

    Both gene sets are first restricted to the background, so the four cells of
    the 2x2 table always partition Hao_genes exactly. Genes that are absent from
    the background are ignored instead of being counted in a, b or c.

    Parameters:
    - PreLect_genes: Iterable of genes selected by PreLect.
    - ACT_genes: Iterable of ACT marker genes.
    - Hao_genes: Iterable of all genes in the background population.
    - alternative: Passed to scipy.stats.fisher_exact ('two-sided', 'less' or
      'greater'). Defaults to 'two-sided', the scipy default.

    Returns:
    - oddsratio: The odds ratio calculated from the contingency table.
    - p_value: The p-value from Fisher's exact test.

    The contingency table and both statistics are also printed.
    """
    # Ensure the inputs are sets, and keep only genes that belong to the background
    Hao_genes = set(Hao_genes)
    PreLect_genes = set(PreLect_genes) & Hao_genes
    ACT_genes = set(ACT_genes) & Hao_genes

    # Calculate the counts for the contingency table
    a = len(PreLect_genes & ACT_genes)     # Genes in both PreLect_genes and ACT_genes
    b = len(PreLect_genes - ACT_genes)     # Genes in PreLect_genes but not in ACT_genes
    c = len(ACT_genes - PreLect_genes)     # Genes in ACT_genes but not in PreLect_genes
    d = len(Hao_genes) - (a + b + c)       # Background genes in neither gene set

    # Construct the contingency table
    contingency_table = [[a, b],
                         [c, d]]

    # Perform Fisher's exact test
    oddsratio, p_value = stats.fisher_exact(contingency_table, alternative=alternative)

    # Output the results
    print("Contingency Table:")
    print("                   In ACT_genes    Not in ACT_genes")
    print(f"In PreLect_genes       {a}               {b}")
    print(f"Not in PreLect_genes   {c}            {d}")
    print(f"\nOdds Ratio: {oddsratio}")
    print(f"P-value: {p_value}")

    return oddsratio, p_value

# OR > 1: 
#   Indicates a positive association between being in PreLect_genes and being in ACT_genes
#   Genes in PreLect_genes are more likely to be in ACT_genes compared to genes not in PreLect_genes
#   The higher the OR, the stronger the positive association between PreLect_genes and ACT_genes


## B

In [20]:
# Background universe: every gene present in the AnnData object
Hao_genes = set(adata.var_names.tolist())

# Genes selected by PreLect for the B cluster
PreLect_genes = set(features_dict['B']['Gene'].tolist())

# ACT markers of the matching cell type, keeping only those found in the background
ACT_genes = set(get_celltype_marker(cluster_2_ACT_name['B'], marker_df)).intersection(Hao_genes)

In [21]:
# Number of PreLect-selected genes for the B cluster
len(PreLect_genes)

19

In [22]:
# Number of ACT B-cell markers that are present in the Hao gene list
len(ACT_genes)

58

In [23]:
# Size of the background gene universe (all genes in adata.var_names)
len(Hao_genes)

27504

In [25]:
# Overlap between PreLect genes and ACT markers (cell `a` of the contingency table)
len(PreLect_genes.intersection(ACT_genes))

8

In [26]:
# Sanity check: how many PreLect genes are found in the background gene list
len(PreLect_genes.intersection(Hao_genes))

19

In [27]:
# Sanity check: ACT_genes was already intersected with Hao_genes, so this equals len(ACT_genes)
len(ACT_genes.intersection(Hao_genes))

58

In [38]:
# Fisher's exact test for the B cluster; the returned (odds ratio, p-value) tuple is the cell output
perform_fisher_test(PreLect_genes, ACT_genes, Hao_genes)

Contingency Table:
                   In ACT_genes    Not in ACT_genes
In PreLect_genes       8               11
Not in PreLect_genes   50            27435

Odds Ratio: 399.05454545454546
P-value: 1.7541298652423053e-17


(399.05454545454546, 1.7541298652423053e-17)

## all types

In [39]:
# Display the cluster -> ACT cell-type name mapping that the loop below iterates over
cluster_2_ACT_name

{'B': 'B cell',
 'CD4_T': 'CD4-positive, alpha-beta T cell',
 'CD8_T': 'CD8-positive, alpha-beta T cell',
 'DC': 'Dendritic cell',
 'Mono': 'Monocyte',
 'NK': 'Natural killer cell'}

In [42]:
# The background gene list is identical for every cluster, so it is built once
Hao_genes = adata.var_names.tolist()

# cluster -> (odds ratio, p-value), kept so the statistics can be reused after the loop
fisher_results = {}
for clus, name in cluster_2_ACT_name.items():
    print('=====')
    print(clus)
    if clus not in features_dict:
        # No feature table was loaded for this cluster (its file was skipped when reading)
        print('skipping:', clus)
        continue
    PreLect_genes = features_dict[clus]['Gene'].tolist()
    ACT_genes = get_celltype_marker(name, marker_df)
    # ACT_genes: remove any that are not part of Hao_genes
    ACT_genes = ACT_genes.intersection(Hao_genes)

    fisher_results[clus] = perform_fisher_test(PreLect_genes, ACT_genes, Hao_genes)

=====
B
Contingency Table:
                   In ACT_genes    Not in ACT_genes
In PreLect_genes       8               11
Not in PreLect_genes   50            27435

Odds Ratio: 399.05454545454546
P-value: 1.7541298652423053e-17
=====
CD4_T
Contingency Table:
                   In ACT_genes    Not in ACT_genes
In PreLect_genes       13               188
Not in PreLect_genes   28            27275

Odds Ratio: 67.3584726443769
P-value: 1.6866982166477405e-18
=====
CD8_T
Contingency Table:
                   In ACT_genes    Not in ACT_genes
In PreLect_genes       8               15
Not in PreLect_genes   18            27463

Odds Ratio: 813.7185185185185
P-value: 9.359120855294212e-20
=====
DC
Contingency Table:
                   In ACT_genes    Not in ACT_genes
In PreLect_genes       9               41
Not in PreLect_genes   38            27416

Odds Ratio: 158.37227214377407
P-value: 1.3087738378567983e-16
=====
Mono
Contingency Table:
                   In ACT_genes    Not in ACT_genes