In [1]:
from pathlib import Path

# Locate the project root: the nearest directory named `home_folder`, starting
# at the current working directory and walking up through its ancestors.
current_path = Path.cwd()
home_folder = 'evan_home'

_home_candidates = [
    folder
    for folder in (current_path, *current_path.parents)
    if folder.name == home_folder
]
if not _home_candidates:
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")
# Candidates are ordered innermost-first, so index 0 is the closest match.
home_path = _home_candidates[0]

print("Home Path:", home_path)
# Sub-directories of the project root used by the rest of the notebook.
source_code_dir = home_path / 'Source_code'
dataset_dir = home_path / 'Dataset'


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import pickle
import xgboost as xgb

In [3]:
import scanpy as sc

# Load the AnnData file from the shared dataset directory.
h5ad_path = dataset_dir / 'PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_Harmony_test_no_scale.h5ad'
adata = sc.read_h5ad(h5ad_path)
print('Original adata:', adata.shape)

# Replace spaces in the level-1 labels with underscores; these labels are used
# further down to build per-cell-type file names.
level1_key = 'celltype.l1'
adata.obs[level1_key] = adata.obs[level1_key].str.replace(' ', '_')

# `label` is the per-cell label list, `types` the sorted unique labels.
label = adata.obs[level1_key].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)
print('====================')


Original adata: (161764, 33538)
all cell types: ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']


In [3]:
# Preview the first rows of adata.obs (the per-observation annotation table).
adata.obs.head()

Unnamed: 0,celltype.l1,celltype.l2,celltype.l3,Batch,donor,time,lane,Phase,nCount_ADT,nFeature_ADT,nCount_RNA,nFeature_RNA,leiden
L1_AAACCCAAGAAACTCA,Mono,CD14 Mono,CD14 Mono,Batch1,P2,7,L1,G1,7535,217,10823,2915,4
L1_AAACCCAAGACATACA,CD4_T,CD4 TCM,CD4 TCM_1,Batch1,P1,7,L1,G1,6013,209,5864,1617,2
L1_AAACCCACAACTGGTT,CD8_T,CD8 Naive,CD8 Naive,Batch1,P4,2,L1,S,6620,213,5067,1381,5
L1_AAACCCACACGTACTA,NK,NK,NK_2,Batch1,P3,7,L1,G1,3567,202,4786,1890,3
L1_AAACCCACAGCATACT,CD8_T,CD8 Naive,CD8 Naive,Batch1,P4,7,L1,G1,6402,215,6505,1621,5


## Read features

In [4]:
import os
# Work from the directory holding the per-cell-type feature-selection results.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/feature_selection_k3')

# Map each cell type to its feature table (columns: Gene, Weight, Tendency).
features_dict = {}
for celltype in types:
    try:
        feature_df = pd.read_csv(
            f'{celltype}_features.txt',
            names=['Gene', 'Weight', 'Tendency'],
            sep='\t',
        )
    except Exception as err:
        # Best-effort load: a missing or unreadable file skips this cell type.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate, and the reason is printed so failures are not silent.
        print('skipping:', celltype, f'({type(err).__name__}: {err})')
        continue
    features_dict[celltype] = feature_df

In [5]:
# Show which cell types had a feature table loaded successfully.
features_dict.keys()

dict_keys(['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T'])

In [6]:
# Per cell type: total number of features, and how many have Tendency == 1.
# All rows are collected first and the frame is built once, instead of growing
# an empty (object-dtype) DataFrame row by row with .loc; the count columns
# therefore get a proper integer dtype.
count_df = pd.DataFrame.from_dict(
    {
        celltype: [len(feature_df), int((feature_df['Tendency'] == 1).sum())]
        for celltype, feature_df in features_dict.items()
    },
    orient='index',
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df

Unnamed: 0,Feature_count,Positive_feature_count
B,19,10
CD4_T,201,95
CD8_T,23,9
DC,50,23
Mono,50,20
NK,33,17
other,5,3
other_T,247,112


In [None]:
# Work from the directory holding the per-cell-type DEG tables.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/DEG_L1/L1_DEG_table')

# For each cell type, keep the first n entries of the DEG table's 'names'
# column, where n is that type's Feature_count in count_df.
deg_dict = {}
for celltype in types:
    if celltype not in count_df.index:
        # count_df only has rows for types whose feature file was loaded;
        # without a feature count there is no n to cut the DEG list at.
        print('skipping:', celltype)
        continue
    n_features = int(count_df.loc[celltype, 'Feature_count'])
    DEG_table = pd.read_csv(celltype + '_DEG1000.csv', index_col=0)
    # .iloc makes the "first n rows" selection explicitly positional,
    # independent of the labels read into the index by index_col=0.
    DEGn = DEG_table['names'].iloc[:n_features].tolist()

    deg_dict[celltype] = DEGn

In [8]:
# Sanity check: the value stored for a cell type should be a plain Python list.
isinstance(deg_dict['B'], list)

True

## LR

In [6]:
import os
import sys
import importlib
# Make the project's source directory importable. The membership check keeps
# the cell idempotent: re-running it does not append duplicate sys.path entries.
_source_code_path = str(source_code_dir)
if _source_code_path not in sys.path:
    sys.path.append(_source_code_path)
from evan_library import evan_models as emd
# Reload so edits to evan_models.py are picked up without restarting the kernel.
importlib.reload(emd)


<module 'evan_library.evan_models' from 'c:\\Users\\evanlee\\Documents\\Bmi_NAS_evan\\evan_home\\Source_code\\evan_library\\evan_models.py'>

In [10]:
# Initialise the model wrapper with the AnnData object and the per-cell-type
# gene lists built above.
# NOTE(review): level='l1' presumably selects the 'celltype.l1' annotation —
# confirm against evan_models.EvanModels.
evan_lr = emd.EvanModels(adata, deg_dict, level='l1')

In [None]:
# Switch to the DEG_L1 directory before running the k-fold routine; relative
# paths used from this point on resolve against it.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/DEG_L1')
# Runs k-fold cross-validation per cell type (see the printed log) and returns
# a metrics table plus two dicts.
# NOTE(review): the dict contents are defined in evan_models — going by the
# names they hold CV results and likelihoods; confirm there.
all_metrics_df, cv_results_dict, likelihood_dict = evan_lr.run_LR_kfold_for_types()

K-fold CV for: B
is a list
Cross-validation...
likelihood > 0.5: 13721
[0.9987713564356134, 0.9957196316903023, 0.9898550724637681, 0.9927772907378845, 0.9998379273063407, 0.9990056473767369, 0.992111873004848]
K-fold CV for: CD4_T
is a list
Cross-validation...
likelihood > 0.5: 40807
[0.941743738353981, 0.886658606997365, 0.8830522656049606, 0.8848431557951564, 0.9827622180849811, 0.9516339803698866, 0.8458636853078574]
K-fold CV for: CD8_T
is a list
Cross-validation...
likelihood > 0.5: 23939
[0.9712930077008698, 0.9349500528103596, 0.8788220858895706, 0.9060040133519885, 0.987791715480385, 0.956705794391388, 0.8896855820917511]
K-fold CV for: DC
is a list
Cross-validation...
likelihood > 0.5: 3520
[0.9982922598890266, 0.9679229635612614, 0.9547177700348431, 0.9612570197092432, 0.9996514769818894, 0.9906748822546982, 0.9604170446639145]
K-fold CV for: Mono
is a list
Cross-validation...
likelihood > 0.5: 49728
[0.9930222277316038, 0.9815451494356002, 0.9956896636123703, 0.988566746608

In [12]:
# Display the per-cell-type metrics table returned by the k-fold run.
all_metrics_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score,ROC-AUC,PR-AUC,MCC
B,0.998771,0.99572,0.989855,0.992777,0.999838,0.999006,0.992112
CD4_T,0.941744,0.886659,0.883052,0.884843,0.982762,0.951634,0.845864
CD8_T,0.971293,0.93495,0.878822,0.906004,0.987792,0.956706,0.889686
DC,0.998292,0.967923,0.954718,0.961257,0.999651,0.990675,0.960417
Mono,0.993022,0.981545,0.99569,0.988567,0.998485,0.994587,0.983598
NK,0.990101,0.956221,0.958074,0.95714,0.998676,0.991454,0.951549
other,0.991817,0.940101,0.657584,0.773724,0.912742,0.77077,0.782539
other_T,0.987497,0.894146,0.796356,0.842406,0.988898,0.902799,0.837448


In [13]:
# Show the current working directory. A bare `!cd` only prints it under the
# Windows shell; Path.cwd() gives the same information on every platform.
print(Path.cwd())

C:\Users\evanlee\Documents\GitHub\EvanPys\Progress\PBMC_Hao_batch_noZ\Level1\DEG_L1


In [14]:
# Save the metrics table into the DEG_L1 directory. The destination is spelled
# out explicitly so the file location does not depend on the kernel's current
# working directory (i.e. on which cells were run, and in what order).
all_metrics_df.to_csv(
    source_code_dir / 'PBMC_Hao_batch_noZ/Level1/DEG_L1' / 'LR_DEG_metrics_l1.csv'
)