In [1]:
from pathlib import Path

# Start from the kernel's working directory and look for the project root,
# i.e. the closest directory (the cwd itself or one of its ancestors) whose
# name equals `home_folder`.
current_path = Path.cwd()
home_folder = 'evan_home'

home_path = next(
    (
        candidate
        for candidate in (current_path, *current_path.parents)
        if candidate.name == home_folder
    ),
    None,
)
if home_path is None:
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")

print("Home Path:", home_path)

# Project sub-directories used by the rest of the notebook.
source_code_dir = home_path / 'Source_code'
dataset_dir = home_path / 'Dataset'


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import pickle
import xgboost as xgb

In [3]:
import scanpy as sc

# Load the dataset from the project's Dataset directory.
h5ad_path = dataset_dir / 'PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_Harmony_test_no_scale.h5ad'
adata = sc.read_h5ad(h5ad_path)
print('Original adata:', adata.shape)

# Replace spaces in the level-2 cell-type labels with underscores; the labels
# are later used to build file names such as '<celltype>_features.txt'.
adata.obs['celltype.l2'] = adata.obs['celltype.l2'].str.replace(' ', '_')
label = adata.obs['celltype.l2'].tolist()

# Sorted list of the distinct level-2 cell types.
types = np.unique(label).tolist()
print('all cell types:', types)
print('====================')


Original adata: (161764, 33538)
all cell types: ['ASDC', 'B_intermediate', 'B_memory', 'B_naive', 'CD14_Mono', 'CD16_Mono', 'CD4_CTL', 'CD4_Naive', 'CD4_Proliferating', 'CD4_TCM', 'CD4_TEM', 'CD8_Naive', 'CD8_Proliferating', 'CD8_TCM', 'CD8_TEM', 'Doublet', 'Eryth', 'HSPC', 'ILC', 'MAIT', 'NK', 'NK_CD56bright', 'NK_Proliferating', 'Plasmablast', 'Platelet', 'Treg', 'cDC1', 'cDC2', 'dnT', 'gdT', 'pDC']


In [3]:
# Preview the first rows of the per-cell metadata table, including the
# 'celltype.l2' column whose spaces were replaced with underscores above.
adata.obs.head()

Unnamed: 0,celltype.l1,celltype.l2,celltype.l3,Batch,donor,time,lane,Phase,nCount_ADT,nFeature_ADT,nCount_RNA,nFeature_RNA,leiden
L1_AAACCCAAGAAACTCA,Mono,CD14_Mono,CD14 Mono,Batch1,P2,7,L1,G1,7535,217,10823,2915,4
L1_AAACCCAAGACATACA,CD4 T,CD4_TCM,CD4 TCM_1,Batch1,P1,7,L1,G1,6013,209,5864,1617,2
L1_AAACCCACAACTGGTT,CD8 T,CD8_Naive,CD8 Naive,Batch1,P4,2,L1,S,6620,213,5067,1381,5
L1_AAACCCACACGTACTA,NK,NK,NK_2,Batch1,P3,7,L1,G1,3567,202,4786,1890,3
L1_AAACCCACAGCATACT,CD8 T,CD8_Naive,CD8 Naive,Batch1,P4,7,L1,G1,6402,215,6505,1621,5


## Read features

In [4]:
import os

# Work from the folder holding one '<celltype>_features.txt' file per cell type.
# NOTE: os.chdir changes the working directory for all subsequently run cells.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level2/feature_selection_k3')

# Maps cell type -> DataFrame with columns ['Gene', 'Weight', 'Tendency'].
features_dict = {}
for celltype in types:
    try:
        feature_df = pd.read_csv(f'{celltype}_features.txt', names=['Gene', 'Weight', 'Tendency'], sep='\t')
    except (FileNotFoundError, pd.errors.EmptyDataError) as err:
        # A missing or empty feature file is the only expected reason to skip a
        # cell type. Anything else (parse errors, typos, KeyboardInterrupt)
        # should surface instead of being silently swallowed.
        print('skipping:', celltype, f'({type(err).__name__})')
        continue
    features_dict[celltype] = feature_df

In [5]:
# Cell types whose feature file was read successfully; any type reported as
# 'skipping:' in the previous cell is absent from these keys.
features_dict.keys()

dict_keys(['ASDC', 'B_intermediate', 'B_memory', 'B_naive', 'CD14_Mono', 'CD16_Mono', 'CD4_CTL', 'CD4_Naive', 'CD4_Proliferating', 'CD4_TCM', 'CD4_TEM', 'CD8_Naive', 'CD8_Proliferating', 'CD8_TCM', 'CD8_TEM', 'Doublet', 'Eryth', 'HSPC', 'ILC', 'MAIT', 'NK', 'NK_CD56bright', 'NK_Proliferating', 'Plasmablast', 'Platelet', 'Treg', 'cDC1', 'cDC2', 'dnT', 'gdT', 'pDC'])

In [6]:
# Per-cell-type summary of the selected features:
#   Feature_count          - number of rows in the feature table
#   Positive_feature_count - number of rows whose 'Tendency' equals 1
# The frame is built in one go (rather than enlarged row by row with .loc) so
# both columns get an integer dtype whenever at least one cell type is present.
loaded_celltypes = list(features_dict)
count_df = pd.DataFrame(
    {
        'Feature_count': [
            features_dict[celltype].shape[0] for celltype in loaded_celltypes
        ],
        'Positive_feature_count': [
            int((features_dict[celltype]['Tendency'] == 1).sum())
            for celltype in loaded_celltypes
        ],
    },
    index=loaded_celltypes,
)
count_df

Unnamed: 0,Feature_count,Positive_feature_count
ASDC,47,25
B_intermediate,96,49
B_memory,170,90
B_naive,9,6
CD14_Mono,29,16
CD16_Mono,37,17
CD4_CTL,122,45
CD4_Naive,730,393
CD4_Proliferating,70,35
CD4_TCM,247,133


In [7]:
# Work from the folder holding one '<celltype>_DEG1000.csv' table per cell type.
# NOTE: os.chdir changes the working directory for all subsequently run cells.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level2/DEG_L2/L2_DEG_table')

# Maps cell type -> list of the first N gene names from its DEG table, where N
# is the number of selected features for that cell type (count_df).
deg_dict = {}
# Iterate over count_df.index rather than `types`: cell types that were skipped
# while reading the feature files have no row in count_df, and looking them up
# would raise a KeyError.
for celltype in count_df.index:
    DEG_table = pd.read_csv(celltype + '_DEG1000.csv', index_col=0)
    n_features = int(count_df.loc[celltype, 'Feature_count'])
    # .iloc makes the positional "first n rows" intent explicit.
    DEGn = DEG_table['names'].iloc[:n_features].tolist()

    deg_dict[celltype] = DEGn

In [9]:
# Spot-check one entry: the DEG names kept for B_naive (as many genes as
# B_naive has selected features in count_df).
deg_dict['B_naive']

['IGHM', 'IGHD', 'CD79A', 'CD37', 'MS4A1', 'CD74', 'TCL1A', 'CD79B', 'RALGPS2']

## XGBoost

In [8]:
import os
import sys
import importlib

# Make the project's Source_code directory importable. The membership check
# keeps the cell idempotent: re-running it does not append duplicate entries
# to sys.path.
source_code_path = str(source_code_dir)
if source_code_path not in sys.path:
    sys.path.append(source_code_path)

from evan_library import evan_models as emd

# Re-execute the module so edits to evan_models.py are picked up without
# restarting the kernel.
importlib.reload(emd)


<module 'evan_library.evan_models' from 'c:\\Users\\evanlee\\Documents\\Bmi_NAS_evan\\evan_home\\Source_code\\evan_library\\evan_models.py'>

In [9]:
# Instantiate emd.EvanModels with the loaded AnnData object and the
# per-cell-type DEG gene lists.
# NOTE(review): level='l2' presumably selects the 'celltype.l2' labels - confirm
# against evan_models.py.
evan_xgb = emd.EvanModels(adata, deg_dict, level='l2')

In [None]:
# Switch to the Level-2 DEG folder first so that the relative save_path='./'
# passed below resolves to that folder.
deg_l2_dir = source_code_dir / 'PBMC_Hao_batch_noZ/Level2/DEG_L2'
os.chdir(deg_l2_dir)

# Run the k-fold procedure for every cell type and unpack its three results.
kfold_outputs = evan_xgb.run_XGB_kfold_for_types(save_path='./')
all_metrics_df, cv_results_dict, likelihood_dict = kfold_outputs

K-fold CV for: ASDC


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 74
[0.9998299988759879, 0.9345959595959595, 0.6871794871794872, 0.7904761904761906, 0.9999539115696828, 0.9272519378769379, 0.8005238730292911]
K-fold CV for: B_intermediate


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 2375
[0.9944286012908548, 0.8658326126320018, 0.7460154241645245, 0.8008491697932213, 0.9974285857313803, 0.894495538513308, 0.8006352856105343]
K-fold CV for: B_memory


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 3289
[0.9964454329432486, 0.9157828571028592, 0.9086844106463878, 0.912144249657714, 0.9994134444226208, 0.9721057569164854, 0.9103768973504336]
K-fold CV for: B_naive


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 7730
[0.9947531505739795, 0.9418269894955541, 0.9486539937926102, 0.9452170139486272, 0.9991046435012976, 0.9797762469413609, 0.9424746291845205]
K-fold CV for: CD14_Mono


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 42739
[0.9911444960502284, 0.9788775761983015, 0.9877606201853448, 0.9832980496868091, 0.9988296995163377, 0.9947052143089463, 0.9772925920664151]
K-fold CV for: CD16_Mono


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 6343
[0.99379497031847, 0.9039346002596826, 0.9414546705605924, 0.9222535128298942, 0.998567026368763, 0.9550002663170352, 0.9192622176756634]
K-fold CV for: CD4_CTL


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 1652
[0.9940576925159311, 0.8203295500242695, 0.5716255876165494, 0.6736323956372331, 0.99556361406145, 0.7940351588293095, 0.681964423024727]
K-fold CV for: CD4_Naive


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 17465
[0.9784871447647813, 0.9056423266394178, 0.8940850430898013, 0.8998136734079993, 0.9952222965930814, 0.9611071072120421, 0.8877951513991663]
K-fold CV for: CD4_Proliferating


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 106
[0.9995595427323188, 0.7236308789249966, 0.5588235294117647, 0.6253354100260793, 0.9973760690564003, 0.6989604419680988, 0.6329773642273968]
K-fold CV for: CD4_TCM


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 14786
[0.9665870756375432, 0.8332844336430766, 0.796322432229693, 0.8143595022932884, 0.988879556084688, 0.8948238476552367, 0.796273641545036]
K-fold CV for: CD4_TEM


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 4014
[0.98402763903688, 0.7907127940845448, 0.5394079717392692, 0.6412551595578008, 0.9864222018342073, 0.7389410347540156, 0.6455658131567021]
K-fold CV for: CD8_Naive


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 10695
[0.9939649624859722, 0.9619455079720511, 0.9468297671120315, 0.9543044876039737, 0.998759066216385, 0.9890130051498387, 0.9511213890763409]
K-fold CV for: CD8_Proliferating


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 85
[0.9995981790257211, 0.7319696969696969, 0.4523809523809524, 0.5530578117534639, 0.9993950141562431, 0.6942790966289827, 0.5718223554517123]
K-fold CV for: CD8_TCM


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 2743
[0.991445858123696, 0.855123556525071, 0.6270605027654919, 0.7232611922756627, 0.9924919530843151, 0.8291869802678693, 0.7281039869763083]
K-fold CV for: CD8_TEM


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 11683
[0.9897226677347761, 0.9387648509750992, 0.9181407635693214, 0.9283346281644663, 0.9978731702988635, 0.9777668773392669, 0.922867946815248]
K-fold CV for: Doublet


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 526
[0.9971022511252906, 0.7716135477837606, 0.3139390034364261, 0.4442867348359866, 0.857687363438786, 0.389237259255959, 0.48975417275051514]
K-fold CV for: Eryth


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 86
[0.9998377265526388, 0.8664102564102565, 0.8032967032967033, 0.8319395515917256, 0.9819012021261818, 0.8587087350168121, 0.8333116478962161]
K-fold CV for: HSPC


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 326
[0.9998609080898401, 0.9959183673469388, 0.9354136429608128, 0.964625965737636, 0.9999653974088037, 0.9910209618301387, 0.9650781936619758]
K-fold CV for: ILC


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 127
[0.9995904528418219, 0.9549450549450549, 0.5285714285714286, 0.6717857142857143, 0.9993113031355947, 0.8349432800317608, 0.7050884960321799]
K-fold CV for: MAIT


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 2791
[0.9968704333648762, 0.9102860982434408, 0.9075054164357332, 0.9088431686569383, 0.9993810876141751, 0.9625817698778809, 0.9072783212928167]
K-fold CV for: NK


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 17234
[0.989034941234533, 0.9392780348169489, 0.9587274777460021, 0.9488865318740203, 0.9985474049914048, 0.986434237054198, 0.9428186008819888]
K-fold CV for: NK_CD56bright


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 928
[0.9984622645956426, 0.898779274386194, 0.8328388520971302, 0.8634011754527412, 0.9996341581008006, 0.9467417736326054, 0.8638435171030876]
K-fold CV for: NK_Proliferating


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 540
[0.9989568105245248, 0.8763539802183417, 0.8058254963427377, 0.8394396089695573, 0.9997397270591014, 0.931713279140822, 0.839746644373675]
K-fold CV for: Plasmablast


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 362
[0.9998995443832424, 0.9932203389830508, 0.9625365283459966, 0.9774186860685716, 0.9999848646807772, 0.9867578100186588, 0.9775992044776249]
K-fold CV for: Platelet


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 2197
[0.9986708996246545, 0.9770275618194162, 0.9280177483956464, 0.9518720180163115, 0.9771532772238795, 0.951677541060341, 0.9515328172442568]
K-fold CV for: Treg


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 2378
[0.9934704159599892, 0.888400587418962, 0.662014118931527, 0.7586402543016576, 0.994430433992472, 0.8562505851156743, 0.7638307302660037]
K-fold CV for: cDC1


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 150
[0.9999304538956448, 0.9833043478260869, 0.942, 0.9614821247647996, 0.9995502681310748, 0.9757020445955945, 0.9620329296151826]
K-fold CV for: cDC2


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 2486
[0.9976740759105345, 0.9311795683936873, 0.9175461346633416, 0.9242253510754097, 0.9988502252384499, 0.9662674217477052, 0.9231152253596997]
K-fold CV for: dnT


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 321
[0.9985704457991986, 0.8622712842712843, 0.4175438596491228, 0.561344537815126, 0.9737477680257755, 0.6198232142727552, 0.5987059310071907]
K-fold CV for: gdT


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 3586
[0.994042239551032, 0.8976003266389402, 0.8307655255057684, 0.8628537875041431, 0.9906157264010578, 0.9019616595391229, 0.8605050995454698]
K-fold CV for: pDC


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 859
[0.9998686360650414, 0.9841619375307207, 0.9913043478260869, 0.9877151110207281, 0.9999907105677407, 0.998094228691279, 0.9876581895869647]


In [13]:
# Display the per-cell-type metrics table returned by run_XGB_kfold_for_types.
all_metrics_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score,ROC-AUC,PR-AUC,MCC
ASDC,0.99983,0.934596,0.687179,0.790476,0.999954,0.927252,0.800524
B_intermediate,0.994429,0.865833,0.746015,0.800849,0.997429,0.894496,0.800635
B_memory,0.996445,0.915783,0.908684,0.912144,0.999413,0.972106,0.910377
B_naive,0.994753,0.941827,0.948654,0.945217,0.999105,0.979776,0.942475
CD14_Mono,0.991144,0.978878,0.987761,0.983298,0.99883,0.994705,0.977293
CD16_Mono,0.993795,0.903935,0.941455,0.922254,0.998567,0.955,0.919262
CD4_CTL,0.994058,0.82033,0.571626,0.673632,0.995564,0.794035,0.681964
CD4_Naive,0.978487,0.905642,0.894085,0.899814,0.995222,0.961107,0.887795
CD4_Proliferating,0.99956,0.723631,0.558824,0.625335,0.997376,0.69896,0.632977
CD4_TCM,0.966587,0.833284,0.796322,0.81436,0.98888,0.894824,0.796274


In [None]:
# Display the cross-validation results returned by run_XGB_kfold_for_types.
# NOTE(review): this shows the whole object; the output may be very long.
cv_results_dict

In [14]:
# Show the current working directory, which is where the relative-path CSV in
# the next cell is written. `!cd` only prints the directory on Windows;
# Path.cwd() gives the same information on every platform.
print(Path.cwd())

C:\Users\evanlee\Documents\GitHub\EvanPys\Progress\PBMC_Hao_batch_noZ\Level2\DEG_L2


In [15]:
# Save the metrics table as CSV. The path is relative, so the file is written
# to the current working directory (set by the os.chdir call in the k-fold cell).
all_metrics_df.to_csv('XGBclassifier_DEG_binary_metrics_l2.csv')