In [1]:
from pathlib import Path

# Locate the project root: the nearest directory named `home_folder`, searching
# the current working directory first and then each of its ancestors in turn.
current_path = Path.cwd()
home_folder = 'evan_home'

# Candidates are ordered from the working directory upward, so the first match
# is the closest enclosing folder with the requested name.
ancestry = (current_path, *current_path.parents)
matches = [folder for folder in ancestry if folder.name == home_folder]
if not matches:
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")
home_path = matches[0]

print("Home Path:", home_path)

# Sub-directories of the project root that later cells build paths from.
source_code_dir = home_path / 'Source_code'
dataset_dir = home_path / 'Dataset'


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import pickle
import xgboost as xgb

In [3]:
import scanpy as sc

# adata = sc.read_h5ad(r"C:\Users\evanlee\Documents\Research_datasets\PBMC_Hao\GSE164378_Hao\Harmony_noZ\Hao_Harmony_test_no_scale.h5ad")
# adata = sc.read_h5ad('/Users/evanli/Documents/Research_datasets/PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_Harmony_test_no_scale.h5ad')
# Load the Hao PBMC AnnData file from the project dataset directory.
h5ad_file = dataset_dir / 'PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_Harmony_test_no_scale.h5ad'
adata = sc.read_h5ad(h5ad_file)
print('Original adata:', adata.shape)

# Replace spaces with underscores in the level-1 cell-type labels and store the
# cleaned labels back on the AnnData object.
cleaned_l1 = adata.obs['celltype.l1'].str.replace(' ', '_')
adata.obs['celltype.l1'] = cleaned_l1

# `label` is the per-observation label list; `types` is the sorted list of
# distinct labels (np.unique returns its result sorted).
label = adata.obs['celltype.l1'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)
print('====================')
# del adata


Original adata: (161764, 33538)
all cell types: ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']


In [3]:
# Preview the first rows of the observation metadata table (adata.obs).
adata.obs.head()

Unnamed: 0,celltype.l1,celltype.l2,celltype.l3,Batch,donor,time,lane,Phase,nCount_ADT,nFeature_ADT,nCount_RNA,nFeature_RNA,leiden
L1_AAACCCAAGAAACTCA,Mono,CD14 Mono,CD14 Mono,Batch1,P2,7,L1,G1,7535,217,10823,2915,4
L1_AAACCCAAGACATACA,CD4_T,CD4 TCM,CD4 TCM_1,Batch1,P1,7,L1,G1,6013,209,5864,1617,2
L1_AAACCCACAACTGGTT,CD8_T,CD8 Naive,CD8 Naive,Batch1,P4,2,L1,S,6620,213,5067,1381,5
L1_AAACCCACACGTACTA,NK,NK,NK_2,Batch1,P3,7,L1,G1,3567,202,4786,1890,3
L1_AAACCCACAGCATACT,CD8_T,CD8 Naive,CD8 Naive,Batch1,P4,7,L1,G1,6402,215,6505,1621,5


## Read features

In [5]:
import os
# os.chdir(r"C:\Users\evanlee\Documents\GitHub\EvanPys\Progress\PBMC_Hao_batch_noZ\Level1\feature_selection_k3")
# os.chdir('/Users/evanli/Documents/EvanPys/Progress/PBMC_Hao_batch_noZ/Level1/feature_selection_k3')
# Switch to the directory that holds one `<celltype>_features.txt` file per
# cell type; the relative file names below resolve against it.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/feature_selection_k3')

# Maps cell type -> DataFrame with columns Gene / Weight / Tendency.
features_dict = {}
# Read features for each celltype. The files are tab-separated and are read
# without a header row, so the column names are supplied explicitly.
for celltype in types:
    try:
        feature_df = pd.read_csv(f'{celltype}_features.txt', names=['Gene', 'Weight', 'Tendency'], sep='\t')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing or empty feature file means "no features for this
        # cell type". Anything else (parse errors, permission problems,
        # KeyboardInterrupt) is allowed to propagate instead of being
        # silently swallowed by a bare `except:`.
        print('skipping:', celltype)
        continue
    features_dict[celltype] = feature_df

In [6]:
# Show which cell types ended up with a loaded feature table.
features_dict.keys()

dict_keys(['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T'])

In [7]:
# Per cell type: total number of selected features and how many of them have
# Tendency == 1. The table is built in a single constructor call (rather than
# enlarging an empty frame row by row) so both columns get an integer dtype
# and an empty `features_dict` still yields an empty frame with these columns.
feature_counts = [
    (feature_df.shape[0], int((feature_df['Tendency'] == 1).sum()))
    for feature_df in features_dict.values()
]
count_df = pd.DataFrame(
    feature_counts,
    index=list(features_dict.keys()),
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df

Unnamed: 0,Feature_count,Positive_feature_count
B,19,10
CD4_T,201,95
CD8_T,23,9
DC,50,23
Mono,50,20
NK,33,17
other,5,3
other_T,247,112


In [8]:
# os.chdir(r"C:\Users\evanlee\Documents\GitHub\EvanPys\Progress\PBMC_Hao_batch_noZ\Level1\DEG_L1\L1_DEG_table")
# Switch to the directory holding one `<celltype>_DEG1000.csv` table per cell type.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/DEG_L1/L1_DEG_table')

# Maps cell type -> list of the first N gene names from its DEG table, where N
# is the number of selected features recorded for that cell type in count_df.
deg_dict = {}
for celltype in types:
    # count_df only has rows for cell types whose feature file was loaded;
    # skip the others instead of failing with a KeyError on the lookup below.
    if celltype not in count_df.index:
        print('skipping:', celltype)
        continue

    DEG_table = pd.read_csv(celltype + '_DEG1000.csv', index_col=0)
    # Cast to a plain int so the value is a valid slice bound whatever dtype
    # the count column carries.
    n_features = int(count_df.loc[celltype, 'Feature_count'])
    # .iloc makes the "first N rows" selection explicitly positional,
    # independent of the kind of index the CSV produced.
    DEGn = DEG_table['names'].iloc[:n_features].tolist()

    deg_dict[celltype] = DEGn

In [9]:
# Sanity check: the DEG entry for cell type 'B' should be a plain Python list.
isinstance(deg_dict['B'], list)

True

## XGBoost

In [10]:
import os
import sys
import importlib
# sys.path.append('/Users/evanli/Documents/EvanPys/Progress')
# sys.path.append('/home/jovyan/work/GitHub/EvanPys/Progress')
# sys.path.append(r'C:\Users\evanlee\Documents\GitHub\EvanPys\Progress')
# Make the project's source directory importable. The membership check keeps
# sys.path from accumulating a duplicate entry every time this cell is re-run.
source_code_path = str(source_code_dir)
if source_code_path not in sys.path:
    sys.path.append(source_code_path)
from evan_library import evan_models as emd
# Reload so that edits to evan_models.py are picked up without restarting the kernel.
importlib.reload(emd)


<module 'evan_library.evan_models' from 'c:\\Users\\evanlee\\Documents\\Bmi_NAS_evan\\evan_home\\Source_code\\evan_library\\evan_models.py'>

In [20]:
# Initialize the model wrapper with the AnnData object and the per-cell-type
# DEG gene lists.
# NOTE(review): the meaning of level='l1' is defined inside
# evan_library.evan_models — presumably it selects the 'celltype.l1' labels; confirm.
evan_xgb = emd.EvanModels(adata, deg_dict, level='l1')

In [None]:
# os.chdir(r"C:\Users\evanlee\Documents\GitHub\EvanPys\Progress\PBMC_Hao_batch_noZ\Level1\DEG_L1")
# Make the level-1 DEG directory the working directory, then run the k-fold
# routine for every cell type. `save_path='./'` is relative, so whatever the
# routine saves resolves against the directory selected here.
deg_l1_dir = source_code_dir / 'PBMC_Hao_batch_noZ/Level1/DEG_L1'
os.chdir(deg_l1_dir)
all_metrics_df, cv_results_dict, likelihood_dict = evan_xgb.run_XGB_kfold_for_types(save_path='./')

K-fold CV for: B


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 13794
[0.9993431773397031, 0.9973687801270916, 0.9949275362318841, 0.9961451491338659, 0.9999246219461895, 0.9994520639344817, 0.995787951246836]
K-fold CV for: CD4_T


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 41025
[0.9548338054971433, 0.9096560982382271, 0.9124415754960019, 0.9110402278665433, 0.9900984277073036, 0.9698760160670318, 0.880781532935934]
K-fold CV for: CD8_T


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 25362
[0.9744766647884966, 0.9308945045121364, 0.9050797546012271, 0.917799384491936, 0.9911204002115213, 0.9682331203605706, 0.9028253265564208]
K-fold CV for: DC


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 3577
[0.9985009008890449, 0.9705542615405388, 0.9616858051810333, 0.9660752542385598, 0.9998227678838554, 0.9920007422693345, 0.965332193416074]
K-fold CV for: Mono


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 49149
[0.99410405230628, 0.9836215812260779, 0.9971434427051991, 0.9903361955691674, 0.9985449454991846, 0.9946901847289602, 0.9861422146062372]
K-fold CV for: NK


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 18646
[0.9934936049609486, 0.9699915485598053, 0.9737462022862633, 0.9718605736153336, 0.9993490196445685, 0.9951663771814815, 0.9681874309875885]
K-fold CV for: other


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 2749
[0.9909822193188136, 0.8905224084015921, 0.6572229005114667, 0.7561038115809874, 0.875133465563492, 0.7335397564298219, 0.7607270216784651]
K-fold CV for: other_T


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 6704
[0.9924735904545674, 0.9447787559241124, 0.8716636085937779, 0.9067297681981217, 0.9959380956929893, 0.9597194501178876, 0.9036208168113692]


In [22]:
# Display the per-cell-type metrics table returned by run_XGB_kfold_for_types.
all_metrics_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score,ROC-AUC,PR-AUC,MCC
B,0.999343,0.997369,0.994928,0.996145,0.999925,0.999452,0.995788
CD4_T,0.954834,0.909656,0.912442,0.91104,0.990098,0.969876,0.880782
CD8_T,0.974477,0.930895,0.90508,0.917799,0.99112,0.968233,0.902825
DC,0.998501,0.970554,0.961686,0.966075,0.999823,0.992001,0.965332
Mono,0.994104,0.983622,0.997143,0.990336,0.998545,0.99469,0.986142
NK,0.993494,0.969992,0.973746,0.971861,0.999349,0.995166,0.968187
other,0.990982,0.890522,0.657223,0.756104,0.875133,0.73354,0.760727
other_T,0.992474,0.944779,0.871664,0.90673,0.995938,0.959719,0.903621


In [None]:
# Display the cross-validation results returned by run_XGB_kfold_for_types.
# NOTE(review): this echoes the whole object; its structure and size are not
# visible from this notebook — consider showing a summary instead.
cv_results_dict

In [23]:
# Show the current working directory. Pure-Python replacement for `!cd`, which
# only prints the directory under the Windows command shell.
print(Path.cwd())

C:\Users\evanlee\Documents\GitHub\EvanPys\Progress\PBMC_Hao_batch_noZ\Level1\DEG_L1


In [24]:
# Save the metrics table into the level-1 DEG directory using an explicit path,
# so the destination does not depend on the kernel's current working directory.
metrics_csv = source_code_dir / 'PBMC_Hao_batch_noZ/Level1/DEG_L1' / 'XGBclassifier_DEG_metrics_l1.csv'
all_metrics_df.to_csv(metrics_csv)