In [1]:
from pathlib import Path

# Resolve the project root by walking from the working directory up through
# its ancestors until a directory named `home_folder` turns up.
current_path = Path.cwd()
home_folder = 'evan_home'

# Nearest match first: the working directory itself, then each parent in turn.
nearest_match = next(
    (candidate for candidate in (current_path, *current_path.parents)
     if candidate.name == home_folder),
    None,
)
if nearest_match is None:
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")
home_path = nearest_match

print("Home Path:", home_path)
# Sub-folders of the project root that the later cells build their paths from.
dataset_dir = home_path / 'Dataset'
source_code_dir = home_path / 'Source_code'


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import pickle
import xgboost as xgb

In [3]:
import scanpy as sc

# Load the AnnData object from the shared dataset folder (path is built from
# dataset_dir, so it is independent of the machine-specific home location).
adata = sc.read_h5ad(dataset_dir / 'PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_Harmony_test_no_scale.h5ad')

print('Original adata:', adata.shape)

# Replace spaces in the level-1 labels with underscores; later cells use these
# labels to build file names such as f'{celltype}_features.txt'.
# regex=False pins a literal replacement so the result does not depend on the
# pandas-version-specific default of the `regex` argument.
adata.obs['celltype.l1'] = adata.obs['celltype.l1'].str.replace(' ', '_', regex=False)

# `label` is the per-cell label list; `types` is the sorted list of unique labels.
label = adata.obs['celltype.l1'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)
print('====================')


Original adata: (161764, 33538)
all cell types: ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']


In [3]:
# Preview the first rows of the per-cell annotation table (adata.obs).
adata.obs.head()

Unnamed: 0,celltype.l1,celltype.l2,celltype.l3,Batch,donor,time,lane,Phase,nCount_ADT,nFeature_ADT,nCount_RNA,nFeature_RNA,leiden
L1_AAACCCAAGAAACTCA,Mono,CD14 Mono,CD14 Mono,Batch1,P2,7,L1,G1,7535,217,10823,2915,4
L1_AAACCCAAGACATACA,CD4_T,CD4 TCM,CD4 TCM_1,Batch1,P1,7,L1,G1,6013,209,5864,1617,2
L1_AAACCCACAACTGGTT,CD8_T,CD8 Naive,CD8 Naive,Batch1,P4,2,L1,S,6620,213,5067,1381,5
L1_AAACCCACACGTACTA,NK,NK,NK_2,Batch1,P3,7,L1,G1,3567,202,4786,1890,3
L1_AAACCCACAGCATACT,CD8_T,CD8 Naive,CD8 Naive,Batch1,P4,7,L1,G1,6402,215,6505,1621,5


## Read features

In [4]:
import os
# The feature files are read with relative names below, so switch to the
# folder that holds them first.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/feature_selection_k3')

# Maps cell type -> DataFrame with columns Gene / Weight / Tendency.
features_dict = {}
# Read features for each celltype
for celltype in types:
    try:
        feature_df = pd.read_csv(f'{celltype}_features.txt', names=['Gene', 'Weight', 'Tendency'], sep='\t')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing or empty feature file is a reason to skip a cell type.
        # Anything else (parse errors, permission problems, KeyboardInterrupt)
        # now propagates instead of being silently swallowed by a bare except.
        print('skipping:', celltype)
        continue
    features_dict[celltype] = feature_df

In [5]:
# Cell types for which a feature file was loaded into features_dict.
features_dict.keys()

dict_keys(['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T'])

In [6]:
# Summarise, per cell type, how many features were selected and how many of
# them have Tendency == 1. The table is assembled in a single constructor call
# instead of enlarging an empty frame one `.loc` row at a time.
count_df = pd.DataFrame.from_dict(
    {
        celltype: [len(feature_df), int((feature_df['Tendency'] == 1).sum())]
        for celltype, feature_df in features_dict.items()
    },
    orient='index',
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df

Unnamed: 0,Feature_count,Positive_feature_count
B,19,10
CD4_T,201,95
CD8_T,23,9
DC,50,23
Mono,50,20
NK,33,17
other,5,3
other_T,247,112


## XGBoost

In [7]:
import os
import sys
import importlib

# Make the project's source folder importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if str(source_code_dir) not in sys.path:
    sys.path.append(str(source_code_dir))

from evan_library import evan_models as emd
# Reload so that the module is re-executed from disk even if it was already
# imported earlier in this kernel session.
importlib.reload(emd)


<module 'evan_library.evan_models' from 'c:\\Users\\evanlee\\Documents\\Bmi_NAS_evan\\evan_home\\Source_code\\evan_library\\evan_models.py'>

In [8]:
# Build the model wrapper from the loaded AnnData and the per-cell-type feature
# tables. NOTE(review): `level='l1'` presumably selects the `celltype.l1`
# annotation — confirm against evan_models.EvanModels.
evan_xgb = emd.EvanModels(adata, features_dict, level='l1')

In [None]:
# Switch to the model output folder: `save_path='./'` below is relative to the
# working directory, so this chdir decides where the run writes its files.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/XGB_model2')
# Per the printed log this runs a k-fold cross-validation for every cell type.
# NOTE(review): what exactly is written under save_path is defined in
# evan_models.run_XGB_kfold_for_types — confirm there.
all_metrics_df, cv_results_dict, likelihood_dict = evan_xgb.run_XGB_kfold_for_types(save_path='./')

K-fold CV for: B


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 13802
[0.9996599986476268, 0.9982792253235419, 0.9977355072463767, 0.9980062386338615, 0.9999949916144631, 0.9999422088081674, 0.9978210653356949]
K-fold CV for: CD4_T


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 41053
[0.9775753114619563, 0.9536671791833591, 0.9580804532325159, 0.9558665710656309, 0.9969533474682384, 0.9902737918605162, 0.9408420504076016]
K-fold CV for: CD8_T


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 25318
[0.9804730689946253, 0.9487762019458129, 0.9259877300613496, 0.9372365478338154, 0.9948523470365682, 0.9805426659965111, 0.9257772474930974]
K-fold CV for: DC


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 3575
[0.9986090835853529, 0.9730002202357781, 0.9641230116648993, 0.9685208690600625, 0.9998787081079596, 0.9950686776385769, 0.9678310122147498]
K-fold CV for: Mono


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 49086
[0.9953404271299213, 0.9878437055665659, 0.9968884089774448, 0.9923452667370884, 0.9989095830291571, 0.9956651951830265, 0.9890176560290851]
K-fold CV for: NK


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 18638
[0.9939572425716298, 0.9735641432509647, 0.974081076044866, 0.9738198407471563, 0.9995150429925838, 0.9965325753367882, 0.9704060549790802]
K-fold CV for: other


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 2950
[0.9915617652126013, 0.8786175005016311, 0.7004296320739152, 0.7791618571171857, 0.9091592012320622, 0.7758682820688323, 0.7802524404348621]
K-fold CV for: other_T


Parameters: { "verbose" } are not used.



Cross-validation...
likelihood > 0.5: 6712
[0.993679051586102, 0.9546454889803615, 0.8919183858796661, 0.9221565845867128, 0.9977457041186579, 0.9713581140448138, 0.9194751960133385]


In [10]:
# Display the per-cell-type metrics table returned by run_XGB_kfold_for_types.
all_metrics_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score,ROC-AUC,PR-AUC,MCC
B,0.99966,0.998279,0.997736,0.998006,0.999995,0.999942,0.997821
CD4_T,0.977575,0.953667,0.95808,0.955867,0.996953,0.990274,0.940842
CD8_T,0.980473,0.948776,0.925988,0.937237,0.994852,0.980543,0.925777
DC,0.998609,0.973,0.964123,0.968521,0.999879,0.995069,0.967831
Mono,0.99534,0.987844,0.996888,0.992345,0.99891,0.995665,0.989018
NK,0.993957,0.973564,0.974081,0.97382,0.999515,0.996533,0.970406
other,0.991562,0.878618,0.70043,0.779162,0.909159,0.775868,0.780252
other_T,0.993679,0.954645,0.891918,0.922157,0.997746,0.971358,0.919475


In [11]:
# Display the raw cross-validation results returned by run_XGB_kfold_for_types.
# NOTE(review): this prints the whole nested dict; consider showing a summary instead.
cv_results_dict

{'B': {'fit_time': [10.132755756378174,
   10.246147394180298,
   10.286897659301758,
   9.937744855880737,
   10.286897659301758],
  'score_time': [0.20319414138793945,
   0.1551530361175537,
   0.1575927734375,
   0.2488265037536621,
   0.15859222412109375],
  'test_accuracy': [0.9995750106247344,
   0.9995363573139634,
   0.9996522679854726,
   0.9997681786569816,
   0.9997681786569816],
  'test_precision': [0.9990913221263062,
   0.9986376021798365,
   0.9972862957937585,
   0.9977396021699819,
   0.998641304347826],
  'test_recall': [0.9959239130434783,
   0.9959239130434783,
   0.998641304347826,
   0.9995471014492754,
   0.998641304347826],
  'test_f1_score': [0.9975051031980041,
   0.9972789115646259,
   0.9979633401221996,
   0.9986425339366516,
   0.998641304347826],
  'test_roc_auc': [0.9999983930948992,
   0.9999988712927899,
   0.9999974556261195,
   0.9999845041891491,
   0.9999957338693582],
  'test_average_precision': [0.9999828959920366,
   0.9999879038420043,
   0.999

In [13]:
# Write the metrics table into the XGB_model2 folder through an explicit path,
# so the destination does not depend on whichever os.chdir call ran last.
all_metrics_df.to_csv(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/XGB_model2' / 'XGBclassifier_binary_metrics_l1.csv')