In [1]:
from pathlib import Path

# Name of the project root folder we want to locate, and where the search starts.
home_folder = 'evan_home'
current_path = Path.cwd()

# Check the working directory itself first, then each ancestor in turn
# (nearest first); stop at the first directory whose name matches.
for parent in (current_path, *current_path.parents):
    if parent.name != home_folder:
        continue
    home_path = parent
    break
else:
    # The loop ran to completion without a match anywhere on the path.
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")

print("Home Path:", home_path)

# Sub-directories of the project root used by the rest of the notebook.
source_code_dir = home_path.joinpath('Source_code')
dataset_dir = home_path.joinpath('Dataset')


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import sys
# Earlier machine-specific locations of the project code, kept disabled for reference:
# sys.path.append('/Users/evanli/Documents/EvanPys/Progress')
# sys.path.append('/home/jovyan/work/GitHub/EvanPys/Progress')
# sys.path.append(r'C:\Users\evanlee\Documents\GitHub\EvanPys\Progress')
# Add the Source_code directory (derived from the discovered home path) to the
# import search path. This must run before the ADlasso2 import below, which is
# presumably resolved from that directory -- TODO confirm the package location.
sys.path.append(str(source_code_dir))
from ADlasso2 import AD2_w_utils_lossdiff_noZ as ad

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
import json

In [3]:
import scanpy as sc

# Read the HCC_Lu AnnData file from the project's dataset directory
# (the file name suggests it is preprocessed but not scaled) and report its shape.
adata = sc.read_h5ad(
    dataset_dir.joinpath('HCC_Lu/HCC_Lu_preprocessed_noscale.h5ad')
)
print('Original adata:', adata.shape)

Original adata: (71915, 25712)


In [3]:
def plot_feature_property(adata, Y, AD_object, celltype):
    """Compute feature properties for one cell type and save its selection-profile plot.

    The function calls ``ad.featureProperty(adata.X, Y, AD_object)``, attaches the
    feature names from ``adata.var_names`` as a ``featureID`` column, prints the
    first rows, and writes a scatter plot of ``prevalence_1`` (x, "Target
    prevalence") against ``prevalence_0`` (y, "Other prevalence") to
    ``./feature_property_plot_new/{celltype}_selection_profile.png``.

    Parameters
    ----------
    adata
        Object exposing ``.X`` and ``.var_names`` (presumably an AnnData); it is
        only read, never modified.
    Y
        Labels forwarded unchanged to ``ad.featureProperty``.
    AD_object
        Model object forwarded unchanged to ``ad.featureProperty``.
    celltype
        Name used in the plot title and in the output file name.

    Returns
    -------
    The table returned by ``ad.featureProperty`` (presumably a pandas DataFrame)
    with the added ``featureID`` column.

    Side effects
    ------------
    Prints ``prop.head()``; creates ``./feature_property_plot_new`` (relative to
    the current working directory) if it is missing and writes the PNG there.
    """
    # Get feature property
    prop = ad.featureProperty(adata.X, Y, AD_object)
    prop['featureID'] = adata.var_names
    print(prop.head())

    # (value in the 'select' column, legend label, colour, alpha), listed in
    # drawing order: the grey background points go first so the selected
    # features are painted on top of them.
    layers = (
        ('No selected', 'Others', '#BCBCBC', 0.5),
        ('PreLect_positive', 'PreLect_positive', 'r', 1),
        ('PreLect_negative', 'PreLect_negative', 'b', 1),
    )
    # Order in which the entries should appear in the legend.
    legend_order = ('PreLect_positive', 'PreLect_negative', 'Others')

    # Draw on a dedicated figure instead of whatever figure happens to be
    # current, so repeated calls in a loop cannot bleed into each other.
    fig, ax = plt.subplots()
    try:
        for select_value, legend_label, color, alpha in layers:
            sns.scatterplot(
                x="prevalence_1", y="prevalence_0",
                color=color, alpha=alpha,
                data=prop[prop['select'] == select_value],
                label=legend_label, ax=ax,
            )

        # Build the legend from the handles that actually exist. A category
        # with no rows contributes no handle, so picking handles by fixed
        # position would raise IndexError; looking them up by label does not.
        handles, labels = ax.get_legend_handles_labels()
        handle_by_label = dict(zip(labels, handles))
        present = [name for name in legend_order if name in handle_by_label]
        if present:
            ax.legend(
                handles=[handle_by_label[name] for name in present],
                labels=present,
                loc='upper left',
                fontsize='small',
            )

        ax.set_xlabel('Target prevalence')
        ax.set_ylabel('Other prevalence')
        ax.set_title(f'{celltype}: selection profile')

        out_path = f'./feature_property_plot_new/{celltype}_selection_profile.png'
        # savefig does not create missing directories.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path, dpi=300)
    finally:
        # Always release this figure (and only this one), even if plotting or
        # saving failed, so long loops do not accumulate open figures.
        plt.close(fig)

    return prop



In [None]:
# Work from the feature-selection results directory: the model pickles opened
# below and the ./feature_property_plot_new/ output folder are both given as
# paths relative to the current working directory.
os.chdir(source_code_dir / 'HCC_case_study/feature_selection_k3')

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before the loop starts.
os.makedirs('./feature_property_plot_new', exist_ok=True)

# NOTE(review): the cluster count (24) is hard-coded -- confirm it matches the
# categories present in adata.obs['leiden'].
clusters = [f'Leiden_{i}' for i in range(24)]
label = adata.obs['leiden'].tolist()
for clus in clusters:
    print('=====')
    print(clus)
    # Binary (one-vs-rest) labels: 1 for cells in this cluster, 0 otherwise.
    celltype_label = [1 if x == clus else 0 for x in label]

    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files that were produced by this project.
    with open(f'{clus}_PreL_model.pkl', 'rb') as f:
        prelect = pickle.load(f)

    # plot_feature_property prints the head of the table and saves the plot;
    # the table itself is stored next to the plot as CSV.
    property_df = plot_feature_property(adata, celltype_label, prelect, clus)
    property_df.to_csv(f'./feature_property_plot_new/{clus}_property_df.csv')

=====
Leiden_0
   meanAbundance  Variance       select  prevalence  prevalence_0  \
0       0.000839  0.000518  No selected    0.002322      0.001496   
1       0.046556  0.029140  No selected    0.117222      0.082202   
2       0.040168  0.022520  No selected    0.107113      0.082916   
3       0.001608  0.000998  No selected    0.005131      0.003158   
4       0.000339  0.000173  No selected    0.001196      0.000848   

   prevalence_1      featureID  
0      0.006549   RP11-34P13.7  
1      0.296394     FO538757.2  
2      0.230907     AP006222.2  
3      0.015224  RP4-669L17.10  
4      0.002977   RP5-857K21.4  
   meanAbundance  Variance       select  prevalence  prevalence_0  \
0       0.000839  0.000518  No selected    0.002322      0.001496   
1       0.046556  0.029140  No selected    0.117222      0.082202   
2       0.040168  0.022520  No selected    0.107113      0.082916   
3       0.001608  0.000998  No selected    0.005131      0.003158   
4       0.000339  0.000173 