## Feature selection using loss convergence
Features are selected for each cell type at its optimal lambda, taken from the previous lambda-tuning step (tuning based on weight difference).

In [1]:
import os
import sys
# sys.path.append('/home/evanlee/PBMC_Hao')
sys.path.append('/home/jovyan/work/GitHub/EvanPys/Progress')
from ADlasso2 import AD2_w_utils_loss_nopvl as ad

import numpy as np
import pandas as pd
from pathlib import Path
import scanpy as sc
import sklearn
from scipy.sparse import csr_matrix
from sklearn.metrics.cluster import adjusted_rand_score
import copy
import json
import time
import multiprocessing as mp


In [2]:
# %% Feature selection with optimal lambda
def pipeline_feature_selection(data, celltype, label, opt_lmbd, output_path='', device='cuda'):
    """Fit ADlasso2 for one cell type at a fixed lambda and export the result.

    The labels are binarised (``celltype`` vs. everything else), the model is
    fitted on ``data.X``, and two files are written into ``output_path``:
    ``{celltype}_features.txt`` (via ``opt_res.writeList``) and
    ``{celltype}_description.txt`` (a short text summary of the fit).

    Parameters
    ----------
    data : object exposing ``.X`` (expression matrix) and ``.var_names``
        (feature names), e.g. an AnnData object.
    celltype : str
        The label value treated as the positive class.
    label : sequence
        One label per row of ``data.X``.
    opt_lmbd : float
        Regularisation strength passed to ``ad.ADlasso2`` as ``lmbd``.
    output_path : str, optional
        Directory receiving the output files. An empty string means the
        current working directory. The directory is created if missing.
    device : str, optional
        Device string forwarded to ``ad.ADlasso2`` (default ``'cuda'``).

    Notes
    -----
    The working directory of the process is left untouched; both output
    files are addressed by explicit paths.
    """
    print('====================')
    print('Starting job for {}'.format(celltype))
    st = time.time()

    # Binary classification of a celltype: 1 for the target, 0 otherwise
    celltype_label = [1 if x == celltype else 0 for x in label]
    # Row indices of the target cells (loop variable named so that it does
    # not shadow the `label` argument)
    celltype_indices = [idx for idx, flag in enumerate(celltype_label) if flag == 1]

    # Find marker genes with optimal lambda
    pvl = ad.get_prevalence(data.X, celltype_indices)
    print('Fitting with optimal lambda:', opt_lmbd)
    opt_res = ad.ADlasso2(lmbd=opt_lmbd, loss_threshold=1e-2, tol=1e-5, echo=True, device=device)
    opt_res.fit(data.X, celltype_label, pvl)

    # Export selection results. Paths are built explicitly instead of calling
    # os.chdir(), which would change global state and fail for output_path=''.
    out_dir = output_path if output_path else os.curdir
    os.makedirs(out_dir, exist_ok=True)
    feature_file = os.path.join(out_dir, f'{celltype}_features.txt')
    opt_res.writeList(outpath=feature_file, featureNameList=data.var_names)
    print(f'{celltype} feature list exported.')

    et = time.time()
    elapsed = (et-st)/60

    # Output description
    description = f'''Optimal lambda: {opt_lmbd}
    median of selected prevalence: {np.median([pvl[i]  for i, w in enumerate(opt_res.feature_set) if w != 0])}
    minimal loss: {opt_res.loss_}
    minimal weight diff: {opt_res.convergence_}
    total selected feature: {np.sum(opt_res.feature_set)}
    Time elapsed: {elapsed}\n'''
    print('---Selection result for {}'.format(celltype))
    print(description)

    description_file = os.path.join(out_dir, f'{celltype}_description.txt')
    with open(description_file, 'w') as f:
        f.write(description)


In [3]:
# %% Multi-processing to run feature selection

# Define a function to execute pipeline_feature_selection for a single cell type
def run_pipeline_feature_selection(
    celltype,
    adata_path='/home/jovyan/work/Research_datasets/Hao_PBMC_level2_rep_cells.h5ad',
    lambda_json_path='/home/jovyan/work/GitHub/EvanPys/Progress/PBMC_Hao/Level2_loss_converge/L2_optimal_lambda.json',
    output_path='/home/jovyan/work/GitHub/EvanPys/Progress/PBMC_Hao/test_without_prevalence',
):
    """Load the dataset, filter genes for one cell type, and run feature selection.

    Steps:
      1. Read the AnnData file at ``adata_path``.
      2. Drop every gene that has no non-zero value in the cells labelled
         ``celltype`` (column ``celltype.l2`` of ``adata.obs``).
      3. Look up the lambda for ``celltype`` in the JSON file at
         ``lambda_json_path``.
      4. Call ``pipeline_feature_selection`` with the filtered data, writing
         results to ``output_path``.

    Parameters
    ----------
    celltype : str
        Level-2 cell type name; must occur in ``adata.obs['celltype.l2']``
        and be a key of the lambda JSON.
    adata_path, lambda_json_path, output_path : str, optional
        Input/output locations. Defaults are the paths originally hard-coded
        in this notebook.

    Raises
    ------
    ValueError
        If no cell in the dataset carries the label ``celltype``.
    KeyError
        If ``celltype`` has no entry in the lambda JSON.
    """
    st = time.time()
    # Read adata
    adata = sc.read_h5ad(adata_path)
    print('Original adata:', adata.shape)  # (32340, 20568)

    ### Remove the genes whose expression is zero in all cells of this celltype
    is_celltype = (adata.obs['celltype.l2'] == celltype).to_numpy()
    if not is_celltype.any():
        raise ValueError(f'No cells labelled {celltype!r} found in celltype.l2')

    # Materialise the cell-type rows as an independent CSR matrix. Calling
    # eliminate_zeros() directly on `view.X` is ineffective: for an AnnData
    # view, `.X` is re-derived from the parent on each access, so the in-place
    # change would land on a temporary and be discarded.
    X_celltype = csr_matrix(adata[is_celltype].X, copy=True)
    print('adata celltype shape:', X_celltype.shape)

    # Remove explicitly stored zeros so getnnz() counts true non-zeros only
    X_celltype.eliminate_zeros()

    # Keep the columns (genes) with at least one non-zero entry
    expressed = X_celltype.getnnz(axis=0) > 0
    adata = adata[:, expressed]
    print('adata shape after removing all zero columns for celltype cells:', adata.shape)
    del X_celltype, expressed

    # Level-2 celltype as labels
    label = adata.obs['celltype.l2'].tolist()

    ### Read optimal lambda dictionary from json (v3 lambda_decision_new)
    with open(lambda_json_path, 'r') as f:
        opt_lambda_dict = json.load(f)
    if celltype not in opt_lambda_dict:
        raise KeyError(f'No optimal lambda for {celltype!r} in {lambda_json_path}')
    opt_lmbd = opt_lambda_dict[celltype]
    print('optimal lambda:', opt_lmbd)

    pipeline_feature_selection(adata, celltype, label, opt_lmbd, output_path=output_path)

    et = time.time()
    print(f'Elapsed time for {celltype}: {(et-st)/60:.2f} minutes')


In [4]:
# %% Main code
# Load the full dataset once, only to enumerate the level-2 cell type names.
adata = sc.read_h5ad(
    '/home/jovyan/work/Research_datasets/Hao_PBMC_level2_rep_cells.h5ad'
)
# One label per cell, as plain Python values
label = adata.obs['celltype.l2'].to_list()
# Sorted, de-duplicated cell type names
types = np.unique(label).tolist()
print('all cell types:', types)


all cell types: ['ASDC', 'B_intermediate', 'B_memory', 'B_naive', 'CD14_Mono', 'CD16_Mono', 'CD4_CTL', 'CD4_Naive', 'CD4_Proliferating', 'CD4_TCM', 'CD4_TEM', 'CD8_Naive', 'CD8_Proliferating', 'CD8_TCM', 'CD8_TEM', 'Doublet', 'Eryth', 'HSPC', 'ILC', 'MAIT', 'NK', 'NK_CD56bright', 'NK_Proliferating', 'Plasmablast', 'Platelet', 'Treg', 'cDC1', 'cDC2', 'dnT', 'gdT', 'pDC']


In [5]:
# Run the whole selection pipeline for a single cell type.
celltype = 'ASDC'
run_pipeline_feature_selection(celltype=celltype)

Original adata: (32340, 20568)
adata celltype shape: (15, 20568)
adata shape after removing all zero columns for celltype cells: (32340, 9595)
optimal lambda: 0.00014677992676220703
Starting job for ASDC
Fitting with optimal lambda: 0.00014677992676220703
Convergence with loss threshold
Convergence with loss threshold
Converge history:`
{0: 0.6931473016738892, 100: 0.02825481817126274, 200: 0.013114890083670616}
minimum epoch =  247 ; minimum lost =  0.009952339343726635 ; diff weight =  0.0007679003174416721
ASDC feature list exported.
---Selection result for ASDC
Optimal lambda: 0.00014677992676220703
    median of selected prevalence: 0.26666666666666666
    minimal loss: 0.009952339343726635
    minimal weight diff: 0.0007679003174416721
    total selected feature: 7757
    Time elapsed: 2.0861826380093893

Elapsed time for ASDC: 2.13 minutes
