In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from omegaconf import OmegaConf
from gfs.utils import get_datetime, get_paths
from gfs.data.hemisphere import PyGAnnDataGraphDataModule
import yaml
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

In [38]:
# Parse the experiment YAML into plain Python containers first ...
config_path = "../gfs/configs/antelope.yaml"
with open(config_path, "r") as config_file:
    raw_config = yaml.safe_load(config_file)

# ... then wrap them in an OmegaConf object so later cells can use
# attribute-style access (config.expname, config.data.*).
config = OmegaConf.create(raw_config)

In [3]:
# Resolve project paths and derive this run's experiment name and log location.
paths = get_paths()
expname = get_datetime(expname=config.expname)
data_root = paths["data_root"]
log_path = data_root + f"logs/{expname}"

# Every data-module option other than the data directory is read from the
# `data` section of the config and forwarded under the same keyword name.
data_cfg = config.data
data_option_names = (
    "file_names",
    "cell_type",
    "spatial_coords",
    "self_loops_only",
    "batch_size",
    "n_hops",
    "d_threshold",
    "n_splits",
    "cv",
    "rand_seed",
)
datamodule = PyGAnnDataGraphDataModule(
    data_dir=data_root,
    **{name: getattr(data_cfg, name) for name in data_option_names},
)

# Prepare the module for the "fit" stage; later cells read datamodule.data.
datamodule.setup("fit")



In [7]:
# Features, labels and split masks are all read from the single data object
# held by the data module.
graph = datamodule.data
train_mask, val_mask = graph.train_mask, graph.val_mask

# NOTE(review): `:-3` drops the last three feature columns — presumably they
# are not regular input features; confirm against the data module.
X_train, y_train = graph.x[train_mask, :-3], graph.celltype[train_mask]

# NOTE(review): the "test" arrays are selected with the validation mask.
X_test, y_test = graph.x[val_mask, :-3], graph.celltype[val_mask]


In [8]:
# Fit a cross-validated (cv=5) lasso on the training split. The fitted model
# is reused below as the pre-fitted estimator that drives feature selection.
lasso_cv = LassoCV(cv=5)
lasso_cv.fit(X=X_train, y=y_train)

In [41]:
from sklearn.exceptions import ConvergenceWarning
from warnings import simplefilter

# Install a process-wide filter so ConvergenceWarning messages are not shown.
simplefilter(action="ignore", category=ConvergenceWarning)


In [43]:
for num_features in (10, 20, 100):
    # Keep at most `num_features` columns, chosen from the already-fitted
    # lasso model (prefit=True, so nothing is re-fitted here).
    sfm = SelectFromModel(lasso_cv, prefit=True, max_features=num_features)
    X_train_selected = sfm.transform(X_train)
    selected_feature_indices = sfm.get_support(indices=True)

    print(f"Number of features selected: {X_train_selected.shape[1]}")
    print("Indices of top k features:", selected_feature_indices)

    # Train a fresh logistic regression on the selected columns only, then
    # score it on the held-out split restricted to the same columns.
    # NOTE(review): ConvergenceWarning is silenced elsewhere in this notebook,
    # so a non-converged fit would go unnoticed here — confirm whether
    # max_iter needs raising before trusting these accuracies.
    lr = LogisticRegression(random_state=42).fit(
        X_train[:, selected_feature_indices], y_train
    )
    y_pred = lr.predict(X_test[:, selected_feature_indices])

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Num_features ({num_features}) Accuracy: {accuracy:.3f} \n")




Number of features selected: 10
Indices of top k features: [  2  12  35  46  63 137 196 326 373 435]
Num_features (10) Accuracy: 0.664 

Number of features selected: 20
Indices of top k features: [  2  12  35  46  51  52  63  74  83 119 137 153 178 196 271 303 326 363
 373 435]
Num_features (20) Accuracy: 0.792 

Number of features selected: 100
Indices of top k features: [  0   2   3   8  12  22  24  27  28  30  32  35  41  45  46  48  51  52
  53  56  57  58  63  66  71  74  75  76  82  83  90  92  96  97  98 103
 119 120 122 131 137 138 142 145 146 148 153 154 156 157 158 159 163 167
 168 173 178 196 205 208 214 215 217 221 230 245 257 264 265 269 271 273
 281 287 300 303 319 324 325 326 334 336 343 354 360 363 364 372 373 378
 389 405 416 417 429 435 438 442 450 474]
Num_features (100) Accuracy: 0.928 

