# Bulk Walkthrough

Output directory structure (**TODO**: move to README)

```
bulk_walkthrough_output/
├── checkpoints/
│   ├── checkpoint_241201_143022.pkl
│   └── checkpoint_my_experiment_001.pkl
├── 241201_143022_data/
│   ├── pso_selected_genes.pkl
│   ├── pso_selected_genes.txt
│   ├── pso_final_results.pkl
│   └── ...
└── my_experiment_001_data/
    └── ...
```

Last updated: 26.08.2025

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, which (per the
# IPython documentation) re-imports all modules before each cell is executed,
# so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import os
import pickle
from datetime import datetime

import pandas as pd

import PAGEpy
from PAGEpy import plot_functions, pso, utils
from PAGEpy.dataset_class import GeneExpressionDataset
from PAGEpy.models import AdvancedNN, TrainingConfig

# Configure logging for this notebook session at INFO level.
# NOTE(review): handlers/format are set up inside PAGEpy.setup_jupyter_logging,
# which is not visible here.
PAGEpy.setup_jupyter_logging(level=logging.INFO)
# Named logger used for the status messages emitted by the cells below.
logger = logging.getLogger("Bulk Walkthrough")

In [None]:
# Initialize CUDA for GPU support.
# NOTE(review): `gpu_available` is not read again in the cells visible here;
# presumably it is a flag telling whether TensorFlow found a usable GPU —
# confirm against utils.init_tensorflow.
gpu_available = utils.init_tensorflow()

### Configure output filenames

By default, the run ID is the current date and time; change it manually to resume a previous run.

In [None]:
# Identifier of this run. It is embedded in the name of the per-run data
# directory and passed on to the PSO step. Defaults to the current local time
# formatted as YYMMDD_HHMMSS; set it manually to the ID of an earlier run to
# reuse that run's data directory.
run_id = datetime.now().strftime("%y%m%d_%H%M%S")
#  run_id = "TEST"

# Root folder shared by all runs, and the per-run data folder inside it.
output_dir = "bulk_walkthrough_output"
data_directory = os.path.join(output_dir, f"{run_id}_data")

# exist_ok=True makes re-running this cell (or resuming an earlier run) a
# no-op and avoids the check-then-create race of a separate os.path.exists()
# test. It still raises if a regular file occupies one of the paths, which
# surfaces the problem here instead of at the first write.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(data_directory, exist_ok=True)

## Create Dataset

In [None]:
# TODO: instead of file patterns, use whole filenames

# Files written while building the dataset go into this run's data directory.
feature_set_path = os.path.join(data_directory, "feature_set.pkl")
train_samples_path = os.path.join(data_directory, "train_samples.txt")

# Build the dataset from the bulk input files; the selected feature set and
# the training sample names are written to the two paths above.
current_data = GeneExpressionDataset(
    data_dir="../../bulk_data/",
    counts_pattern="count_matrix.mtx",
    barcodes_pattern="sample_names.txt",
    # I think here it's possible to set already processed gene list:
    genes_pattern="gene_names.txt",
    metadata_pattern="response_labels.csv",
    gene_selection="Diff",
    pval_cutoff=0.00005,
    pval_correction="benjamini-hochberg",
    features_out_filename=feature_set_path,
    train_samples_out_filename=train_samples_path,
    positive_label="yes",  # TODO: find out how Sean originally encoded it
)

In [None]:
# Option A: use the genes selected during dataset creation...
# genes_path = current_data.selected_features

# Option B (active): ...or read the selected genes back from the .pkl file
# stored in this run's data directory.
genes_path = os.path.join(data_directory, "feature_set.pkl")

with open(genes_path, mode="rb") as genes_file:
    current_genes = pickle.load(genes_file)

logger.info(f"Loaded {len(current_genes)} genes as 'current_genes'")

## Initialize and train NN Model

Set NN model parameters

In [None]:
# Model configuration; values are the defaults retrieved from Sean's code.
config = TrainingConfig(
    learning_rate=0.001,
    auc_threshold=1,
    report_frequency=1,
)

# Keyword arguments forwarded to the `.train()` calls further down.
training_params = dict(
    # n_epochs=50,
    n_epochs=500,
    batch_size=64,
    seed=42,
)

Train NN model before PSO

In [None]:
# Baseline network whose input width equals the number of genes in
# `current_genes`.
initial_model = AdvancedNN(config=config, n_input_features=len(current_genes))

# Train/test arrays taken directly from the dataset object.
# NOTE(review): the post-PSO cell obtains its arrays through
# current_data.get_scaled_feature_subset(); presumably the attributes used
# here are already restricted to `current_genes` and scaled — confirm.
initial_split = {
    "x_train": current_data.x_train,
    "y_train": current_data.y_train,
    "x_test": current_data.x_test,
    "y_test": current_data.y_test,
}

# Fit the baseline model with the shared training parameters.
train_history = initial_model.train(**initial_split, **training_params)

In [None]:
# Output locations for the baseline training curves: the figure itself and
# the underlying metrics table.
initial_plot_path = os.path.join(data_directory, "initial_model_history.png")
initial_metrics_path = os.path.join(
    data_directory, "initial_training_metrics.csv")

plot_functions.plot_model_history(
    model_history=train_history,
    report_frequency=initial_model.config.report_frequency,
    y_train=current_data.y_train,
    y_test=current_data.y_test,
    save_path=initial_plot_path,  # path for saving the plot on disk
    data_save_path=initial_metrics_path,
)

## Run binary PSO

Run the binary PSO algorithm to find the features (genes) that lead the NN model to a better solution.

**TODO**: is the algorithm leveraging the GPU?

In [None]:
# SimpleNN is only used as the model class handed to the PSO run below, hence
# the cell-local import.
from PAGEpy.models import SimpleNN


# Run the binary PSO over the candidate genes in `current_genes`.
# The call returns the best solution together with its fitness; the commented
# example in the next section treats `best_solution` as a 0/1 mask aligned
# with `current_genes`. Later cells read pso_*.pkl files from
# `data_directory`, so results are expected to be written under
# `output_prefix`.
# The commented-out values next to some arguments are smaller settings kept
# for quick switching (presumably for fast test runs).
best_solution, best_fitness = pso.run_binary_pso(
    run_id=run_id,
    input_data=current_data,
    feature_names=current_genes,
    pop_size=200,
    # pop_size=3,
    n_generations=15,
    # n_generations=4,
    # NOTE(review): w / c1 / c2 are presumably the usual PSO inertia weight and
    # cognitive / social coefficients — confirm against pso.run_binary_pso.
    w=1,
    c1=2,
    c2=1.5,
    n_reps=4,
    #  n_reps=1,
    verbose=True,
    # verbose=False,
    adaptive_metrics=False,
    output_prefix=data_directory,
    model_class=SimpleNN,
)

In [None]:
# Locate the PSO result files inside this run's data directory...
fitness_scores_path = os.path.join(data_directory, "pso_fitness_scores.pkl")
particle_history_path = os.path.join(
    data_directory, "pso_particle_history.pkl")

# ...and load them back for plotting.
loaded_fitness_scores = pd.read_pickle(fitness_scores_path)
loaded_particle_history = pd.read_pickle(particle_history_path)

In [None]:
# Plot PSO results; every figure is given a save path inside this run's data
# directory.
fitness_plot_path = os.path.join(data_directory, "pso_fitness_evolution.png")
diversity_plot_path = os.path.join(
    data_directory, "pso_population_diversity.png")
frequency_plot_path = os.path.join(
    data_directory, "pso_feature_selection_frequency.png")

# Fitness over the course of the run.
plot_functions.plot_pso_fitness_evolution(
    fitness_history=loaded_fitness_scores,
    save_path=fitness_plot_path,
)
# Diversity of the particle population.
plot_functions.plot_population_diversity(
    particle_history=loaded_particle_history,
    save_path=diversity_plot_path,
)
# How often each feature was selected.
plot_functions.plot_feature_selection_frequency(
    particle_history=loaded_particle_history,
    save_path=frequency_plot_path,
)

## Initialize and train improved NN Model

In [None]:
# The optimised solution is saved locally. Alternatively, the variable
# returned by the binary PSO function can be turned into the gene list
# directly (see the commented line below):
# pso_genes = [item for item, m in zip(current_genes, best_solution) if m == 1]

# Otherwise, just load the locally stored gene list.
pso_genes_path = os.path.join(data_directory, "pso_selected_genes.pkl")
with open(pso_genes_path, mode="rb") as genes_file:
    pso_genes = pickle.load(genes_file)

In [None]:
# Train a fresh model on the PSO-selected genes so it can be evaluated against
# the baseline. Its input width equals the number of selected genes.
n_pso_input_features = len(pso_genes)
improved_model = AdvancedNN(config=config, n_input_features=n_pso_input_features)

# Select the feature subset and scale the data.
x_train, x_test, y_train, y_test = current_data.get_scaled_feature_subset(
    feature_subset=pso_genes,
)

# Fit the model with the same training parameters as the baseline.
improved_split = {
    "x_train": x_train,
    "y_train": y_train,
    "x_test": x_test,
    "y_test": y_test,
}
improved_train_history = improved_model.train(
    **improved_split, **training_params)

In [None]:
# Output locations for the post-PSO training curves: the figure itself and the
# underlying metrics table.
improved_plot_path = os.path.join(data_directory, "improved_model_history.png")
improved_metrics_path = os.path.join(
    data_directory, "improved_training_metrics.csv")

plot_functions.plot_model_history(
    model_history=improved_train_history,
    report_frequency=improved_model.config.report_frequency,
    y_train=y_train,
    y_test=y_test,
    save_path=improved_plot_path,  # path for saving the plot on disk
    data_save_path=improved_metrics_path,
)