💡💡 **Train a NN on as many genes as possible to estimate an upper bound on the achievable accuracy** ✨

- with bulk data

**Author:** Prisca Dotti

**Last modified:** 10.09.2025

In [14]:
# Print the installed TensorFlow version so the environment used for this run
# is visible in the notebook output.
import tensorflow as tf
print(tf.__version__)

2.16.2


In [None]:
# Load IPython's autoreload extension and enable mode 2, which reloads
# modules automatically before executing code typed at the IPython prompt,
# so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import os
import pickle
from datetime import datetime

import pandas as pd

import PAGEpy
from PAGEpy import plot_functions, pso, utils
from PAGEpy.dataset_class import GeneExpressionDataset
from PAGEpy.models import AdvancedNN, TrainingConfig

# Configure logging for notebook use at INFO level through the project
# helper, then create a named logger for messages emitted by this notebook.
PAGEpy.setup_jupyter_logging(level=logging.INFO)
logger = logging.getLogger("Train Large NN")

In [None]:
# Initialize CUDA for GPU support through the project helper.
# NOTE(review): the return value is presumably a flag telling whether a GPU
# was found; confirm against utils.init_tensorflow.
gpu_available = utils.init_tensorflow()

### Configure output filenames

The run ID is normally the current date and time. Set it manually (as done below) to use a fixed ID or to resume a previous run.

In [None]:
# Identifier that namespaces this run's output files. It is pinned to a fixed
# value here, so repeated runs reuse the same directory; switch to the
# timestamp line below to give each run its own uniquely named directory.
# run_id = datetime.now().strftime("%y%m%d_%H%M%S")
run_id = "TEST"

# Root folder for all runs and the per-run sub-folder for data artifacts.
output_dir = "large_nn_output"
data_directory = os.path.join(output_dir, f"{run_id}_data")

# makedirs creates any missing parent (so output_dir is created as well), and
# exist_ok=True makes the cell safe to re-run and avoids the race inherent in
# a separate "exists?" check followed by a create.
os.makedirs(data_directory, exist_ok=True)

## Create Dataset

In [None]:
# Build the dataset object from the bulk-data folder. The two *_out_filename
# arguments point inside this run's data directory (presumably where the
# dataset class stores the selected features and the training sample names).
current_data = GeneExpressionDataset(
    data_dir="../../bulk_data/",
    counts_pattern="count_matrix.mtx",
    barcodes_pattern="sample_names.txt",
    # NOTE(review): an already processed gene list can possibly be supplied
    # through this pattern (unverified).
    genes_pattern="gene_names.txt",
    metadata_pattern="response_labels.csv",
    gene_selection="",
    features_out_filename=os.path.join(data_directory, "feature_set.pkl"),
    train_samples_out_filename=os.path.join(data_directory, "train_samples.txt"),
    # TODO: find out how Sean originally encoded the positive label.
    positive_label="yes",
)

In [None]:
# Keep a handle on the features selected by the dataset; the length of this
# collection is what sizes the network's input further down.
current_genes = current_data.selected_features

## Initialize and train NN Model

Set NN model parameters

In [None]:
# Training configuration; the values are the defaults retrieved from Sean's
# original code.
config = TrainingConfig(
    report_frequency=1,
    auc_threshold=1,
    learning_rate=0.001,
    multiplier=12,
)

# Keyword arguments that are unpacked into the model's train() call.
# Alternative kept for reference: n_epochs=50.
training_params = dict(
    n_epochs=500,
    batch_size=64,
    seed=42,
)

Train NN model with all genes

In [None]:
# Create the network with one input feature per selected gene.
initial_model = AdvancedNN(n_input_features=len(current_genes), config=config)

# Run training on the train split, handing over the test split as well, and
# keep the returned history for plotting.
train_history = initial_model.train(
    x_train=current_data.x_train,
    y_train=current_data.y_train,
    x_test=current_data.x_test,
    y_test=current_data.y_test,
    **training_params,
)

In [None]:
# Plot the training history; the figure and the metrics table are directed to
# files inside this run's data directory.
plot_functions.plot_model_history(
    model_history=train_history,
    report_frequency=initial_model.config.report_frequency,
    y_train=current_data.y_train,
    y_test=current_data.y_test,
    # Destination of the saved figure (drop this argument to skip saving;
    # TODO confirm that it is optional).
    save_path=os.path.join(data_directory, "large_NN_history.png"),
    data_save_path=os.path.join(data_directory, "large_NN_metrics.csv"),
)