# DESeq2 MLflow example

This notebook shows how MLflow can be used to keep track of a bioinformatics analysis.

We use the tutorial from PyDESeq2, the Python implementation of DESeq2, as an example of a differential expression analysis.

## Notebook parameters

These parameters control some of the options for the DESeq2 analysis. Modify them to customize the behaviour of the analysis.
They are defined in the "Logging Run parameters" section below, and their values are tracked in MLflow.

## Load libraries

In [0]:
!pip install pydeseq2


In [0]:
import os
import pickle as pkl

import numpy as np

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats
from pydeseq2.utils import load_example_data

# Switch controlling whether this notebook writes its outputs to disk.
SAVE = False

if SAVE:
    # Destination directory for the saved results; change it to the location
    # where you want them stored.
    OUTPUT_PATH = "../output_files/synthetic_example"
    # Ensure the destination exists before anything is written into it.
    os.makedirs(OUTPUT_PATH, exist_ok=True)

## Initialize MLflow experiment

In [0]:
import mlflow
import databricks.connect as db_connect
import mlflow.tracking._model_registry.utils

def _uc_registry_uri():
    """Return the registry URI that replaces MLflow's Spark-session lookup."""
    return "databricks-uc"


# Workaround: set the registry URI manually by overriding MLflow's private
# lookup helper with a function that always answers "databricks-uc".
mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = _uc_registry_uri

# Authenticate against Databricks.
mlflow.login()  # This prints an INFO-log: Login successful!
# mlflow.set_model_uri("databricks")

# Get (or create) a Databricks Connect session with the serverless option enabled.
spark_ctx = db_connect.DatabricksSession.builder.serverless(True).getOrCreate()
# train_and_log_ml_model(spark_ctx)

In [0]:

# Point MLflow tracking at the Databricks workspace, so everything logged by
# the cells below (params, inputs, metrics) is sent there.
mlflow.set_tracking_uri("databricks")


In [0]:
import mlflow

experiment_name = "/Shared/DESEQ2_tutorial"
client = mlflow.tracking.MlflowClient()

# Look the experiment up by name; create it only when the lookup finds nothing.
experiment = client.get_experiment_by_name(experiment_name)
experiment_id = (
    mlflow.create_experiment(experiment_name)
    if experiment is None
    else experiment.experiment_id
)

print(f"Experiment ID: {experiment_id}")

In [0]:
# Open the run inside the experiment resolved earlier in the notebook. Without
# `experiment_id` the looked-up/created experiment would be ignored and the run
# recorded under whatever experiment MLflow considers active by default.
# The run stays active until `mlflow.end_run()` is called at the end.
mlflow.start_run(experiment_id=experiment_id)

### Logging Run parameters

In [0]:
# Analysis switches; both are recorded as parameters of the active MLflow run.
refit_cooks = False  # Refit after filtering outliers based on Cooks distance
lfcShrink = True  # Apply LFC Shrinkage to results

mlflow.log_params(dict(refit_cooks=refit_cooks, lfcShrink=lfcShrink))

## Load data

In [0]:
# Load the two modalities of the "synthetic" example dataset through
# pydeseq2's helper: the raw count table and the accompanying metadata.
counts_df = load_example_data(modality="raw_counts", dataset="synthetic", debug=False)
# display(counts_df)
metadata = load_example_data(modality="metadata", dataset="synthetic", debug=False)

In [0]:
import mlflow
from mlflow.models.signature import infer_signature
from mlflow.data import from_pandas

# Convert the counts DataFrame to an MLflow Dataset. Giving it an explicit name
# keeps it distinguishable in the run's inputs; datasets created without a name
# all fall back to the same default name.
mlflow_dataset = from_pandas(counts_df, name="synthetic_raw_counts")

# Record the dataset as an input of the active run
mlflow.log_input(mlflow_dataset)

In [0]:
import mlflow
from mlflow.models.signature import infer_signature
from mlflow.data import from_pandas

# Convert the metadata DataFrame to an MLflow Dataset. Giving it an explicit
# name keeps it distinguishable in the run's inputs; datasets created without a
# name all fall back to the same default name.
mlflow_metadata = from_pandas(metadata, name="synthetic_metadata")

# Record the dataset as an input of the active run
mlflow.log_input(mlflow_metadata)

## Initializing DESeq dataset

In [0]:
# Inference backend handed to the dataset, configured with 8 CPUs.
inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
    counts=counts_df,
    metadata=metadata,
    design="~condition",  # compare samples based on the "condition"
    # column ("B" vs "A")
    # Pass the notebook parameter instead of a hard-coded True, so that the
    # `refit_cooks` value logged to MLflow is the one the analysis really uses.
    refit_cooks=refit_cooks,
    inference=inference,
)

### Computing Normalization

In [0]:
# Fit the size factors used for normalization.
dds.fit_size_factors()

# Show the size factors held in `dds.obsm` (last expression => cell output).
dds.obsm["size_factors"]

### Fit GeneWise dispersion

In [0]:
# Fit the gene-wise dispersions.
dds.fit_genewise_dispersions()

# Show the gene-wise dispersions held in `dds.varm` (last expression => cell output).
dds.varm["genewise_dispersions"]

### Fit Dispersion Trend coefficients

In [0]:
# Fit the dispersion trend.
dds.fit_dispersion_trend()
# NOTE: only the last bare expression of a cell is rendered, so the trend
# coefficients on the next line are evaluated but not shown; the fitted
# dispersions below are what the cell displays.
dds.uns["trend_coeffs"]
dds.varm["fitted_dispersions"]

### Fit Dispersion Priors

In [0]:
# Fit the dispersion prior, then print two values read from `dds.uns`:
# the entry under "_squared_logres" and the entry under "prior_disp_var".
dds.fit_dispersion_prior()
print(
    f"logres_prior={dds.uns['_squared_logres']}, sigma_prior={dds.uns['prior_disp_var']}"
)

### MAP (maximum a posteriori) dispersions

In [0]:
# Fit the MAP dispersions.
dds.fit_MAP_dispersions()
# NOTE: only the last bare expression of a cell is rendered, so the
# "MAP_dispersions" entry is evaluated but not shown; the "dispersions"
# entry below is what the cell displays.
dds.varm["MAP_dispersions"]
dds.varm["dispersions"]

## Computing LFC

In [0]:
# Fit the log-fold changes (LFC).
dds.fit_LFC()
# Show the LFC entry held in `dds.varm` (last expression => cell output).
dds.varm["LFC"]

### Calculate Cooks Distance

In [0]:
# Compute the Cook's distances on the dataset.
dds.calculate_cooks()
# Refit only when the tracked notebook parameter `refit_cooks` asks for it (in
# addition to the dataset's own flag), matching the guard used by the following
# cell and the value logged to MLflow.
if refit_cooks and dds.refit_cooks:
    # Replace outlier counts
    dds.refit()

### Refit Cooks Distance

In [0]:
dds.calculate_cooks()

# Refit only when both the notebook parameter and the dataset flag are set.
if refit_cooks and dds.refit_cooks:
    # Replace outlier counts
    dds.refit()

# Optionally pickle the fitted dataset into the output directory.
if SAVE:
    dds_pickle_path = os.path.join(OUTPUT_PATH, "dds_detailed_pipe.pkl")
    with open(dds_pickle_path, "wb") as pickle_file:
        pkl.dump(dds, pickle_file)

## Statistical Analysis

In [0]:
# Build the statistics object on top of the fitted dataset `dds`.
ds = DeseqStats(
    dds,
    # presumably selects the second design coefficient (condition B vs A) — TODO confirm
    contrast=np.array([0, 1]),
    # the same 0.05 threshold is applied to `padj` when the metrics are computed later on
    alpha=0.05,
    # these two flags are read back below (`ds.cooks_filter`, `ds.independent_filter`)
    # to decide which filtering / adjustment steps are run
    cooks_filter=True,
    independent_filter=True,
)

### Wald test

In [0]:
# Run the Wald test, then show the resulting p-values (last expression => cell output).
ds.run_wald_test()
ds.p_values

### Cook's distance filtering

In [0]:
# Apply Cook's-distance filtering to the p-values, only when the `refit_cooks`
# notebook parameter is enabled and the stats object has `cooks_filter` set.
if refit_cooks and ds.cooks_filter:
    ds._cooks_filtering()
# Keep the p-values as the cell's last top-level expression: nested inside the
# `if` block, the bare expression was evaluated but never rendered.
ds.p_values

## P-Value adjustment

In [0]:
# Adjust the p-values: plain adjustment when independent filtering is disabled,
# otherwise go through the independent-filtering step.
if not ds.independent_filter:
    ds._p_value_adjustment()
else:
    ds._independent_filtering()

## Summary

In [0]:
# Run the summary step of the stats object; the results table is read from
# `ds.results_df` in the cells further down.
ds.summary()


## LFC shrinkage

In [0]:
# `lfc_shrink` and `summary` store their outcome on the stats object
# (`ds.results_df`) and return None, so `results` is bound to the results
# table itself rather than to their return value.
if lfcShrink:
    # Apply LFC shrinkage to the "condition[T.B]" coefficient
    ds.lfc_shrink(coeff="condition[T.B]")
else:
    ds.summary()
results = ds.results_df

In [0]:
# Display the results table held by the stats object.
ds.results_df

In [0]:
# Optionally pickle the stats object into the output directory.
if SAVE:
    stats_pickle_path = os.path.join(OUTPUT_PATH, "shrunk_results_detailed_pipe.pkl")
    with open(stats_pickle_path, "wb") as pickle_file:
        pkl.dump(ds, pickle_file)

## Log results to MLflow

In [0]:
# Take one handle on the results table and derive the reusable pieces once.
results_table = ds.results_df
significant_rows = results_table[results_table["padj"] < 0.05]
log2_fold_changes = results_table["log2FoldChange"]

# Genes with adjusted p-value below 0.05, and whether "gene3" is among them.
n_significant_genes = len(significant_rows)
is_gene3_significant = "gene3" in significant_rows.index

# Counts of positive / negative log2 fold changes over the whole table
# (not restricted to the significant genes), plus the mean log2 fold change.
upregulated_genes = len(results_table[log2_fold_changes > 0])
downregulated_genes = len(results_table[log2_fold_changes < 0])
average_fold_ratio = log2_fold_changes.mean()

# Preview the values computed above (last expression => cell output).
n_significant_genes, is_gene3_significant, average_fold_ratio, upregulated_genes, downregulated_genes


In [0]:
# Record the summary statistics as metrics of the active run. MLflow metrics
# are numeric, so the boolean flag is logged explicitly as 0/1 instead of
# passing a Python bool through.
mlflow.log_metrics(
    {
        "n_significant_genes": n_significant_genes,
        "is_gene3_significant": int(is_gene3_significant),
        "upregulated_genes": upregulated_genes,
        "downregulated_genes": downregulated_genes,
        "average_fold_ratio": average_fold_ratio
    }
)

In [0]:
# Close the active run that was opened with `mlflow.start_run()` earlier in the notebook.
mlflow.end_run()