In [1]:
import os
import sys
import pandas as pd

notebook_dir = os.path.abspath(os.path.dirname("__file__"))
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))  # Go up one level

In [2]:
# Import from mozzarellm package
from mozzarellm import analyze_gene_clusters, reshape_to_clusters
from mozzarellm.prompts import ROBUST_SCREEN_CONTEXT, ROBUST_CLUSTER_PROMPT
from mozzarellm.configs import (
    DEFAULT_CONFIG,
    DEFAULT_ANTHROPIC_CONFIG,
    DEFAULT_GEMINI_CONFIG,
    DEFAULT_OPENAI_CONFIG,
    DEFAULT_OPENAI_REASONING_CONFIG,
)

# You can also set these variables in your notebook or script
# os.environ["OPENAI_API_KEY"] = "your_openai_key_here"
# os.environ["ANTHROPIC_API_KEY"] = "your_anthropic_key_here"
# os.environ["GOOGLE_API_KEY"] = "your_google_key_here"

In [3]:
# Read sample data from the notebook dir:
sample_data = pd.read_csv(os.path.join(notebook_dir, "sample_data.csv"))

In [6]:
# Reshape sample data
cluster_df, gene_features = reshape_to_clusters(
    input_df=sample_data, uniprot_col="uniprot_function", verbose=True
)

Using provided DataFrame with 140 rows
Found 140 genes across 6 clusters
Extracting gene features from uniprot_function column


In [8]:
display(cluster_df)
display(gene_features)

Unnamed: 0,cluster_id,genes
0,21,AATF;ABT1;BYSL;BMS1;C1orf131;EIF3M;EIF4A1;ESF1...
1,37,SRSF3;PDPK1;RICTOR;RPTOR;SEH1L;SGF29;PRKAR1A;P...
2,121,CCDC174;FAM32A;GABPA;SP2;N6AMT1;SETD2;SON;POU5...
3,149,KRAS;BRAF;NDUFV2;NDUFA6;NDUFC1;RAD23B;SNAPC1;N...
4,167,POMP;PSMA2;PSMB7;PSMB3;PSMA7;PSMB2;PSMA1;PSMA4...
5,197,SPAST;NCOR2;NCAPD3;HNRNPD;MCM3;METTL14;METTL3;...


Unnamed: 0,gene_symbol,uniprot_function
0,AATF,"Part of the small subunit (SSU) processome, fi..."
1,ABT1,Could be a novel TATA-binding protein (TBP) wh...
2,BYSL,Required for processing of 20S pre-rRNA precur...
3,BMS1,GTPase required for the synthesis of 40S ribos...
4,C1orf131,"Part of the small subunit (SSU) processome, fi..."
...,...,...
135,METTL3,The METTL3-METTL14 heterodimer forms a N6-meth...
136,FITM1,Plays an important role in the formation of li...
137,PTCHD4,Could act as a repressor of canonical hedgehog...
138,VRK1,Serine/threonine kinase involved in the regula...


In [9]:
print(ROBUST_SCREEN_CONTEXT)


Genes grouped within a cluster tend to exhibit similar morphological phenotypes in this context, suggesting that they may participate in the same biological process or pathway. However, not all clusters will correspond to a defined or coherent biological pathway.

When evaluating pathway confidence, apply these stringent criteria:

HIGH CONFIDENCE:
- Multiple well-established genes (≥3) with strong literature support in the same specific pathway
- Clear functional relationship between genes that explains the observed phenotypic clustering
- Genes represent different aspects or components of the same biological process
- The pathway assignment explains >60% of genes in the cluster

MEDIUM CONFIDENCE:
- Some established genes (1-2) from a specific pathway, with additional supporting genes
- Functional relationship is plausible but has some gaps or uncertainties
- Some genes in the cluster have unclear relationship to the proposed pathway
- The pathway assignment explains 40-60% of genes

In [10]:
print(ROBUST_CLUSTER_PROMPT)


Analyze gene cluster {cluster_id} to identify the dominant biological pathway and classify genes:

Genes: {gene_list}

For each cluster:
1. Identify the dominant biological pathway, focusing on specific molecular mechanisms rather than general terms
2. For clusters with coherent biological signatures, classify each gene into one of three mutually exclusive categories:
   - ESTABLISHED: Well-known members of the identified pathway with clear functional roles in this pathway
   - UNCHARACTERIZED: Genes with minimal to no functional annotation in ANY published literature
   - NOVEL_ROLE: Genes with published functional annotation in OTHER pathways that may have additional roles in the dominant pathway

3. For both UNCHARACTERIZED and NOVEL_ROLE genes:
   - Assign a priority score (1-10) for follow-up investigation
   - Provide a rationale explaining why this gene merits investigation

4. Provide a concise summary of the key findings for each cluster

When classifying and prioritizing gen

In [11]:
RESULTS_DIR = os.path.join(project_root, "results", "example_data")

In [None]:
# Create results directory if it doesn't exist
os.makedirs(RESULTS_DIR, exist_ok=True)

# Run analysis with OpenAI GPT-4o
openai_results = analyze_gene_clusters(
    # Input data options
    input_df=cluster_df,
    # Model and configuration
    model_name="o4-mini",
    config_path=DEFAULT_OPENAI_REASONING_CONFIG,
    # Analysis context and prompts
    screen_context=ROBUST_SCREEN_CONTEXT,
    cluster_analysis_prompt=ROBUST_CLUSTER_PROMPT,
    # Gene annotations
    gene_annotations_df=gene_features,
    # Processing options
    batch_size=1,
    # Output options
    output_file=f"{RESULTS_DIR}/gpt-4o",
    save_outputs=True,
    outputs_to_generate=["json", "clusters", "flagged_genes"],
)

ERROR:root:Error loading config file: stat: path should be string, bytes, os.PathLike or integer, not dict
INFO:cluster_analysis_20250506_143344.log:Processing 6 clusters with model o4-mini


Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Created annotations dictionary with 140 entries from DataFrame


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250506_143344.log:Accessing OpenAI API


Using default template for type: cluster
Appending output format instructions to template
Added 39 gene feature descriptions to prompt


INFO:cluster_analysis_20250506_143344.log:API call successful: 11799 tokens, $0.1180
INFO:cluster_analysis_20250506_143344.log:Success for cluster 21
Processing clusters:  17%|█▋        | 1/6 [00:47<03:56, 47.38s/it]INFO:cluster_analysis_20250506_143344.log:Accessing OpenAI API


Using default template for type: cluster
Appending output format instructions to template
Added 33 gene feature descriptions to prompt
