In [1]:
import os
import sys

# Notebook kernels generally do not define `__file__`, so the notebook
# directory is taken to be the kernel's current working directory. (Passing the
# quoted string "__file__" to dirname() only ever produced "" -- i.e. the same
# working directory -- while reading as if it located this file.)
notebook_dir = os.path.abspath(os.getcwd())

# The project root is assumed to sit one level above the notebook directory.
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Prepend the project root so project-local packages win over any installed
# copy; the membership check keeps re-running this cell from stacking duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:
# Import necessary libraries
from dotenv import load_dotenv

# Import from mozzarellm package
from mozzarellm import analyze_gene_clusters

# Load environment variables (for API keys) before any model client is used.
load_dotenv()

# Set up paths and parameters.
# PROJECT_NAME names the sub-directory of <project_root>/results that receives
# this run's output.
PROJECT_NAME = "example_analysis"
RESULTS_DIR = os.path.join(project_root, "results", PROJECT_NAME)

# All inputs live under <project_root>/data.
DATA_DIR = os.path.join(project_root, "data")

# CSV of gene clusters passed to the analysis as `input_file`.
PROCESSED_FILE = os.path.join(DATA_DIR, "sample_gene_sets.csv")

# Per-gene feature table passed to the analysis as `gene_features_path`.
# Each directory level is its own join() component so the separator is always
# the platform's own rather than a hard-coded "/".
GENE_FEATURES = os.path.join(DATA_DIR, "HeLa_essentials", "essentials_uniprot.csv")

In [3]:
# Create results directory if it doesn't exist
os.makedirs(RESULTS_DIR, exist_ok=True)


def _run_cluster_analysis(label, model_name, config_path, output_prefix):
    """Run ``analyze_gene_clusters`` for one model with the shared settings.

    Every argument that is identical across models (input file, prompt,
    gene features, screen info, column names, batch size) is fixed here so
    the per-model runs cannot drift apart.

    Args:
        label: Human-readable model name used in the progress message.
        model_name: Model identifier forwarded as ``model_name``.
        config_path: Provider-specific config file forwarded as ``config_path``.
        output_prefix: File-name stem placed under ``RESULTS_DIR`` and
            forwarded as ``output_file``.

    Returns:
        Whatever ``analyze_gene_clusters`` returns for this model.
    """
    print(f"Running analysis with {label}...")
    return analyze_gene_clusters(
        input_file=PROCESSED_FILE,
        output_file=f"{RESULTS_DIR}/{output_prefix}",
        config_path=config_path,
        model_name=model_name,
        custom_prompt_path="top_targets.txt",
        gene_features_path=GENE_FEATURES,
        screen_info_path="HeLa_interphase_screen_info.txt",
        gene_column="genes",
        gene_sep=";",
        batch_size=1,
        cluster_id_column="cluster_id",
    )


# Run analysis with OpenAI GPT-4o
openai_results = _run_cluster_analysis(
    label="OpenAI GPT-4o",
    model_name="gpt-4o",
    config_path="config_openai.json",
    output_prefix="gpt-4o",
)

# Run analysis with Anthropic Claude-3-7-Sonnet
claude_results = _run_cluster_analysis(
    label="Anthropic Claude-3-7-Sonnet",
    model_name="claude-3-7-sonnet-20250219",
    config_path="config_anthropic.json",
    output_prefix="claude-3-7-sonnet",
)

print(f"Analysis complete. Results saved to {RESULTS_DIR}/")

INFO:cluster_analysis_20250503_185128.log:Processing 7 clusters with model gpt-4o


Running analysis with OpenAI GPT-4o...
Loaded data with 7 rows and columns: ['cluster_id', 'genes']
Loaded features for 1095 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/7 [00:00<?, ?it/s]INFO:cluster_analysis_20250503_185128.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 6 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185128.log:API call successful: 3679 tokens, $0.0368
INFO:cluster_analysis_20250503_185128.log:Success for cluster 149
Processing clusters:  14%|█▍        | 1/7 [00:06<00:39,  6.51s/it]INFO:cluster_analysis_20250503_185128.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185128.log:API call successful: 2312 tokens, $0.0231
INFO:cluster_analysis_20250503_185128.log:Success for cluster 121
Processing clusters:  29%|██▊       | 2/7 [00:14<00:35,  7.18s/it]INFO:cluster_analysis_20250503_185128.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 5 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185128.log:API call successful: 2569 tokens, $0.0257
INFO:cluster_analysis_20250503_185128.log:Success for cluster 21
Processing clusters:  43%|████▎     | 3/7 [00:22<00:30,  7.51s/it]INFO:cluster_analysis_20250503_185128.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185128.log:API call successful: 2734 tokens, $0.0273
INFO:cluster_analysis_20250503_185128.log:Success for cluster 167
Processing clusters:  57%|█████▋    | 4/7 [00:30<00:23,  7.95s/it]INFO:cluster_analysis_20250503_185128.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185128.log:API call successful: 2645 tokens, $0.0265
INFO:cluster_analysis_20250503_185128.log:Success for cluster 197
INFO:cluster_analysis_20250503_185128.log:Saved progress for 5 clusters
Processing clusters:  71%|███████▏  | 5/7 [00:36<00:13,  7.00s/it]INFO:cluster_analysis_20250503_185128.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185128.log:API call successful: 2234 tokens, $0.0223
INFO:cluster_analysis_20250503_185128.log:Success for cluster 37
Processing clusters:  86%|████████▌ | 6/7 [00:42<00:06,  6.85s/it]INFO:cluster_analysis_20250503_185128.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
No relevant gene features found for this cluster


INFO:cluster_analysis_20250503_185128.log:API call successful: 1797 tokens, $0.0180
INFO:cluster_analysis_20250503_185128.log:Success for cluster 94
Processing clusters: 100%|██████████| 7/7 [00:46<00:00,  6.64s/it]
INFO:cluster_analysis_20250503_185128.log:Completed analysis for 7 clusters
INFO:cluster_analysis_20250503_185215.log:Processing 7 clusters with model claude-3-7-sonnet-20250219


Running analysis with Anthropic Claude-3-7-Sonnet...
Loaded data with 7 rows and columns: ['cluster_id', 'genes']
Loaded features for 1095 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/7 [00:00<?, ?it/s]INFO:cluster_analysis_20250503_185215.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 6 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185215.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250503_185215.log:Success for cluster 149
Processing clusters:  14%|█▍        | 1/7 [00:21<02:09, 21.58s/it]INFO:cluster_analysis_20250503_185215.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185215.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250503_185215.log:Success for cluster 121
Processing clusters:  29%|██▊       | 2/7 [00:39<01:37, 19.50s/it]INFO:cluster_analysis_20250503_185215.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 5 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185215.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250503_185215.log:Success for cluster 21
Processing clusters:  43%|████▎     | 3/7 [01:08<01:35, 23.85s/it]INFO:cluster_analysis_20250503_185215.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185215.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250503_185215.log:Success for cluster 167
Processing clusters:  57%|█████▋    | 4/7 [01:21<00:57, 19.31s/it]INFO:cluster_analysis_20250503_185215.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185215.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250503_185215.log:Success for cluster 197
INFO:cluster_analysis_20250503_185215.log:Saved progress for 5 clusters
Processing clusters:  71%|███████▏  | 5/7 [01:34<00:34, 17.30s/it]INFO:cluster_analysis_20250503_185215.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250503_185215.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250503_185215.log:Success for cluster 37
Processing clusters:  86%|████████▌ | 6/7 [01:49<00:16, 16.55s/it]INFO:cluster_analysis_20250503_185215.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
No relevant gene features found for this cluster


INFO:cluster_analysis_20250503_185215.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250503_185215.log:Success for cluster 94
Processing clusters: 100%|██████████| 7/7 [02:00<00:00, 17.19s/it]
INFO:cluster_analysis_20250503_185215.log:Completed analysis for 7 clusters


Analysis complete. Results saved to /lab/barcheese01/mdiberna/mozzarellm/results/example_analysis/
