In [1]:
import os
import sys

# Locate the project root so the local `mozzarellm` package can be imported.
# NOTE: the previous form, os.path.dirname('__file__'), operated on the *string*
# '__file__' (notebooks do not define the __file__ variable), so it always
# evaluated to '' and resolved to the current working directory. That is made
# explicit here: this assumes the kernel's working directory is the directory
# containing this notebook, one level below the project root.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))  # Go up one level

# Prepend (rather than append) so the project checkout is searched first;
# the membership check keeps re-running this cell from adding duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:
# Import necessary libraries
from dotenv import load_dotenv

# Import from mozzarellm package
from mozzarellm import analyze_gene_clusters

# Load environment variables (for API keys)
load_dotenv()

# Set up paths and parameters.
# All paths are built relative to `project_root`, one component per argument,
# so os.path.join picks the correct separator on every platform.
PROJECT_NAME = "example_analysis"
# Directory that receives the per-model analysis outputs for this project
RESULTS_DIR = os.path.join(project_root, "results", PROJECT_NAME)
# Input table of gene clusters passed to analyze_gene_clusters as `input_file`
PROCESSED_FILE = os.path.join(project_root, "data", "sample_gene_sets.csv")
# Gene feature table passed to analyze_gene_clusters as `gene_features_path`
GENE_FEATURES = os.path.join(project_root, "data", "HeLa_essentials", "essentials_uniprot.csv")

In [3]:
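# # Optional preprocessing (currently disabled): reshape raw data into cluster format
# # using the refactored `reshape_to_clusters` function. To re-enable this cell, first
# # import `reshape_to_clusters` and define `INPUT_FILE`; neither is set up in the cells above.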
# print("Preprocessing data: Converting raw data to cluster format...")
# cluster_df = reshape_to_clusters(
#     input_file=INPUT_FILE,
#     output_file=PROCESSED_FILE,
#     sep=",",
#     gene_col="gene_symbol_0",
#     cluster_col="cluster",
#     gene_sep=";"
# )

# # Display a sample of the reshaped data
# print("Sample of reshaped data:")
# display(cluster_df.head())

In [4]:
# Create results directory if it doesn't exist
os.makedirs(RESULTS_DIR, exist_ok=True)

# Both model runs read the same prompt and screen-info files from this folder.
prompts_dir = os.path.join(project_root, "mozzarellm", "prompts")

# Keyword arguments that are identical for every model run. Keeping them in one
# place guarantees the models are compared on exactly the same inputs and
# settings; only the output location, config file and model name vary per run.
shared_analysis_kwargs = dict(
    input_file=PROCESSED_FILE,
    custom_prompt_path=os.path.join(prompts_dir, "top_targets.txt"),
    gene_features_path=GENE_FEATURES,
    screen_info_path=os.path.join(prompts_dir, "HeLa_interphase_screen_info.txt"),
    gene_column="genes",
    gene_sep=";",
    batch_size=1,
    cluster_id_column="cluster_id",
)

# Step 1: Run analysis with OpenAI GPT-4o
print("Running analysis with OpenAI GPT-4o...")
openai_results = analyze_gene_clusters(
    output_file=os.path.join(RESULTS_DIR, "gpt-4o"),
    config_path=os.path.join(project_root, "config_openai.json"),
    model_name="gpt-4o",
    **shared_analysis_kwargs,
)

# Step 2: Run analysis with Anthropic Claude-3-7-Sonnet
print("Running analysis with Anthropic Claude-3-7-Sonnet...")
claude_results = analyze_gene_clusters(
    output_file=os.path.join(RESULTS_DIR, "claude-3-7-sonnet"),
    config_path=os.path.join(project_root, "config_anthropic.json"),
    model_name="claude-3-7-sonnet-20250219",
    **shared_analysis_kwargs,
)

print(f"Analysis complete. Results saved to {RESULTS_DIR}/")

Running analysis with OpenAI GPT-4o...
Loaded data with 7 rows and columns: ['cluster_id', 'genes']
Loaded features for 1095 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/7 [00:00<?, ?it/s]

Added 6 gene feature descriptions to prompt


Processing clusters:  14%|█▍        | 1/7 [00:09<00:54,  9.16s/it]

Added 2 gene feature descriptions to prompt


Processing clusters:  29%|██▊       | 2/7 [00:14<00:35,  7.07s/it]

Added 5 gene feature descriptions to prompt


Processing clusters:  43%|████▎     | 3/7 [00:23<00:30,  7.63s/it]

Added 3 gene feature descriptions to prompt


Processing clusters:  57%|█████▋    | 4/7 [00:28<00:20,  6.69s/it]

Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_180431.log:Saved progress for 5 clusters
Processing clusters:  71%|███████▏  | 5/7 [00:37<00:14,  7.49s/it]INFO:cluster_analysis_20250501_180431.log:Accessing OpenAI API


Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_180431.log:API call successful: 2272 tokens, $0.0227
INFO:cluster_analysis_20250501_180431.log:Success for cluster 37
Processing clusters:  86%|████████▌ | 6/7 [00:45<00:07,  7.69s/it]INFO:cluster_analysis_20250501_180431.log:Accessing OpenAI API


No relevant gene features found for this cluster


INFO:cluster_analysis_20250501_180431.log:API call successful: 1796 tokens, $0.0180
INFO:cluster_analysis_20250501_180431.log:Success for cluster 94
Processing clusters: 100%|██████████| 7/7 [00:48<00:00,  6.99s/it]
INFO:cluster_analysis_20250501_180431.log:Completed analysis for 7 clusters
INFO:cluster_analysis_20250501_180520.log:Processing 7 clusters with model claude-3-7-sonnet-20250219


Running analysis with Anthropic Claude-3-7-Sonnet...
Loaded data with 7 rows and columns: ['cluster_id', 'genes']
Loaded features for 1095 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/7 [00:00<?, ?it/s]INFO:cluster_analysis_20250501_180520.log:Using Anthropic Claude API


Added 6 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_180520.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_180520.log:Success for cluster 149
Processing clusters:  14%|█▍        | 1/7 [00:23<02:19, 23.23s/it]INFO:cluster_analysis_20250501_180520.log:Using Anthropic Claude API


Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_180520.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_180520.log:Success for cluster 121
Processing clusters:  29%|██▊       | 2/7 [00:41<01:42, 20.43s/it]INFO:cluster_analysis_20250501_180520.log:Using Anthropic Claude API


Added 5 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_180520.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_180520.log:Success for cluster 21
Processing clusters:  43%|████▎     | 3/7 [01:03<01:24, 21.12s/it]INFO:cluster_analysis_20250501_180520.log:Using Anthropic Claude API


Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_180520.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_180520.log:Success for cluster 167
Processing clusters:  57%|█████▋    | 4/7 [01:15<00:52, 17.37s/it]INFO:cluster_analysis_20250501_180520.log:Using Anthropic Claude API


Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_180520.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_180520.log:Success for cluster 197
INFO:cluster_analysis_20250501_180520.log:Saved progress for 5 clusters
Processing clusters:  71%|███████▏  | 5/7 [01:30<00:32, 16.49s/it]INFO:cluster_analysis_20250501_180520.log:Using Anthropic Claude API


Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_180520.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_180520.log:Success for cluster 37
Processing clusters:  86%|████████▌ | 6/7 [01:42<00:15, 15.11s/it]INFO:cluster_analysis_20250501_180520.log:Using Anthropic Claude API


No relevant gene features found for this cluster


INFO:cluster_analysis_20250501_180520.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_180520.log:Success for cluster 94
Processing clusters: 100%|██████████| 7/7 [01:52<00:00, 16.06s/it]
INFO:cluster_analysis_20250501_180520.log:Completed analysis for 7 clusters


Analysis complete. Results saved to /lab/barcheese01/mdiberna/mozzarellm/results/example_analysis/
