In [1]:
import os
import sys
import pandas as pd
from collections import defaultdict

# Notebooks have no real __file__, so the notebook directory is taken to be the
# current working directory (the original dirname("__file__") on a string
# literal evaluates to "" and therefore resolves to the same place).
notebook_dir = os.path.abspath(os.curdir)
# The project root is the parent of the notebook directory.
project_root = os.path.normpath(os.path.join(notebook_dir, ".."))
# Put the project root first on the import path, but only once.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [2]:
# Read the Funk dataset (the first spreadsheet row is skipped on load)
funk_raw = pd.read_excel(
    os.path.join(project_root, "data/benchmarks/1-s2.0-S0092867422013599-mmc4.xlsx"),
    skiprows=1,
)
# Interphase clusters kept for the benchmark
funk_clusters = [149, 121, 21, 167, 197, 37]
# Build the cleaned table as one non-mutating chain. The previous version
# renamed and sorted in place on a frame derived by selection, which can emit
# SettingWithCopyWarning; every step here returns a new frame instead.
funk = (
    funk_raw[funk_raw["Interphase cluster"].isin(funk_clusters)]
    .reset_index(drop=True)
    # Normalise the column names shared by all benchmark tables
    .rename(
        columns={
            "Gene symbol": "gene_symbol",
            "Interphase cluster": "cluster",
        }
    )
    # Drop all other columns
    .loc[:, ["cluster", "gene_symbol"]]
    # Group rows of the same cluster together
    .sort_values(by=["cluster"])
)
# Display the Funk dataset
display(funk)

Unnamed: 0,cluster,gene_symbol
0,21,AATF
1,21,ABT1
6,21,BYSL
4,21,BMS1
7,21,C1orf131
...,...,...
56,197,METTL3
31,197,FITM1
98,197,PTCHD4
134,197,VRK1


In [3]:
# Load the Wainberg co-essential modules sheet; rows 0 and 1 are skipped on load
wainberg = pd.read_excel(
    os.path.join(project_root, 'data/benchmarks/Supplementary_Data_2.xlsx'),
    sheet_name='Co-essential Modules',
    skiprows=(0,1),
)
# Keep only modules 2067 and 2213
selected_modules = wainberg['Module #'].isin([2067, 2213])
wainberg = wainberg[selected_modules].reset_index(drop=True)

# Besides 'Genes', further gene symbols sit in the auto-named 'Unnamed: X' columns
overflow_cols = [name for name in wainberg.columns if name.startswith('Unnamed:')]

# Collect one {'cluster', 'gene_symbol'} record per gene, module by module
reshaped_data = []
for _, record in wainberg.iterrows():
    module_num = record['Module #']

    # The 'Genes' cell is taken as-is whenever it is not missing
    first_gene = record['Genes']
    if not pd.isna(first_gene):
        reshaped_data.append({'cluster': module_num, 'gene_symbol': first_gene})

    for col in overflow_cols:
        value = record[col]
        if pd.isna(value):
            continue
        if isinstance(value, str):
            # Blank / whitespace-only strings are not gene symbols
            if value.strip() != '':
                reshaped_data.append({'cluster': module_num, 'gene_symbol': value})
        elif isinstance(value, (int, float)):
            # Numeric cells are kept, stored as their string form
            reshaped_data.append({'cluster': module_num, 'gene_symbol': str(value)})

# Long-format table, ordered by module number
wainberg = pd.DataFrame(reshaped_data).sort_values(by=["cluster"])
# Display the Wainberg dataset
display(wainberg)

Unnamed: 0,cluster,gene_symbol
0,2067,AAGAB
1,2067,AGFG1
2,2067,AP2A1
3,2067,AP2A2
4,2067,AP2B1
5,2067,AP2M1
6,2067,AP2S1
7,2067,C15orf57
8,2067,FCHO2
9,2067,PICALM


In [4]:
# Read the Schaffer U2OS cell-map assemblies
schaffer_raw = pd.read_excel(
    os.path.join(project_root, "data/benchmarks/U2OS Cell Map Assemblies.xlsx")
)
# Select Assembly ID C5255, C5415
schaffer_selected = schaffer_raw[
    schaffer_raw['Assembly ID'].isin(['C5255', 'C5415'])
].reset_index(drop=True)

# One record per (assembly, protein). Rows with a missing 'Proteins' cell are
# skipped. str.split() with no argument splits on runs of whitespace and never
# yields empty or padded tokens, so no extra strip/empty check is needed.
reshaped_data = [
    {'cluster': row['Assembly ID'], 'gene_symbol': protein}
    for _, row in schaffer_selected.iterrows()
    if pd.notna(row['Proteins'])
    for protein in str(row['Proteins']).split()
]

# The records already use the final column names, so no rename is required.
# Naming the columns explicitly keeps the sort valid even when nothing matched
# (an empty record list would otherwise produce a frame without a 'cluster' column).
schaffer = pd.DataFrame(
    reshaped_data, columns=["cluster", "gene_symbol"]
).sort_values(by=["cluster"])
# Display the Schaffer dataset
display(schaffer)

Unnamed: 0,cluster,gene_symbol
0,C5255,RPP30
13,C5255,RNASEH1
12,C5255,POP1
11,C5255,OSGEP
10,C5255,POP7
9,C5255,C18orf21
7,C5255,RPP14
8,C5255,RPP40
5,C5255,RPP38
4,C5255,GON7


In [5]:
# Read in the uniprot_data.tsv
uniprot_raw = pd.read_csv(os.path.join(project_root, "data/benchmarks/uniprot_data.tsv"), sep="\t")

# Expand the whitespace-separated `gene_names` field into one row per name.
# This is the vectorized equivalent of copying each row once per gene name:
# rows without gene names are dropped, the names are split, and explode()
# repeats the remaining columns for every name while preserving row order.
expanded_df = (
    uniprot_raw.dropna(subset=["gene_names"])
    .assign(gene_name=lambda frame: frame["gene_names"].str.split())
    .explode("gene_name")
)
# Position of each name inside its original `gene_names` list (0 = listed first).
# explode() repeats the source row label, so counting within each label gives
# the position; this relies on the unique default index produced by read_csv.
expanded_df["position"] = expanded_df.groupby(level=0).cumcount()
# A `gene_names` value that yields no tokens explodes to a single NaN row; drop it
expanded_df = expanded_df.dropna(subset=["gene_name"])

# For a name claimed by several entries, keep the entry that lists it earliest
expanded_df = expanded_df.sort_values(['gene_name', 'position'])
expanded_df = expanded_df.drop_duplicates('gene_name', keep='first')

# Select and rename columns
uniprot_data = expanded_df[["gene_name", "entry", "function"]].rename(
    columns={"function": "uniprot_function"}
)

In [6]:
# Merge each dataset with UniProt data.
# The frames are tracked in an explicit mapping and rebound by name afterwards,
# instead of being written back through locals(), which only updates variables
# when the code happens to run at module scope.
datasets = {"funk": funk, "wainberg": wainberg, "schaffer": schaffer}

# Columns contributed by the UniProt table (everything except the join key)
annotation_cols = [col for col in uniprot_data.columns if col != "gene_name"]

for dataset_name in list(datasets):
    dataset = (
        datasets[dataset_name]
        # Remove annotations left by an earlier run of this cell so that
        # re-running does not produce *_x / *_y suffixed duplicates
        .drop(columns=annotation_cols, errors="ignore")
        .merge(
            uniprot_data,
            how="left",
            left_on="gene_symbol",
            right_on="gene_name",
            # Each gene symbol must match at most one UniProt row; otherwise
            # the merge would silently duplicate cluster members
            validate="many_to_one",
        )
        .drop(columns="gene_name", errors='ignore')
    )

    # Print the number of rows in the merged dataset and the number of rows missing a uniprot_function
    print(f"{dataset_name} dataset: {len(dataset)} rows, {dataset['uniprot_function'].isna().sum()} missing uniprot_function")

    datasets[dataset_name] = dataset

# Rebind the notebook-level names to the annotated frames
funk, wainberg, schaffer = (
    datasets["funk"],
    datasets["wainberg"],
    datasets["schaffer"],
)

funk dataset: 140 rows, 5 missing uniprot_function
wainberg dataset: 18 rows, 0 missing uniprot_function
schaffer dataset: 18 rows, 1 missing uniprot_function


In [7]:
# Render the three annotated benchmark tables one after another
display(funk, wainberg, schaffer)

Unnamed: 0,cluster,gene_symbol,entry,uniprot_function
0,21,AATF,Q9NY61,"Part of the small subunit (SSU) processome, fi..."
1,21,ABT1,Q9ULW3,Could be a novel TATA-binding protein (TBP) wh...
2,21,BYSL,Q13895,Required for processing of 20S pre-rRNA precur...
3,21,BMS1,Q14692,GTPase required for the synthesis of 40S ribos...
4,21,C1orf131,Q8NDD1,"Part of the small subunit (SSU) processome, fi..."
...,...,...,...,...
135,197,METTL3,Q86U44,The METTL3-METTL14 heterodimer forms a N6-meth...
136,197,FITM1,A5D6W6,Plays an important role in the formation of li...
137,197,PTCHD4,Q6ZW05,Could act as a repressor of canonical hedgehog...
138,197,VRK1,Q99986,Serine/threonine kinase involved in the regula...


Unnamed: 0,cluster,gene_symbol,entry,uniprot_function
0,2067,AAGAB,Q6PD74,May be involved in endocytic recycling of grow...
1,2067,AGFG1,P52594,Required for vesicle docking or fusion during ...
2,2067,AP2A1,O95782,Component of the adaptor protein complex 2 (AP...
3,2067,AP2A2,O94973,Component of the adaptor protein complex 2 (AP...
4,2067,AP2B1,P63010,Component of the adaptor protein complex 2 (AP...
5,2067,AP2M1,Q96CW1,Component of the adaptor protein complex 2 (AP...
6,2067,AP2S1,P53680,Component of the adaptor protein complex 2 (AP...
7,2067,C15orf57,Q9BV29,Regulates clathrin-mediated endocytsois of car...
8,2067,FCHO2,Q0JRZ9,Functions in an early step of clathrin-mediate...
9,2067,PICALM,Q13492,Cytoplasmic adapter protein that plays a criti...


Unnamed: 0,cluster,gene_symbol,entry,uniprot_function
0,C5255,RPP30,P78346,"Component of ribonuclease P, a ribonucleoprote..."
1,C5255,RNASEH1,O60930,Endonuclease that specifically degrades the RN...
2,C5255,POP1,Q99575,"Component of ribonuclease P, a ribonucleoprote..."
3,C5255,OSGEP,Q9NPF4,Component of the EKC/KEOPS complex that is req...
4,C5255,POP7,O75817,"Component of ribonuclease P, a ribonucleoprote..."
5,C5255,C18orf21,Q32NC0,
6,C5255,RPP14,O95059,"Component of ribonuclease P, a ribonucleoprote..."
7,C5255,RPP40,O75818,"Component of ribonuclease P, a ribonucleoprote..."
8,C5255,RPP38,P78345,"Component of ribonuclease P, a ribonucleoprote..."
9,C5255,GON7,Q9BXV9,Component of the EKC/KEOPS complex that is req...


In [8]:
# Each cleaned dataset gets its own directory under data/benchmarks/ and is
# written there as clustering.csv (without the index column)
funk_output_dir = os.path.join(project_root, "data/benchmarks/funk")
wainberg_output_dir = os.path.join(project_root, "data/benchmarks/wainberg")
schaffer_output_dir = os.path.join(project_root, "data/benchmarks/schaffer")

for table, target_dir in (
    (funk, funk_output_dir),
    (wainberg, wainberg_output_dir),
    (schaffer, schaffer_output_dir),
):
    # Create the directory on first use; an existing one is fine
    os.makedirs(target_dir, exist_ok=True)
    table.to_csv(os.path.join(target_dir, "clustering.csv"), index=False)

In [9]:
from mozzarellm import analyze_gene_clusters, reshape_to_clusters

### Reshape clusters

In [11]:
# Column names of the per-gene tables and the separator handed to reshape_to_clusters
GENE_COL, CLUSTER_COL = "gene_symbol", "cluster"
UNIPROT_COL = "uniprot_function"
GENE_SEP = ";"

# Reshape every benchmark table; each dataset writes a cluster file and a
# gene-feature file into its own data/benchmarks/<dataset>/ directory
for dataset_name, cluster_df in {
    "funk": funk,
    "wainberg": wainberg,
    "schaffer": schaffer,
}.items():
    # Output locations for this dataset
    PROCESSED_FILE = os.path.join(
        project_root, f"data/benchmarks/{dataset_name}/cluster_data.csv"
    )
    GENE_FEATURES_FILE = os.path.join(
        project_root, f"data/benchmarks/{dataset_name}/uniprot_data.csv"
    )
    # Only the last dataset's return value remains bound after the loop
    cluster_reshaped_df = reshape_to_clusters(
        input_df=cluster_df,
        gene_col=GENE_COL,
        cluster_col=CLUSTER_COL,
        uniprot_col=UNIPROT_COL,
        gene_sep=GENE_SEP,
        output_file=PROCESSED_FILE,
        gene_features_output=GENE_FEATURES_FILE,
        verbose=True,
    )

Using provided DataFrame with 140 rows
Found 140 genes across 6 clusters
Extracting gene features from uniprot_function column
Saving gene features to /lab/barcheese01/mdiberna/mozzarellm/data/benchmarks/funk/uniprot_data.csv
Writing 6 clusters to output file: /lab/barcheese01/mdiberna/mozzarellm/data/benchmarks/funk/cluster_data.csv
Done!
Using provided DataFrame with 18 rows
Found 18 genes across 2 clusters
Extracting gene features from uniprot_function column
Saving gene features to /lab/barcheese01/mdiberna/mozzarellm/data/benchmarks/wainberg/uniprot_data.csv
Writing 2 clusters to output file: /lab/barcheese01/mdiberna/mozzarellm/data/benchmarks/wainberg/cluster_data.csv
Done!
Using provided DataFrame with 18 rows
Found 18 genes across 2 clusters
Extracting gene features from uniprot_function column
Saving gene features to /lab/barcheese01/mdiberna/mozzarellm/data/benchmarks/schaffer/uniprot_data.csv
Could not convert cluster_id to numeric: Unable to parse string "C5255" at positio

### Knowledge cutoffs:
- o4-mini: May 2024
- o3: May 2024
- o3-mini: Sep 2023
- gpt-4.1: May 2024
- gpt-4o: Sep 2023
- claude-3-7-sonnet-20250219: Nov 2024
- claude-3-5-haiku-20241022: Apr 2024
- gemini-2.5-pro-preview-03-25: Jan 2025
- gemini-2.5-flash-preview-04-17: Jan 2025

In [12]:
# Models to benchmark, keyed by the provider config file each group is run with
models_to_test = {
    "config_openai.json": ["o4-mini", "o3-mini", "gpt-4.1", "gpt-4o"],
    "config_anthropic.json": ["claude-3-7-sonnet-20250219", "claude-3-5-haiku-20241022"],
    "config_gemini.json": ["gemini-2.5-pro-preview-03-25", "gemini-2.5-flash-preview-04-17"]
}

# Parameters shared by every run
GENE_COL = "gene_symbol"
CLUSTER_COL = "cluster"
UNIPROT_COL = "uniprot_function"
GENE_SEP = ";"
# Column names passed to analyze_gene_clusters for the reshaped cluster file
GENE_COL_LLM = "genes"
CLUSTER_COL_LLM = "cluster_id"
# NOTE(review): the file name suggests this screen description is specific to
# the HeLa interphase screen; confirm before enabling other datasets below
SCREEN_INFO_PATH = "HeLa_interphase_screen_info.txt" 
CUSTOM_PROMPT_PATH = "top_targets.txt"

# Outcome per dataset and model: "Success" or "Failed: <error message>"
results_summary = {}

# Separator lines used in the progress printout
dataset_rule = '=' * 60
model_rule = '-' * 50

# Only 'funk' is enabled; add 'wainberg' / 'schaffer' to the list to run them too
for dataset_name in ['funk']:
    print(f"\n{dataset_rule}")
    print(f"Processing dataset: {dataset_name}")
    print(f"{dataset_rule}")

    dataset_outcomes = results_summary[dataset_name] = {}

    # Look up the cluster dataframe by its notebook-level variable name
    cluster_df = globals()[dataset_name]

    # Input files for this dataset (cluster table and gene features)
    PROCESSED_FILE = os.path.join(
        project_root, f"data/benchmarks/{dataset_name}/cluster_data.csv"
    )
    GENE_FEATURES_FILE = os.path.join(
        project_root, f"data/benchmarks/{dataset_name}/uniprot_data.csv"
    )

    for config_file, models in models_to_test.items():
        # Full path to the provider config file
        CONFIG_PATH = os.path.join(
            project_root, "mozzarellm/configs", config_file
        )

        for MODEL_NAME in models:
            print(f"\n{model_rule}")
            print(f"Running {MODEL_NAME} on {dataset_name}")
            print(f"{model_rule}")

            # Model name with '/' replaced so it can be used as a path component
            safe_model_name = MODEL_NAME.replace('/', '_')

            # Per-model results directory: results/<dataset>/<model>
            RESULTS_DIR = os.path.join(
                project_root, f"results/{dataset_name}/{safe_model_name}"
            )
            os.makedirs(RESULTS_DIR, exist_ok=True)

            # A failure for one model is recorded and reported, then the loop
            # moves on to the next model
            try:
                results = analyze_gene_clusters(
                    input_file=PROCESSED_FILE,
                    output_file=f"{RESULTS_DIR}/{safe_model_name}",
                    config_path=CONFIG_PATH,
                    model_name=MODEL_NAME,
                    custom_prompt_path=CUSTOM_PROMPT_PATH,
                    gene_features_path=GENE_FEATURES_FILE,
                    screen_info_path=SCREEN_INFO_PATH,
                    gene_column=GENE_COL_LLM,
                    gene_sep=GENE_SEP,
                    batch_size=1,
                    cluster_id_column=CLUSTER_COL_LLM,
                )

                dataset_outcomes[MODEL_NAME] = "Success"
                print(f"✓ Successfully analyzed {dataset_name} with {MODEL_NAME}")

            except Exception as e:
                dataset_outcomes[MODEL_NAME] = f"Failed: {e}"
                print(f"✗ Error analyzing {dataset_name} with {MODEL_NAME}: {e}")

INFO:cluster_analysis_20250505_144653.log:Processing 6 clusters with model o4-mini



Processing dataset: funk

--------------------------------------------------
Running o4-mini on funk
--------------------------------------------------
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 140 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250505_144653.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 39 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_144653.log:API call successful: 13443 tokens, $0.1344
INFO:cluster_analysis_20250505_144653.log:Success for cluster 21
Processing clusters:  17%|█▋        | 1/6 [00:50<04:12, 50.55s/it]INFO:cluster_analysis_20250505_144653.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 33 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_144653.log:API call successful: 21150 tokens, $0.2115
INFO:cluster_analysis_20250505_144653.log:Success for cluster 37
Processing clusters:  33%|███▎      | 2/6 [01:37<03:14, 48.70s/it]INFO:cluster_analysis_20250505_144653.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 22 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_144653.log:API call successful: 10565 tokens, $0.1057
INFO:cluster_analysis_20250505_144653.log:Success for cluster 121
Processing clusters:  50%|█████     | 3/6 [02:17<02:13, 44.54s/it]INFO:cluster_analysis_20250505_144653.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 19 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_144653.log:API call successful: 7955 tokens, $0.0796
INFO:cluster_analysis_20250505_144653.log:Success for cluster 149
Processing clusters:  67%|██████▋   | 4/6 [02:46<01:16, 38.24s/it]INFO:cluster_analysis_20250505_144653.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 16 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_144653.log:API call successful: 6852 tokens, $0.0685
INFO:cluster_analysis_20250505_144653.log:Success for cluster 167
INFO:cluster_analysis_20250505_144653.log:Saved progress for 5 clusters
Processing clusters:  83%|████████▎ | 5/6 [03:11<00:33, 33.67s/it]INFO:cluster_analysis_20250505_144653.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 11 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_144653.log:API call successful: 11053 tokens, $0.1105
INFO:cluster_analysis_20250505_144653.log:Success for cluster 197
Processing clusters: 100%|██████████| 6/6 [03:49<00:00, 38.23s/it]
INFO:cluster_analysis_20250505_144653.log:Completed analysis for 6 clusters
INFO:cluster_analysis_20250505_145042.log:Processing 6 clusters with model o3-mini


✓ Successfully analyzed funk with o4-mini

--------------------------------------------------
Running o3-mini on funk
--------------------------------------------------
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 140 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250505_145042.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 39 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145042.log:API call successful: 12935 tokens, $0.1294
INFO:cluster_analysis_20250505_145042.log:Success for cluster 21
Processing clusters:  17%|█▋        | 1/6 [00:54<04:34, 54.97s/it]INFO:cluster_analysis_20250505_145042.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 33 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145042.log:API call successful: 21134 tokens, $0.2113
INFO:cluster_analysis_20250505_145042.log:Success for cluster 37
Processing clusters:  33%|███▎      | 2/6 [01:47<03:34, 53.75s/it]INFO:cluster_analysis_20250505_145042.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 22 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145042.log:API call successful: 12219 tokens, $0.1222
INFO:cluster_analysis_20250505_145042.log:Success for cluster 121
Processing clusters:  50%|█████     | 3/6 [02:30<02:26, 48.72s/it]INFO:cluster_analysis_20250505_145042.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 19 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145042.log:API call successful: 8729 tokens, $0.0873
INFO:cluster_analysis_20250505_145042.log:Success for cluster 149
Processing clusters:  67%|██████▋   | 4/6 [03:04<01:25, 42.76s/it]INFO:cluster_analysis_20250505_145042.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 16 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145042.log:API call successful: 7314 tokens, $0.0731
INFO:cluster_analysis_20250505_145042.log:Success for cluster 167
INFO:cluster_analysis_20250505_145042.log:Saved progress for 5 clusters
Processing clusters:  83%|████████▎ | 5/6 [03:25<00:34, 34.92s/it]INFO:cluster_analysis_20250505_145042.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 11 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145042.log:API call successful: 10238 tokens, $0.1024
INFO:cluster_analysis_20250505_145042.log:Success for cluster 197
Processing clusters: 100%|██████████| 6/6 [03:56<00:00, 39.41s/it]
INFO:cluster_analysis_20250505_145042.log:Completed analysis for 6 clusters
INFO:cluster_analysis_20250505_145439.log:Processing 6 clusters with model gpt-4.1


✓ Successfully analyzed funk with o3-mini

--------------------------------------------------
Running gpt-4.1 on funk
--------------------------------------------------
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 140 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250505_145439.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 39 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145439.log:API call successful: 9065 tokens, $0.0907
INFO:cluster_analysis_20250505_145439.log:Success for cluster 21
Processing clusters:  17%|█▋        | 1/6 [00:22<01:54, 22.91s/it]INFO:cluster_analysis_20250505_145439.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 33 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145439.log:API call successful: 16739 tokens, $0.1674
INFO:cluster_analysis_20250505_145439.log:Success for cluster 37
Processing clusters:  33%|███▎      | 2/6 [00:52<01:48, 27.09s/it]INFO:cluster_analysis_20250505_145439.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 22 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145439.log:API call successful: 7687 tokens, $0.0769
INFO:cluster_analysis_20250505_145439.log:Success for cluster 121
Processing clusters:  50%|█████     | 3/6 [01:08<01:05, 21.79s/it]INFO:cluster_analysis_20250505_145439.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 19 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145439.log:API call successful: 5601 tokens, $0.0560
INFO:cluster_analysis_20250505_145439.log:Success for cluster 149
Processing clusters:  67%|██████▋   | 4/6 [01:21<00:36, 18.31s/it]INFO:cluster_analysis_20250505_145439.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 16 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145439.log:API call successful: 6055 tokens, $0.0606
INFO:cluster_analysis_20250505_145439.log:Success for cluster 167
INFO:cluster_analysis_20250505_145439.log:Saved progress for 5 clusters
Processing clusters:  83%|████████▎ | 5/6 [01:38<00:17, 17.71s/it]INFO:cluster_analysis_20250505_145439.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 11 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145439.log:API call successful: 7768 tokens, $0.0777
INFO:cluster_analysis_20250505_145439.log:Success for cluster 197
Processing clusters: 100%|██████████| 6/6 [01:51<00:00, 18.50s/it]
INFO:cluster_analysis_20250505_145439.log:Completed analysis for 6 clusters
INFO:cluster_analysis_20250505_145630.log:Processing 6 clusters with model gpt-4o


✓ Successfully analyzed funk with gpt-4.1

--------------------------------------------------
Running gpt-4o on funk
--------------------------------------------------
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 140 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250505_145630.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 39 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145630.log:API call successful: 8883 tokens, $0.0888
INFO:cluster_analysis_20250505_145630.log:Success for cluster 21
Processing clusters:  17%|█▋        | 1/6 [00:09<00:47,  9.54s/it]INFO:cluster_analysis_20250505_145630.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 33 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145630.log:API call successful: 16406 tokens, $0.1641
INFO:cluster_analysis_20250505_145630.log:Success for cluster 37
Processing clusters:  33%|███▎      | 2/6 [00:18<00:36,  9.10s/it]INFO:cluster_analysis_20250505_145630.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 22 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145630.log:API call successful: 7320 tokens, $0.0732
INFO:cluster_analysis_20250505_145630.log:Success for cluster 121
Processing clusters:  50%|█████     | 3/6 [00:24<00:23,  7.91s/it]INFO:cluster_analysis_20250505_145630.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 19 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145630.log:API call successful: 5465 tokens, $0.0547
INFO:cluster_analysis_20250505_145630.log:Success for cluster 149
Processing clusters:  67%|██████▋   | 4/6 [00:31<00:14,  7.27s/it]INFO:cluster_analysis_20250505_145630.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 16 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145630.log:API call successful: 5810 tokens, $0.0581
INFO:cluster_analysis_20250505_145630.log:Success for cluster 167
INFO:cluster_analysis_20250505_145630.log:Saved progress for 5 clusters
Processing clusters:  83%|████████▎ | 5/6 [00:35<00:06,  6.34s/it]INFO:cluster_analysis_20250505_145630.log:Accessing OpenAI API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 11 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145630.log:API call successful: 7690 tokens, $0.0769
INFO:cluster_analysis_20250505_145630.log:Success for cluster 197
Processing clusters: 100%|██████████| 6/6 [00:50<00:00,  8.38s/it]
INFO:cluster_analysis_20250505_145630.log:Completed analysis for 6 clusters
INFO:cluster_analysis_20250505_145720.log:Processing 6 clusters with model claude-3-7-sonnet-20250219


✓ Successfully analyzed funk with gpt-4o

--------------------------------------------------
Running claude-3-7-sonnet-20250219 on funk
--------------------------------------------------
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 140 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250505_145720.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 39 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145720.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250505_145720.log:Success for cluster 21
Processing clusters:  17%|█▋        | 1/6 [00:16<01:24, 16.95s/it]INFO:cluster_analysis_20250505_145720.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 33 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145720.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250505_145720.log:Success for cluster 37
Processing clusters:  33%|███▎      | 2/6 [00:47<01:38, 24.74s/it]INFO:cluster_analysis_20250505_145720.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 22 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145720.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250505_145720.log:Success for cluster 121
Processing clusters:  50%|█████     | 3/6 [01:12<01:15, 25.14s/it]INFO:cluster_analysis_20250505_145720.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 19 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145720.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250505_145720.log:Success for cluster 149
Processing clusters:  67%|██████▋   | 4/6 [01:32<00:46, 23.13s/it]INFO:cluster_analysis_20250505_145720.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 16 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145720.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250505_145720.log:Success for cluster 167
INFO:cluster_analysis_20250505_145720.log:Saved progress for 5 clusters
Processing clusters:  83%|████████▎ | 5/6 [01:47<00:19, 19.94s/it]INFO:cluster_analysis_20250505_145720.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 11 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145720.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250505_145720.log:Success for cluster 197
Processing clusters: 100%|██████████| 6/6 [02:04<00:00, 20.79s/it]
INFO:cluster_analysis_20250505_145720.log:Completed analysis for 6 clusters
INFO:cluster_analysis_20250505_145925.log:Processing 6 clusters with model claude-3-5-haiku-20241022


✓ Successfully analyzed funk with claude-3-7-sonnet-20250219

--------------------------------------------------
Running claude-3-5-haiku-20241022 on funk
--------------------------------------------------
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 140 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250505_145925.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 39 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145925.log:Anthropic API call successful: model=claude-3-5-haiku-20241022
INFO:cluster_analysis_20250505_145925.log:Success for cluster 21
Processing clusters:  17%|█▋        | 1/6 [00:14<01:12, 14.56s/it]INFO:cluster_analysis_20250505_145925.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 33 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145925.log:Anthropic API call successful: model=claude-3-5-haiku-20241022
INFO:cluster_analysis_20250505_145925.log:Success for cluster 37
Processing clusters:  33%|███▎      | 2/6 [00:25<00:50, 12.69s/it]INFO:cluster_analysis_20250505_145925.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 22 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145925.log:Anthropic API call successful: model=claude-3-5-haiku-20241022
INFO:cluster_analysis_20250505_145925.log:Success for cluster 121
Processing clusters:  50%|█████     | 3/6 [00:38<00:37, 12.54s/it]INFO:cluster_analysis_20250505_145925.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 19 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145925.log:Anthropic API call successful: model=claude-3-5-haiku-20241022
INFO:cluster_analysis_20250505_145925.log:Success for cluster 149
Processing clusters:  67%|██████▋   | 4/6 [00:49<00:24, 12.09s/it]INFO:cluster_analysis_20250505_145925.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 16 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145925.log:Anthropic API call successful: model=claude-3-5-haiku-20241022
INFO:cluster_analysis_20250505_145925.log:Success for cluster 167
INFO:cluster_analysis_20250505_145925.log:Saved progress for 5 clusters
Processing clusters:  83%|████████▎ | 5/6 [00:58<00:10, 10.75s/it]INFO:cluster_analysis_20250505_145925.log:Using Anthropic Claude API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 11 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_145925.log:Anthropic API call successful: model=claude-3-5-haiku-20241022
INFO:cluster_analysis_20250505_145925.log:Success for cluster 197
Processing clusters: 100%|██████████| 6/6 [01:10<00:00, 11.68s/it]
INFO:cluster_analysis_20250505_145925.log:Completed analysis for 6 clusters
INFO:cluster_analysis_20250505_150035.log:Processing 6 clusters with model gemini-2.5-pro-preview-03-25


✓ Successfully analyzed funk with claude-3-5-haiku-20241022

--------------------------------------------------
Running gemini-2.5-pro-preview-03-25 on funk
--------------------------------------------------
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 140 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250505_150035.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 39 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150035.log:Google Gemini API call successful: model=gemini-2.5-pro-preview-03-25
INFO:cluster_analysis_20250505_150035.log:Success for cluster 21
Processing clusters:  17%|█▋        | 1/6 [01:08<05:40, 68.12s/it]INFO:cluster_analysis_20250505_150035.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 33 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150035.log:Google Gemini API call successful: model=gemini-2.5-pro-preview-03-25
INFO:cluster_analysis_20250505_150035.log:Success for cluster 37
Processing clusters:  33%|███▎      | 2/6 [02:25<04:54, 73.59s/it]INFO:cluster_analysis_20250505_150035.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 22 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150035.log:Google Gemini API call successful: model=gemini-2.5-pro-preview-03-25
ERROR:root:Failed to parse JSON object from regex match: Expecting property name enclosed in double quotes: line 24 column 6 (char 416)
ERROR:root:Failed to parse JSON object using indices: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
INFO:cluster_analysis_20250505_150035.log:Success for cluster 121
Processing clusters:  50%|█████     | 3/6 [03:12<03:03, 61.25s/it]INFO:cluster_analysis_20250505_150035.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 19 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150035.log:Google Gemini API call successful: model=gemini-2.5-pro-preview-03-25
INFO:cluster_analysis_20250505_150035.log:Success for cluster 149
Processing clusters:  67%|██████▋   | 4/6 [04:08<01:58, 59.38s/it]INFO:cluster_analysis_20250505_150035.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 16 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150035.log:Google Gemini API call successful: model=gemini-2.5-pro-preview-03-25
INFO:cluster_analysis_20250505_150035.log:Success for cluster 167
INFO:cluster_analysis_20250505_150035.log:Saved progress for 5 clusters
Processing clusters:  83%|████████▎ | 5/6 [04:43<00:50, 50.40s/it]INFO:cluster_analysis_20250505_150035.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 11 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150035.log:Google Gemini API call successful: model=gemini-2.5-pro-preview-03-25
INFO:cluster_analysis_20250505_150035.log:Success for cluster 197
Processing clusters: 100%|██████████| 6/6 [05:31<00:00, 55.33s/it]
INFO:cluster_analysis_20250505_150035.log:Completed analysis for 6 clusters
INFO:cluster_analysis_20250505_150607.log:Processing 6 clusters with model gemini-2.5-flash-preview-04-17


✓ Successfully analyzed funk with gemini-2.5-pro-preview-03-25

--------------------------------------------------
Running gemini-2.5-flash-preview-04-17 on funk
--------------------------------------------------
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 140 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250505_150607.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 39 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150607.log:Google Gemini API call successful: model=gemini-2.5-flash-preview-04-17
INFO:cluster_analysis_20250505_150607.log:Success for cluster 21
Processing clusters:  17%|█▋        | 1/6 [00:33<02:48, 33.75s/it]INFO:cluster_analysis_20250505_150607.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 33 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150607.log:Google Gemini API call successful: model=gemini-2.5-flash-preview-04-17
ERROR: Error for cluster 37: None
ERROR:cluster_analysis_20250505_150607.log:Error for cluster 37: None
Processing clusters:  33%|███▎      | 2/6 [01:16<02:37, 39.30s/it]INFO:cluster_analysis_20250505_150607.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 22 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150607.log:Google Gemini API call successful: model=gemini-2.5-flash-preview-04-17
INFO:cluster_analysis_20250505_150607.log:Success for cluster 121
Processing clusters:  50%|█████     | 3/6 [01:49<01:48, 36.26s/it]INFO:cluster_analysis_20250505_150607.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 19 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150607.log:Google Gemini API call successful: model=gemini-2.5-flash-preview-04-17
INFO:cluster_analysis_20250505_150607.log:Success for cluster 149
Processing clusters:  67%|██████▋   | 4/6 [02:23<01:10, 35.47s/it]INFO:cluster_analysis_20250505_150607.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 16 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150607.log:Google Gemini API call successful: model=gemini-2.5-flash-preview-04-17
INFO:cluster_analysis_20250505_150607.log:Success for cluster 167
Processing clusters:  83%|████████▎ | 5/6 [02:39<00:28, 28.24s/it]INFO:cluster_analysis_20250505_150607.log:Using Google Gemini API


Attempting to load template from: top_targets.txt
Template found at: /lab/barcheese01/mdiberna/mozzarellm/mozzarellm/prompts/top_targets.txt
Successfully loaded template (3457 characters)
Appending output format instructions to template
Added 11 gene feature descriptions to prompt


INFO:cluster_analysis_20250505_150607.log:Google Gemini API call successful: model=gemini-2.5-flash-preview-04-17
INFO:cluster_analysis_20250505_150607.log:Success for cluster 197
INFO:cluster_analysis_20250505_150607.log:Saved progress for 5 clusters
Processing clusters: 100%|██████████| 6/6 [03:10<00:00, 31.71s/it]
INFO:cluster_analysis_20250505_150607.log:Completed analysis for 5 clusters


✓ Successfully analyzed funk with gemini-2.5-flash-preview-04-17


In [13]:
import os
import pandas as pd
from collections import defaultdict

# Datasets to analyze.
# load_data() ignores any row group of findings.csv whose `dataset` value is
# not listed here.
datasets = ['funk', 'wainberg', 'schaffer']
# Models to check.
# analyze_model_output() iterates this list and, for each entry, looks for
# results/<dataset>/<model>/<model>_clusters.csv under the project root
# (any '/' in the model name is replaced with '_' when building the path).
models_to_check = [
    "claude-3-7-sonnet-20250219",
    "claude-3-5-haiku-20241022",
    "gemini-2.5-flash-preview-04-17",
    "gemini-2.5-pro-preview-03-25",
    "gpt-4.1",
    "gpt-4o",
    "o3-mini",
    "o4-mini"
]

def parse_gene_list(gene_str):
    """Parse a delimited gene-list value into a list of gene symbols.

    Semicolons, commas and any whitespace are all treated as separators and
    may be mixed freely within one value (for example ``"A; B, C"``). Empty
    tokens produced by repeated or trailing separators are dropped.

    Args:
        gene_str: Raw cell value. ``None``, NaN and empty or blank strings
            yield an empty list; any other value is converted with ``str``
            before splitting.

    Returns:
        list[str]: The gene symbols in their original order.
    """
    if not gene_str or pd.isna(gene_str):
        return []

    # Normalise every supported separator to a space so that a single
    # whitespace split handles mixed delimiters. Previously only the first
    # separator type found was honoured, so "A; B, C" produced "B, C" as one
    # token and later membership checks for "B" or "C" failed.
    normalized = str(gene_str).replace(';', ' ').replace(',', ' ')

    # str.split() with no argument also strips and discards empty tokens.
    return normalized.split()

def load_data(project_root):
    """Read the benchmark findings CSV and index it by dataset and cluster.

    The file is expected at ``<project_root>/data/benchmarks/findings.csv``
    and is read for its ``dataset``, ``cluster``, ``gene`` and ``function``
    columns. Only datasets named in the module-level ``datasets`` list are
    kept.

    Args:
        project_root: Directory that contains the ``data/`` folder.

    Returns:
        A ``defaultdict`` mapping dataset -> cluster id (as ``str``) ->
        ``{'function': ..., 'genes': [...]}``. The function stored for a
        cluster is the one from the first row seen for that cluster. Returns
        ``None`` (after printing the error) if the CSV cannot be read.
    """
    csv_path = os.path.join(project_root, "data/benchmarks/findings.csv")
    try:
        findings_df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"Error loading findings file: {str(e)}")
        return None

    benchmark_data = defaultdict(dict)

    for dataset, group in findings_df.groupby('dataset'):
        if dataset in datasets:
            print(f"\nAnalyzing dataset: {dataset}")

            for _, finding in group.iterrows():
                cluster_key = str(finding['cluster'])
                gene_symbol = finding['gene']
                cluster_function = finding['function']

                # setdefault keeps the entry (and its function) from the first
                # row of a cluster; later rows only contribute genes.
                entry = benchmark_data[dataset].setdefault(
                    cluster_key,
                    {'function': cluster_function, 'genes': []},
                )
                entry['genes'].append(gene_symbol)

    return benchmark_data

def load_clusters_data(project_root, dataset, model):
    """Read one model's clusters CSV for one dataset.

    The file is looked up at
    ``<project_root>/results/<dataset>/<model>/<model>_clusters.csv``, with
    any ``/`` in the model name replaced by ``_``.

    Args:
        project_root: Directory that contains the ``results/`` folder.
        dataset: Dataset name used as the sub-directory under ``results/``.
        model: Model identifier.

    Returns:
        The file contents as a ``pandas.DataFrame``, or ``None`` (after
        printing a message) when the file is missing or cannot be read.
    """
    model_dir = model.replace('/', '_')
    relative_path = f"results/{dataset}/{model_dir}/{model_dir}_clusters.csv"
    csv_path = os.path.join(project_root, relative_path)

    if os.path.exists(csv_path):
        try:
            loaded = pd.read_csv(csv_path)
        except Exception as e:
            print(f"Error loading clusters for {model} on {dataset}: {str(e)}")
            return None
        else:
            return loaded

    print(f"No clusters file found for {model} on {dataset}")
    return None

def analyze_model_output(project_root, benchmark_data):
    """Score every model in ``models_to_check`` against each benchmark dataset.

    For each dataset/model pair the model's clusters CSV is read from
    ``<project_root>/results/<dataset>/<model>/<model>_clusters.csv`` (``/``
    in the model name replaced by ``_``). A missing file, a read error or a
    missing ``cluster_id`` column is recorded as a failed result built by
    ``create_empty_result``; otherwise the result comes from
    ``compare_model_to_benchmark``.

    Args:
        project_root: Directory that contains the ``results/`` folder.
        benchmark_data: Mapping dataset -> cluster id -> cluster info dict
            holding at least a ``'genes'`` list.

    Returns:
        Nested ``defaultdict``: ``results[dataset][model]`` is the result
        dict for that pair.
    """
    results = defaultdict(lambda: defaultdict(dict))

    for dataset, clusters in benchmark_data.items():
        benchmark_clusters = list(clusters)
        # Unique benchmark genes across all clusters of this dataset.
        benchmark_genes = list(
            {gene for info in clusters.values() for gene in info['genes']}
        )

        def failed(reason):
            # Placeholder result that carries the benchmark totals and the reason.
            return create_empty_result(benchmark_clusters, benchmark_genes, reason)

        for model in models_to_check:
            print(f"  Checking model: {model}")

            model_dir = model.replace('/', '_')
            csv_path = os.path.join(
                project_root,
                f"results/{dataset}/{model_dir}/{model_dir}_clusters.csv"
            )

            if not os.path.exists(csv_path):
                outcome = failed('Failed - No clusters.csv found')
            else:
                try:
                    clusters_df = pd.read_csv(csv_path)
                except Exception as e:
                    outcome = failed(f'Failed - Error loading file: {str(e)}')
                else:
                    if 'cluster_id' in clusters_df.columns:
                        outcome = compare_model_to_benchmark(
                            clusters_df, benchmark_clusters, benchmark_genes, clusters
                        )
                    else:
                        outcome = failed('Failed - Missing cluster_id column')

            results[dataset][model] = outcome

    return results

def create_empty_result(benchmark_clusters, benchmark_genes, status='Failed'):
    """Build the placeholder result used when a model could not be scored.

    Args:
        benchmark_clusters: Benchmark cluster ids; only the count is used.
        benchmark_genes: Benchmark genes; only the count is used.
        status: Text stored under ``'status'`` describing the failure.

    Returns:
        dict with the benchmark totals filled in, every "found" metric set to
        zero, and empty ``cluster_matches`` / ``gene_categories`` mappings.
    """
    result = {'status': status, 'cluster_matches': {}, 'gene_categories': {}}

    # Cluster-level metrics: the total is known, nothing was matched.
    result['total_clusters'] = len(benchmark_clusters)
    result.update(found_clusters=0, found_cluster_percent=0.0)

    # Gene-level metrics, likewise zeroed.
    result['total_genes'] = len(benchmark_genes)
    result.update(
        found_genes=0,
        found_gene_percent=0.0,
        novel_count=0,
        uncharacterized_count=0,
    )
    return result

def compare_model_to_benchmark(clusters_df, benchmark_clusters, benchmark_genes, benchmark_data):
    """Compare one model's cluster output to the benchmark and compute metrics.

    Each benchmark cluster is matched to model rows whose ``cluster_id``
    (compared as a string) equals the benchmark cluster id. For a matched
    cluster the function text is checked with ``check_function_match`` and
    every benchmark gene is categorised with ``find_gene_category``. For an
    unmatched cluster, ``find_function_in_other_clusters`` is used to look
    for the function elsewhere in the model output. A short summary is
    printed before returning.

    Args:
        clusters_df: Model output; must contain a ``cluster_id`` column.
            The frame is not modified.
        benchmark_clusters: Benchmark cluster ids; only the count is used.
        benchmark_genes: Unique benchmark genes; only the count is used.
        benchmark_data: Mapping cluster id (``str``) ->
            ``{'function': ..., 'genes': [...]}``.

    Returns:
        dict with ``status`` (``'Success'``), cluster and gene totals, found
        counts and percentages, ``cluster_matches`` (cluster -> match label),
        ``gene_categories`` (gene -> category), ``novel_count`` and
        ``uncharacterized_count``.
    """
    # Categories meaning "gene is absent from the matched cluster".
    # find_gene_category reports absence as 'missing_in_cluster'; the previous
    # check only excluded 'not_found', which is never returned, so every
    # benchmark gene of a matched cluster was counted as found.
    absent_categories = {'not_found', 'missing_in_cluster'}

    # Columns that may hold gene lists ('genes', 'novel_role_genes', ...).
    gene_columns = [col for col in clusters_df.columns if 'gene' in str(col).lower()]

    # String view of the ids for comparison with the benchmark's string keys.
    # Kept as a separate Series so the caller's DataFrame is left untouched.
    cluster_ids = clusters_df['cluster_id'].astype(str)

    cluster_matches = {}
    found_clusters = []
    gene_categories = {}
    found_genes = set()
    novel_count = 0
    uncharacterized_count = 0

    for benchmark_cluster, cluster_info in benchmark_data.items():
        raw_function = cluster_info['function']
        # A missing function (NaN/None) becomes '' so it can never match.
        benchmark_function = '' if pd.isna(raw_function) else str(raw_function).lower()
        benchmark_genes_for_cluster = cluster_info['genes']

        cluster_rows = clusters_df[cluster_ids == benchmark_cluster]

        if cluster_rows.empty:
            # No row with this id: see whether the function shows up elsewhere.
            function_cluster = find_function_in_other_clusters(clusters_df, benchmark_function)
            if function_cluster:
                cluster_matches[benchmark_cluster] = f"similar_function_in_{function_cluster}"
            else:
                cluster_matches[benchmark_cluster] = 'not_found'
            continue

        cluster_matches[benchmark_cluster] = 'found'
        found_clusters.append(benchmark_cluster)

        if check_function_match(cluster_rows, benchmark_function):
            cluster_matches[benchmark_cluster] += '_with_matching_function'

        for gene in benchmark_genes_for_cluster:
            gene_category = find_gene_category(cluster_rows, gene, gene_columns)
            gene_categories[gene] = gene_category

            if gene_category in absent_categories:
                continue

            found_genes.add(gene)
            if gene_category == 'novel_role':
                novel_count += 1
            elif gene_category == 'uncharacterized':
                uncharacterized_count += 1

    total_clusters = len(benchmark_clusters)
    found_cluster_count = len(found_clusters)
    found_cluster_percent = (found_cluster_count / total_clusters) * 100 if total_clusters > 0 else 0

    total_genes = len(benchmark_genes)
    found_gene_count = len(found_genes)
    found_gene_percent = (found_gene_count / total_genes) * 100 if total_genes > 0 else 0

    print(f"    Found {found_cluster_count}/{total_clusters} clusters ({found_cluster_percent:.1f}%)")
    print(f"    Found {found_gene_count}/{total_genes} genes ({found_gene_percent:.1f}%)")
    print(f"    Novel genes: {novel_count}, Uncharacterized genes: {uncharacterized_count}")

    return {
        'status': 'Success',
        'total_clusters': total_clusters,
        'found_clusters': found_cluster_count,
        'found_cluster_percent': found_cluster_percent,
        'cluster_matches': cluster_matches,
        'total_genes': total_genes,
        'found_genes': found_gene_count,
        'found_gene_percent': found_gene_percent,
        'novel_count': novel_count,
        'uncharacterized_count': uncharacterized_count,
        'gene_categories': gene_categories
    }

def check_function_match(cluster_rows, benchmark_function):
    """Report whether any function-like column mentions a benchmark term.

    The benchmark function is split on whitespace and each resulting term is
    tested as a substring of the lower-cased cell text, so one shared word
    is enough for a match.

    Args:
        cluster_rows: Model rows for one cluster.
        benchmark_function: Benchmark function text (the caller passes it
            already lower-cased).

    Returns:
        ``True`` on the first cell containing any term, otherwise ``False``.
    """
    candidate_columns = ('cluster_biological_process', 'function', 'biological_process', 'description')

    for column in candidate_columns:
        if column not in cluster_rows.columns:
            continue
        for _, record in cluster_rows.iterrows():
            cell = record[column]
            if pd.isna(cell):
                continue
            cell_text = str(cell).lower()
            for term in benchmark_function.split():
                if term in cell_text:
                    return True

    return False

def find_function_in_other_clusters(clusters_df, benchmark_function):
    """Locate the first model cluster whose function text mentions a benchmark term.

    Columns are searched in a fixed priority order and rows in frame order.
    A cell matches when any whitespace-separated term of
    ``benchmark_function`` is a substring of the lower-cased cell text.

    Args:
        clusters_df: Full model output; must contain ``cluster_id``.
        benchmark_function: Benchmark function text (the caller passes it
            already lower-cased).

    Returns:
        The ``cluster_id`` value of the first matching row, or ``None``.
    """
    candidate_columns = ('cluster_biological_process', 'function', 'biological_process', 'description')

    for column in candidate_columns:
        if column not in clusters_df.columns:
            continue
        for _, record in clusters_df.iterrows():
            cell = record[column]
            if pd.isna(cell):
                continue
            cell_text = str(cell).lower()
            for term in benchmark_function.split():
                if term in cell_text:
                    return record['cluster_id']

    return None

def find_gene_category(cluster_rows, gene, gene_columns):
    """Categorise a benchmark gene within one model cluster.

    Rows are scanned in order. The first row that lists the gene in any of
    ``gene_columns`` decides the category, checked in priority order:
    ``novel_role_genes``, ``uncharacterized_genes``, ``established_genes``.

    Args:
        cluster_rows: Model rows for the cluster being checked.
        gene: Gene symbol to look for (exact match against parsed tokens).
        gene_columns: Names of columns that may hold gene lists.

    Returns:
        ``'novel_role'``, ``'uncharacterized'`` or ``'established'`` when the
        gene is in the corresponding column; ``'found'`` when it is listed
        but in none of those; ``'missing_in_cluster'`` when no row lists it.
    """
    category_columns = (
        ('novel_role_genes', 'novel_role'),
        ('uncharacterized_genes', 'uncharacterized'),
        ('established_genes', 'established'),
    )

    for _, record in cluster_rows.iterrows():
        listed = any(
            column in record
            and pd.notna(record[column])
            and gene in parse_gene_list(record[column])
            for column in gene_columns
        )
        if not listed:
            continue

        for column, label in category_columns:
            if column in record and gene in parse_gene_list(record[column]):
                return label

        # Present in some gene column, but not in a categorised one.
        return 'found'

    return 'missing_in_cluster'

def generate_summary_table(results):
    """Build the per-dataset, per-model summary table.

    Every dataset/model pair produces exactly one row, whether its analysis
    succeeded or not, and every row has the same columns. Failed analyses
    keep their status text and report zero for all found and novel metrics.
    (Failed rows previously used ``'Novel %'`` / ``'Unchar %'`` columns that
    successful rows lacked, which left NaN cells in the combined table.)

    Args:
        results: Mapping dataset -> model -> result dict, as produced by
            ``analyze_model_output``. Each result must provide ``status``,
            ``total_clusters`` and ``total_genes``; successful results must
            also provide the found counts and percentages, ``novel_count``
            and ``uncharacterized_count``.

    Returns:
        ``pandas.DataFrame`` with columns Dataset, Model, Status,
        Clusters Found, Genes Found, Novel Role Genes, Uncharacterized Genes,
        Novel/Unchar Count and Novel/Unchar %, or ``None`` when ``results``
        holds no dataset/model pairs.
    """
    rows = []

    for dataset, models in results.items():
        for model, data in models.items():
            total_clusters = data['total_clusters']
            total_genes = data['total_genes']

            if data['status'] == 'Success':
                found_clusters = data['found_clusters']
                found_cluster_percent = data['found_cluster_percent']
                found_genes = data['found_genes']
                found_gene_percent = data['found_gene_percent']
                novel_count = data['novel_count']
                uncharacterized_count = data['uncharacterized_count']
            else:
                # Failed analyses are still reported, with every metric zeroed.
                found_clusters = found_genes = 0
                found_cluster_percent = found_gene_percent = 0.0
                novel_count = uncharacterized_count = 0

            # Combined novel + uncharacterized genes as a share of all benchmark genes.
            novel_unchar_count = novel_count + uncharacterized_count
            novel_unchar_percent = (novel_unchar_count / total_genes) * 100 if total_genes > 0 else 0

            rows.append({
                'Dataset': dataset,
                'Model': model,
                'Status': data['status'],
                'Clusters Found': f"{found_clusters}/{total_clusters} ({found_cluster_percent:.1f}%)",
                'Genes Found': f"{found_genes}/{total_genes} ({found_gene_percent:.1f}%)",
                'Novel Role Genes': novel_count,
                'Uncharacterized Genes': uncharacterized_count,
                'Novel/Unchar Count': novel_unchar_count,
                'Novel/Unchar %': f"{novel_unchar_percent:.1f}%"
            })

    # Convert to DataFrame
    if rows:
        return pd.DataFrame(rows)
    return None

def generate_cluster_table(results, project_root):
    """Tabulate each model's biological process and follow-up per cluster.

    One row is produced per dataset/cluster pair, where the clusters are
    those present in the ``cluster_matches`` of at least one successful
    model. Each model contributes a ``<model>_biological_process`` and a
    ``<model>_follow_up`` column, filled from ``get_cluster_info`` for
    successful analyses and with ``'N/A'`` otherwise.

    Args:
        results: Mapping dataset -> model -> result dict.
        project_root: Passed through to ``get_cluster_info``.

    Returns:
        ``pandas.DataFrame`` with clusters sorted within each dataset and
        model columns in sorted model order, or ``None`` when there are no
        rows.
    """
    clusters_by_dataset = {}
    model_names = set()

    for dataset, models in results.items():
        seen_clusters = set()
        for model, data in models.items():
            model_names.add(model)
            if data['status'] == 'Success':
                seen_clusters.update(data['cluster_matches'].keys())
        clusters_by_dataset[dataset] = seen_clusters

    # Fixed model order so the columns are stable between runs.
    ordered_models = sorted(model_names)

    table_rows = []
    for dataset, clusters in clusters_by_dataset.items():
        dataset_results = results[dataset]
        for cluster in sorted(clusters):
            record = {'Dataset': dataset, 'Cluster': cluster}

            for model in ordered_models:
                succeeded = (
                    model in dataset_results
                    and dataset_results[model]['status'] == 'Success'
                )
                if succeeded:
                    process, followup = get_cluster_info(project_root, dataset, model, cluster)
                else:
                    process, followup = 'N/A', 'N/A'
                record[f"{model}_biological_process"] = process
                record[f"{model}_follow_up"] = followup

            table_rows.append(record)

    return pd.DataFrame(table_rows) if table_rows else None

def get_cluster_info(project_root, dataset, model, cluster):
    """Look up a cluster's biological process and follow-up suggestion.

    The model's clusters file is loaded through ``load_clusters_data`` and
    the first row whose ``cluster_id`` (compared as a string) equals
    ``cluster`` is used.

    Args:
        project_root: Passed through to ``load_clusters_data``.
        dataset: Dataset name.
        model: Model identifier.
        cluster: Cluster id as a string.

    Returns:
        ``(process, followup)``. ``process`` is the first non-null value
        among ``cluster_biological_process``, ``biological_process``,
        ``function`` and ``description``, else ``"No process found"``.
        ``followup`` is the ``follow_up_suggestion`` value, else
        ``"No follow-up found"``. Both are ``"File not found"`` when the
        file could not be loaded and ``"Cluster not found"`` when no row
        matches.
    """
    clusters_df = load_clusters_data(project_root, dataset, model)
    if clusters_df is None:
        return "File not found", "File not found"

    # Compare ids as strings, matching how benchmark cluster keys are stored.
    clusters_df['cluster_id_str'] = clusters_df['cluster_id'].astype(str)
    matching = clusters_df[clusters_df['cluster_id_str'] == cluster]
    if matching.empty:
        return "Cluster not found", "Cluster not found"

    first_match = matching.iloc[0]
    available = matching.columns

    process_columns = ('cluster_biological_process', 'biological_process', 'function', 'description')
    process = next(
        (
            first_match[name]
            for name in process_columns
            if name in available and pd.notna(first_match[name])
        ),
        "No process found",
    )

    has_followup = (
        'follow_up_suggestion' in available
        and pd.notna(first_match['follow_up_suggestion'])
    )
    followup = first_match['follow_up_suggestion'] if has_followup else "No follow-up found"

    return process, followup

def generate_streamlined_gene_table(results):
    """Build a per-gene table with one category column per model.

    Args:
        results: Mapping ``dataset -> model -> info``. Every ``info`` has a
            ``'status'`` entry; when it equals ``'Success'`` the info also
            carries ``'gene_categories'``, a mapping ``gene -> category``.

    Returns:
        A DataFrame with columns ``Dataset``, ``Gene`` and then one column
        per model name (sorted). There is one row for every gene that any
        successful model reported for that dataset, with genes sorted within
        each dataset. A cell holds the model's category for the gene,
        ``'missing_from_all_clusters'`` when a successful model did not
        report the gene, or ``'N/A'`` when the model has no successful run
        for that dataset. Returns ``None`` when there are no rows at all.
    """
    # Pass 1: collect every model name and, per dataset, the category maps
    # of the successful runs together with the union of their genes.
    model_names = set()
    category_maps_by_dataset = {}
    gene_pool_by_dataset = {}
    for dataset_name, per_model in results.items():
        category_maps = {}
        gene_pool = set()
        for model_name, outcome in per_model.items():
            model_names.add(model_name)
            if outcome['status'] == 'Success':
                category_maps[model_name] = outcome['gene_categories']
                gene_pool.update(outcome['gene_categories'].keys())
        category_maps_by_dataset[dataset_name] = category_maps
        gene_pool_by_dataset[dataset_name] = gene_pool

    # Fixed, alphabetical model order keeps the column layout stable.
    column_order = sorted(model_names)

    # Pass 2: emit one record per (dataset, gene) pair.
    records = []
    for dataset_name, gene_pool in gene_pool_by_dataset.items():
        category_maps = category_maps_by_dataset[dataset_name]
        for gene in sorted(gene_pool):
            record = {'Dataset': dataset_name, 'Gene': gene}
            for model_name in column_order:
                if model_name in category_maps:
                    # Successful run: use its category, or flag the gene as absent.
                    record[model_name] = category_maps[model_name].get(
                        gene, 'missing_from_all_clusters'
                    )
                else:
                    # Model missing for this dataset, or its analysis failed.
                    record[model_name] = 'N/A'
            records.append(record)

    return pd.DataFrame(records) if records else None

def generate_reports(project_root, results):
    """Print every summary report and save each one as a CSV file.

    Three tables are built in turn: the summary table, the gene
    categorization table and the cluster information table. Each table that
    is not ``None`` is printed in full under a banner and written, without
    its index, to a CSV file directly inside ``project_root``.

    Args:
        project_root: Directory that receives the CSV files; also passed to
            ``generate_cluster_table``.
        results: Analysis results handed to the three table builders.
    """
    # One entry per report: (banner, "saved" message prefix, CSV file name,
    # zero-argument builder). Builders are invoked lazily inside the loop so
    # each table is built only after the previous one has been emitted.
    report_specs = (
        (
            "\n=== SUMMARY TABLE ===",
            "\nSummary saved to ",
            "benchmark_summary.csv",
            lambda: generate_summary_table(results),
        ),
        (
            "\n=== GENE CATEGORIZATION TABLE ===",
            "\nGene categorization saved to ",
            "gene_categorization.csv",
            lambda: generate_streamlined_gene_table(results),
        ),
        (
            "\n=== CLUSTER INFORMATION TABLE ===",
            "\nCluster information saved to ",
            "cluster_information.csv",
            lambda: generate_cluster_table(results, project_root),
        ),
    )

    for banner, saved_prefix, file_name, build_table in report_specs:
        table = build_table()
        if table is None:
            # Nothing to report for this table; move on to the next one.
            continue

        print(banner)
        print(table.to_string())

        destination = os.path.join(project_root, file_name)
        table.to_csv(destination, index=False)
        print(f"{saved_prefix}{destination}")

def main():
    """Run the benchmark analysis from loading through report generation.

    Loads the benchmark data from the module-level ``project_root``. When
    the loader returns something falsy, an error message is printed and
    nothing else runs. Otherwise the model outputs are analyzed and the
    reports are generated from those results.
    """
    print("Analyzing gene cluster benchmarks...\n")

    loaded_benchmarks = load_data(project_root)
    if loaded_benchmarks:
        # Analyze first, then turn the analysis into printed and saved reports.
        analysis = analyze_model_output(project_root, loaded_benchmarks)
        generate_reports(project_root, analysis)
        print("\nAnalysis complete!")
    else:
        # Without benchmark data there is nothing to compare the models against.
        print("Failed to load benchmark data. Check if findings.csv exists and is valid.")

# Entry point: run the full benchmark analysis when this code is executed
# directly (as a script or as a notebook cell). The guard keeps a plain
# import of this code from triggering main().
if __name__ == "__main__":
    main()

Analyzing gene cluster benchmarks...


Analyzing dataset: funk

Analyzing dataset: schaffer

Analyzing dataset: wainberg
  Checking model: claude-3-7-sonnet-20250219
    Found 6/6 clusters (100.0%)
    Found 7/7 genes (100.0%)
    Novel genes: 3, Uncharacterized genes: 1
  Checking model: claude-3-5-haiku-20241022
    Found 6/6 clusters (100.0%)
    Found 7/7 genes (100.0%)
    Novel genes: 1, Uncharacterized genes: 1
  Checking model: gemini-2.5-flash-preview-04-17
    Found 5/6 clusters (83.3%)
    Found 6/7 genes (85.7%)
    Novel genes: 3, Uncharacterized genes: 0
  Checking model: gemini-2.5-pro-preview-03-25
    Found 6/6 clusters (100.0%)
    Found 7/7 genes (100.0%)
    Novel genes: 5, Uncharacterized genes: 0
  Checking model: gpt-4.1
    Found 6/6 clusters (100.0%)
    Found 7/7 genes (100.0%)
    Novel genes: 2, Uncharacterized genes: 0
  Checking model: gpt-4o
    Found 6/6 clusters (100.0%)
    Found 7/7 genes (100.0%)
    Novel genes: 2, Uncharacterized genes: 0
  Checkin