In [2]:
# Import necessary libraries
import os
from dotenv import load_dotenv

# Import from the refactored utilities
from utils.cluster_analyzer import analyze_gene_clusters

# Load environment variables (for API keys)
load_dotenv()

# Run configuration: root folders and the name used to prefix output files
DATA_DIR = "data"
RESULTS_DIR = "results/luke_benchmark_1"
PROJECT_NAME = "sample_gene_sets"

# Input tables, all located under DATA_DIR
INPUT_FILE = "/".join((DATA_DIR, "HeLa_essentials", "cell_2022_clustering.csv"))
PROCESSED_FILE = "/".join((DATA_DIR, "luke_interphase.csv"))
GENE_FEATURES = "/".join((DATA_DIR, "HeLa_essentials", "essentials_uniprot.csv"))

# Ensure the output folder exists (no error if it is already there)
os.makedirs(RESULTS_DIR, exist_ok=True)

In [None]:
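# # Step 1 (currently disabled): Reshape clusters using the refactored function
# # NOTE: `reshape_to_clusters` is not imported in this notebook; import it before re-enabling this cell.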
# print("Preprocessing data: Converting raw data to cluster format...")
# cluster_df = reshape_to_clusters(
#     input_file=INPUT_FILE,
#     output_file=PROCESSED_FILE,
#     sep=",",
#     gene_col="gene_symbol_0",
#     cluster_col="cluster",
#     gene_sep=";"
# )

# # Display a sample of the reshaped data
# print("Sample of reshaped data:")
# display(cluster_df.head())

In [3]:
# Arguments that are identical for every model run below; only the output
# path, the config file and the model name change between providers.
shared_run_kwargs = dict(
    input_file=PROCESSED_FILE,
    custom_prompt_path="prompts/top_targets.txt",
    gene_features_path=GENE_FEATURES,
    screen_info_path="prompts/HeLa_interphase_screen_info.txt",
    gene_column="genes",
    gene_sep=";",
    batch_size=1,
    cluster_id_column="cluster_id",
)

# Step 2: OpenAI GPT-4o run
print("Running analysis with OpenAI GPT-4o...")
openai_results = analyze_gene_clusters(
    output_file=f"{RESULTS_DIR}/{PROJECT_NAME}_openai",
    config_path="config_openai.json",
    model_name="gpt-4o",
    **shared_run_kwargs,
)

# Step 3: Anthropic Claude-3-7-Sonnet run
print("Running analysis with Anthropic Claude-3-7-Sonnet...")
claude_results = analyze_gene_clusters(
    output_file=f"{RESULTS_DIR}/{PROJECT_NAME}_anthropic",
    config_path="config_anthropic.json",
    model_name="claude-3-7-sonnet-20250219",
    **shared_run_kwargs,
)

print(f"Analysis complete. Results saved to {RESULTS_DIR}/")

Running analysis with OpenAI GPT-4o...
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 1095 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]

Added 6 gene feature descriptions to prompt


Processing clusters:  17%|█▋        | 1/6 [00:07<00:39,  7.95s/it]

Added 2 gene feature descriptions to prompt


Processing clusters:  33%|███▎      | 2/6 [00:15<00:30,  7.64s/it]

Added 5 gene feature descriptions to prompt


Processing clusters:  50%|█████     | 3/6 [00:24<00:24,  8.13s/it]

Added 3 gene feature descriptions to prompt


Processing clusters:  67%|██████▋   | 4/6 [00:31<00:15,  7.89s/it]

Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_161139.log:Saved progress for 5 clusters
Processing clusters:  83%|████████▎ | 5/6 [00:37<00:07,  7.09s/it]INFO:cluster_analysis_20250501_161139.log:Accessing OpenAI API


Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_161139.log:API call successful: 2476 tokens, $0.0248
INFO:cluster_analysis_20250501_161139.log:Success for cluster 37
Processing clusters: 100%|██████████| 6/6 [00:58<00:00,  9.82s/it]
INFO:cluster_analysis_20250501_161139.log:Completed analysis for 6 clusters
INFO:cluster_analysis_20250501_161238.log:Processing 6 clusters with model claude-3-7-sonnet-20250219


Running analysis with Anthropic Claude-3-7-Sonnet...
Loaded data with 6 rows and columns: ['cluster_id', 'genes']
Loaded features for 1095 genes
Loaded screen information: 2129 characters


Processing clusters:   0%|          | 0/6 [00:00<?, ?it/s]INFO:cluster_analysis_20250501_161238.log:Using Anthropic Claude API


Added 6 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_161238.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_161238.log:Success for cluster 149
Processing clusters:  17%|█▋        | 1/6 [00:24<02:02, 24.58s/it]INFO:cluster_analysis_20250501_161238.log:Using Anthropic Claude API


Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_161238.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_161238.log:Success for cluster 121
Processing clusters:  33%|███▎      | 2/6 [00:41<01:20, 20.14s/it]INFO:cluster_analysis_20250501_161238.log:Using Anthropic Claude API


Added 5 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_161238.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_161238.log:Success for cluster 21
Processing clusters:  50%|█████     | 3/6 [01:02<01:01, 20.35s/it]INFO:cluster_analysis_20250501_161238.log:Using Anthropic Claude API


Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_161238.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_161238.log:Success for cluster 167
Processing clusters:  67%|██████▋   | 4/6 [01:13<00:33, 16.62s/it]INFO:cluster_analysis_20250501_161238.log:Using Anthropic Claude API


Added 2 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_161238.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_161238.log:Success for cluster 197
INFO:cluster_analysis_20250501_161238.log:Saved progress for 5 clusters
Processing clusters:  83%|████████▎ | 5/6 [01:31<00:17, 17.39s/it]INFO:cluster_analysis_20250501_161238.log:Using Anthropic Claude API


Added 3 gene feature descriptions to prompt


INFO:cluster_analysis_20250501_161238.log:Anthropic API call successful: model=claude-3-7-sonnet-20250219
INFO:cluster_analysis_20250501_161238.log:Success for cluster 37
Processing clusters: 100%|██████████| 6/6 [01:44<00:00, 17.47s/it]
INFO:cluster_analysis_20250501_161238.log:Completed analysis for 6 clusters


Analysis complete. Results saved to results/luke_benchmark_1/
