In [None]:
# Import necessary libraries
import os
from dotenv import load_dotenv

# Import from the refactored utilities
from utils.cluster_analyzer import analyze_gene_clusters

# Load variables from a local .env file into the process environment before
# anything else reads them (presumably the API keys used by the OpenAI and
# Anthropic runs further down -- confirm against the config files).
load_dotenv()

# --- Notebook configuration --------------------------------------------------
# Everything a reader might want to tune lives here. All paths are relative and
# always use forward slashes, so they are composed with "/".join rather than
# os.path.join to keep the resulting strings the same on every platform.

# Prefix used when naming the result files written by the analysis runs.
PROJECT_NAME = "sample_gene_sets"

# Root folder for input data and the folder that receives this run's results.
DATA_DIR = "data"
RESULTS_DIR = "results/luke_benchmark_1"

# Raw clustering table (only referenced by the commented-out reshape step).
INPUT_FILE = "/".join((DATA_DIR, "HeLa_essentials", "cell_2022_clustering.csv"))
# Reshaped cluster table that the analysis runs take as their input file.
PROCESSED_FILE = "/".join((DATA_DIR, "luke_interphase.csv"))
# Per-gene feature table passed to the analysis as gene_features_path.
GENE_FEATURES = "/".join((DATA_DIR, "HeLa_essentials", "essentials_uniprot.csv"))

# Make sure the results folder exists; exist_ok keeps re-runs of this cell safe.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [None]:
# NOTE: Step 1 is currently disabled; the code below is kept for reference only.
# As written it appears to read INPUT_FILE and write PROCESSED_FILE, which is the
# file the analysis step takes as input -- so while this cell stays commented
# out, PROCESSED_FILE must already exist on disk.
# To re-enable it, `reshape_to_clusters` has to be imported first (it is not
# among the imports in the setup cell -- TODO confirm which module provides it).
#
# # Step 1: Reshape clusters - now using the refactored function
# print("Preprocessing data: Converting raw data to cluster format...")
# cluster_df = reshape_to_clusters(
#     input_file=INPUT_FILE,
#     output_file=PROCESSED_FILE,
#     sep=",",
#     gene_col="gene_symbol_0",
#     cluster_col="cluster",
#     gene_sep=";"
# )

# # Display a sample of the reshaped data
# print("Sample of reshaped data:")
# display(cluster_df.head())

In [None]:
# Steps 2-3: run the same gene-cluster analysis once per LLM provider.
#
# PROCESSED_FILE is only produced by the (currently commented-out) Step 1 cell,
# so check for it up front and fail with an actionable message instead of
# letting the first analysis run stumble over a missing input.
if not os.path.isfile(PROCESSED_FILE):
    raise FileNotFoundError(
        f"Processed cluster file not found: {PROCESSED_FILE}. "
        "Run the Step 1 reshape cell (or put the file in place) before "
        "running the analysis."
    )

# Keyword arguments shared by both runs. Keeping them in one place guarantees
# the two providers are given exactly the same inputs; only the config file,
# model name and output prefix differ between the calls below.
shared_analysis_args = dict(
    input_file=PROCESSED_FILE,
    custom_prompt_path="prompts/top_targets.txt",
    gene_features_path=GENE_FEATURES,
    screen_info_path="prompts/HeLa_interphase_screen_info.txt",
    gene_column="genes",
    gene_sep=";",
    batch_size=1,
)

# Step 2: Run analysis with OpenAI GPT-4o
print("Running analysis with OpenAI GPT-4o...")
openai_results = analyze_gene_clusters(
    output_file=f"{RESULTS_DIR}/{PROJECT_NAME}_openai",
    config_path="config_openai.json",
    model_name="gpt-4o",
    **shared_analysis_args,
)

# Step 3: Run analysis with Anthropic Claude-3-7-Sonnet
print("Running analysis with Anthropic Claude-3-7-Sonnet...")
claude_results = analyze_gene_clusters(
    output_file=f"{RESULTS_DIR}/{PROJECT_NAME}_anthropic",
    config_path="config_anthropic.json",
    model_name="claude-3-7-sonnet-20250219",
    **shared_analysis_args,
)

print(f"Analysis complete. Results saved to {RESULTS_DIR}/")

# TODO: load and visualize the saved results. No such code exists in this cell;
# the original placeholder only noted "(Same as in previous examples)".