In [None]:
from pathlib import Path

import gcsfs
import pandas as pd
import pyarrow.dataset as ds

# Filesystem handle used for every read from Google Cloud Storage
fs = gcsfs.GCSFileSystem()

# Root of the dataset snapshot in the bucket; paths below are appended to it
gcp_base_path = "gs://arc-ctc-tahoe100/2025-02-25/"


: 

In [None]:
# Load the main metadata table from GCS into a pandas DataFrame.
obs_path = f"{gcp_base_path}metadata/obs_metadata.parquet"

# Row cap for quick test runs. None reads the whole table, which then has to
# fit in RAM; set an integer (e.g. 100_000) to read only the first rows.
OBS_ROW_LIMIT = None

obs_dataset = ds.dataset(obs_path, filesystem=fs, format="parquet")
# The Arrow table is converted in the same expression so that it is not kept
# alive next to the DataFrame.
if OBS_ROW_LIMIT is None:
    obs_df = obs_dataset.to_table().to_pandas()
else:
    obs_df = obs_dataset.head(OBS_ROW_LIMIT).to_pandas()

# Preview
obs_df.head()


In [None]:
# Expected columns: 'drug', 'cell_line', 'cell_name'
# 'drug' is assumed to hold the drug name itself (e.g. 'Bestatin (hydrochloride)').
# Some versions store it as a list of tuples instead; in that case the first
# element of the first tuple is taken as the name.


def _is_sequence(value):
    """Return True for list-like containers that support positional indexing.

    List columns read through pyarrow reach pandas as numpy arrays rather than
    Python lists, so checking for `list` alone is not enough. Strings are not
    list-like; dicts and sets are excluded because `value[0]` is not
    positional for them.
    """
    return pd.api.types.is_list_like(value) and not isinstance(value, (dict, set, frozenset))


def _first_drug_name(value):
    """Extract the drug name from a single 'drug' cell.

    Non-sequence cells (plain names, None, NaN) are returned unchanged. For a
    sequence the first entry is used: its first element when that entry is
    itself a sequence, otherwise the entry itself. Empty sequences give None.
    """
    if not _is_sequence(value):
        return value
    # len() instead of truthiness: `if value` is ambiguous for numpy arrays.
    if len(value) == 0:
        return None
    first = value[0]
    if _is_sequence(first):
        return first[0] if len(first) > 0 else None
    return first


# Decide the layout from the first non-null cell, so that an empty frame or a
# leading null does not raise or select the wrong branch.
drug_col = obs_df['drug']
has_value = drug_col.notna().to_numpy()
sample = drug_col.iloc[has_value.argmax()] if has_value.any() else None

if _is_sequence(sample):
    obs_df['drug_name'] = drug_col.map(_first_drug_name)
else:
    # Already plain names: reuse the column without a per-row Python call.
    obs_df['drug_name'] = drug_col

# Extract needed columns: one row per distinct (drug, cell line) combination
tahoe_df = obs_df[['drug_name', 'cell_line']].dropna().drop_duplicates()

# Stats
num_drugs = tahoe_df['drug_name'].nunique()
num_cells = tahoe_df['cell_line'].nunique()
num_pairs = tahoe_df.shape[0]

print("🧪 Tahoe Dataset Summary:")
print(f"🔹 Unique Drugs: {num_drugs}")
print(f"🔹 Unique Cell Lines: {num_cells}")
print(f"🔹 (Drug, Cell Line) Pairs: {num_pairs}")


In [None]:
# Read only the identifier column from the GDSC and scRNA-seq tables
sc_df = pd.read_csv("sc_data/rnaseq_all_data.csv", usecols=["model_id"])
gdsc_df = pd.read_csv("gdsc/gdsc_final_cleaned.csv", usecols=["SANGER_MODEL_ID"])

# Identifiers are compared as strings so the column dtypes do not matter.
# NOTE(review): Tahoe 'cell_line' values are intersected directly with the
# model-ID columns of the other two tables — confirm all three use the same
# identifier scheme, otherwise the overlaps will be empty.
tahoe_ids = set(tahoe_df["cell_line"].astype(str))
sc_ids = set(sc_df["model_id"].astype(str))
gdsc_ids = set(gdsc_df["SANGER_MODEL_ID"].astype(str))

# Cell lines shared between Tahoe and each reference table
overlap_gdsc = tahoe_ids.intersection(gdsc_ids)
overlap_sc = tahoe_ids.intersection(sc_ids)

n_shared_gdsc = len(overlap_gdsc)
n_shared_sc = len(overlap_sc)

print(f"\n🔗 Overlap Comparison:")
print(f"🔁 Cell lines in both Tahoe & GDSC: {n_shared_gdsc}")
print(f"🔁 Cell lines in both Tahoe & scRNA-seq: {n_shared_sc}")


In [None]:
# Headline numbers from the cells above, keyed by the label written to the CSV.
summary = {
    "Tahoe Unique Drugs": num_drugs,
    "Tahoe Unique Cell Lines": num_cells,
    "Tahoe (Drug, Cell Line) Pairs": num_pairs,
    "Overlap with GDSC": len(overlap_gdsc),
    "Overlap with scRNA-seq": len(overlap_sc)
}

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
summary_path = Path("statistics/tahoe_obs_summary.csv")
summary_path.parent.mkdir(parents=True, exist_ok=True)

# One row per metric: columns "Metric" and "Value".
pd.DataFrame(list(summary.items()), columns=["Metric", "Value"]).to_csv(summary_path, index=False)
