In [None]:
# Connor Murray
# Compares cis-eQTLs from TopChef with GTEx (Heart_Left_Ventricle)

# Packages
import os
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

# Paths
# gtex_path: directory holding per-chromosome GTEx v10 eQTL parquet files.
# chef_path: directory holding TopChef tensorQTL nominal cis-eQTL parquet files.
gtex_path = "/standard/vol185/cphg_Manichaikul/projects/GTEx/GTEx_v10_TissueSpecific-eQTL/"
chef_path = "/standard/vol185/cphg_Manichaikul/users/csm6hg/nextflow_dna/output/tensorqtl_nominal/"

# Map GTEx files to chromosomes
# Only parquet files whose name contains "Heart_Left" are kept. The listing is
# sorted because os.listdir() returns entries in arbitrary order.
gtex_files = sorted(
    os.path.join(gtex_path, f) for f in os.listdir(gtex_path)
    if f.endswith(".parquet") and "Heart_Left" in f
)

# chromosome label (e.g. 'chr19') -> full path of the GTEx parquet file
gtex_chrom_map = {}
for f in gtex_files:
    base = os.path.basename(f)
    # Use the first dot-separated field that looks like a chromosome label
    # ('chr' followed by digits or X/Y/M/MT) instead of a fixed field index,
    # so both "<tissue>.chr19.parquet" and longer names with extra
    # dot-separated fields before the chromosome are handled.
    chrom = next(
        (part for part in base.split(".")
         if part.startswith("chr")
         and (part[3:].isdigit() or part[3:] in ("X", "Y", "M", "MT"))),
        None,
    )
    if chrom is None:
        # Report files that cannot be mapped rather than dropping them silently.
        print(f"Skipping {base}: no chromosome label found in file name")
        continue
    if chrom in gtex_chrom_map:
        print(f"Warning: multiple GTEx files for {chrom}; using {base}")
    gtex_chrom_map[chrom] = f

# Get TopChef cis-eQTL files
# Sorted so chromosomes are processed in a reproducible order.
chef_files = sorted(
    os.path.join(chef_path, f) for f in os.listdir(chef_path) if f.endswith(".parquet")
)

# Collect stats
# One summary record is appended per chromosome that has significant
# associations in both TopChef and GTEx.
results = []

# Nominal p-value cutoff applied to both datasets before comparison.
PVAL_THRESHOLD = 1e-5


def _chrom_from_filename(path):
    """Return the first 'chr<N>'-style token in the file's base name, or None.

    The base name is split on both "_" and "." so that names such as
    "topchef_chr19.parquet" and "topchef_chr19_extra.parquet" both yield
    "chr19". A token qualifies when it is 'chr' followed by digits or one of
    X, Y, M, MT.
    """
    for token in os.path.basename(path).replace(".", "_").split("_"):
        suffix = token[3:]
        if token.startswith("chr") and (suffix.isdigit() or suffix in ("X", "Y", "M", "MT")):
            return token
    return None


for file in chef_files:
    chrom = _chrom_from_filename(file)
    if chrom is None:
        print(f"Skipping {os.path.basename(file)}: no chromosome label found in file name")
        continue

    if chrom not in gtex_chrom_map:
        print(f"Skipping {chrom}: No GTEx file available")
        continue

    # Load and filter TopChef (read only the columns used below to limit memory)
    chef_df = pd.read_parquet(file, columns=["variant_id", "phenotype_id", "pval_nominal"])
    chef_df = chef_df[chef_df["pval_nominal"] < PVAL_THRESHOLD].copy()
    if chef_df.empty:
        print(f"Skipping {chrom}: No TopChef associations with p < {PVAL_THRESHOLD:g}")
        continue
    # Variant-gene pair key: "<variant_id>_<phenotype_id>".
    # NOTE(review): for any overlap to be found, TopChef variant_id must look
    # like "chr19:12345" and phenotype_id must be an unversioned gene ID, to
    # match the GTEx key built below -- TODO confirm against the input files.
    chef_df["snp"] = chef_df["variant_id"] + "_" + chef_df["phenotype_id"]
    chef_snps = set(chef_df["snp"])

    # Load and filter GTEx (read only the columns used below to limit memory)
    gtex_df = pd.read_parquet(gtex_chrom_map[chrom], columns=["variant_id", "gene_id", "pval_nominal"])
    gtex_df = gtex_df[gtex_df["pval_nominal"] < PVAL_THRESHOLD].copy()
    if gtex_df.empty:
        print(f"Skipping {chrom}: No GTEx associations with p < {PVAL_THRESHOLD:g}")
        continue
    # Split variant_id once; only the first two "_"-separated fields are used.
    variant_fields = gtex_df["variant_id"].str.split("_", n=2)
    gtex_df["chrom"] = variant_fields.str[0]
    gtex_df["position"] = variant_fields.str[1]
    # Keep the part of gene_id before the first "." (drops a version suffix if present).
    gtex_df["genei"] = gtex_df["gene_id"].str.split(".").str[0]
    # Key: "<chrom>:<position>_<gene>". Alleles are not part of the key, so
    # different variants at the same position and gene collapse to one entry.
    gtex_df["snp"] = gtex_df["chrom"] + ":" + gtex_df["position"] + "_" + gtex_df["genei"]
    gtex_snps = set(gtex_df["snp"])

    # Overlap calculations
    intersection = chef_snps & gtex_snps
    union = chef_snps | gtex_snps

    # chef_snps is non-empty here because empty TopChef frames were skipped above.
    percent_overlap = (len(intersection) / len(chef_snps)) * 100
    jaccard_index = len(intersection) / len(union) if len(union) > 0 else 0

    results.append({
        "chromosome": chrom,
        "percent_overlap_with_gtex": round(percent_overlap, 2),
        "jaccard_index": round(jaccard_index, 4),
        "n_overlap": len(intersection),
        "n_chef": len(chef_snps),
        "n_gtex": len(gtex_snps),
        "n_union": len(union)
    })

    print(f"[{chrom}] Overlap: {len(intersection)} / {len(chef_snps)} | "
          f"Percent: {percent_overlap:.1f}% | Jaccard: {jaccard_index:.4f}")

# Save summary
# Sort order for non-numeric chromosome labels; anything unrecognised goes last.
_SPECIAL_CHROM_ORDER = {"X": 23, "Y": 24, "M": 25, "MT": 25}


def _chrom_order(label):
    """Return an integer sort rank for a chromosome label such as 'chr2' or 'chrX'."""
    suffix = str(label)
    if suffix.startswith("chr"):
        suffix = suffix[3:]
    if suffix.isdigit():
        return int(suffix)
    return _SPECIAL_CHROM_ORDER.get(suffix, 1000)


# Passing the column list explicitly keeps the frame well-formed (and sortable)
# even when no chromosome produced a result. Sorting uses the numeric rank so
# chr2 comes before chr10, instead of plain string order.
summary_df = pd.DataFrame(
    results,
    columns=[
        "chromosome",
        "percent_overlap_with_gtex",
        "jaccard_index",
        "n_overlap",
        "n_chef",
        "n_gtex",
        "n_union",
    ],
).sort_values("chromosome", key=lambda labels: labels.map(_chrom_order), kind="mergesort")

# Tab-separated summary written to the current working directory.
summary_df.to_csv("topchef_cisnominaleqtl_maxPC70_gtex_lv_overlap.txt", sep="\t", index=False)

print(summary_df)

In [None]:
from matplotlib import pyplot as plt
from matplotlib_venn import venn2
import matplotlib

# Aggregate totals
# Per-chromosome counts are summed to genome-wide totals. The pair keys start
# with the chromosome label, so the per-chromosome sets do not overlap.
n_chef_total = int(summary_df["n_chef"].sum())
n_gtex_total = int(summary_df["n_gtex"].sum())
n_overlap_total = int(summary_df["n_overlap"].sum())

only_chef = n_chef_total - n_overlap_total
only_gtex = n_gtex_total - n_overlap_total
union_total = only_chef + only_gtex + n_overlap_total

# Fail with a clear message instead of producing NaN percentages and an
# empty diagram when there is nothing to plot.
if union_total == 0:
    raise ValueError("No variant-gene pairs in summary_df; nothing to plot")

# Calculate percentages (each region as a share of the union)
percent_chef = (only_chef / union_total) * 100
percent_gtex = (only_gtex / union_total) * 100
percent_overlap = (n_overlap_total / union_total) * 100

# Plot Venn
fig, ax = plt.subplots(figsize=(7, 7))
venn = venn2(subsets=(only_chef, only_gtex, n_overlap_total),
             set_labels=('TopChef', 'GTEx v10 Heart LV'),
             set_colors=('skyblue', 'orchid'),
             alpha=0.6,
             ax=ax)

# Modify labels to include percentage below counts
# get_label_by_id() can return None when a region is not drawn (for example a
# zero-sized subset), so only existing labels are updated.
region_labels = (
    ('10', only_chef, percent_chef),
    ('01', only_gtex, percent_gtex),
    ('11', n_overlap_total, percent_overlap),
)
for region_id, count, percent in region_labels:
    label = venn.get_label_by_id(region_id)
    if label is not None:
        label.set_text(f"{count}\n({percent:.1f}%)")

fig.tight_layout()

# Create the output directory if needed so savefig does not fail on a fresh checkout.
output_pdf = "../output/topchef_gtex_eqtl_overlap_summary.pdf"
os.makedirs(os.path.dirname(output_pdf), exist_ok=True)
fig.savefig(output_pdf, format="pdf")