# In [None]:
# Connor Murray
# Started 12.1.2024; modified 2-,4-2025
# Analyzing overlap of disease eQTLs and GTEx across tissue types

# Import libraries
import os
import pandas as pd
import numpy as np
import pyreadr as pr

# Working directory: every relative path used below is resolved against it
os.chdir("/standard/vol185/cphg_Manichaikul/users/csm6hg")

# GTEx v10 eGenes tables: gzipped text files picked by their
# ".eGenes.txt.gz" suffix, stored as paths relative to the working directory.
# NOTE: os.listdir() returns entries in arbitrary order, so the order of
# gtex_files (and the indices printed below) is not guaranteed to be stable.
gtex_files = [
    os.path.join("data/gtex_full_ciseqtl_v10/", fname)
    for fname in os.listdir("data/gtex_full_ciseqtl_v10")
    if fname.endswith(".eGenes.txt.gz")
]

# Heart tissues, matched case-insensitively on the path:
# heart_files holds the paths, heart_indices their positions in gtex_files
heart_files = [path for path in gtex_files if "heart" in path.lower()]
heart_indices = [idx for idx, path in enumerate(gtex_files)
                 if "heart" in path.lower()]
print(heart_indices)

# TOPCHEF eQTLs - mapped using tensorqtl
# The object returned by read_r() is indexed with the key None to get the table.
chef_df = pr.read_r("nextflow_dna/output/analyses/qtl.rna.saturation.rds")[None]

# Keep rows from the maxPC == 70 run whose permutation p-value is below 0.05.
# NOTE(review): presumably maxPC == 70 is the "best model" the original
# comment referred to - confirm.
keep_rows = (chef_df['maxPC'] == 70) & (chef_df["pval_perm"] < 0.05)
chef_df = chef_df[keep_rows]

# Pick the GTEx heart eGenes table to compare against.
# gtex_files comes from os.listdir(), whose order is arbitrary, so taking the
# first "Heart" match as-is could load a different tissue from run to run.
# Rank the candidates explicitly instead: left ventricle first, then
# alphabetical, so r[0] is always the same file.
# NOTE(review): left ventricle is assumed to be the intended tissue because
# the output file written at the end of this script is suffixed "_lv" - confirm.
r = sorted((item for item in gtex_files if "Heart" in item),
           key=lambda item: ("Left_Ventricle" not in item, item))
if not r:
    # Fail with an explicit message rather than a bare IndexError on r[0]
    raise FileNotFoundError(
        "No GTEx eGenes file with 'Heart' in its name was found in "
        "data/gtex_full_ciseqtl_v10")

# Read the full table for the selected tissue (r[0] is reused later for the
# tissue label) and keep rows with permutation p-value below 0.05
gtex_df = pd.read_table(r[0])
gtex_df = gtex_df[gtex_df["pval_perm"] < 0.05]

# Build a SNP-gene key for GTEx in the form "<chrom>:<position>_<gene>".
# chrom and position are the first two "_"-separated fields of variant_id;
# the gene part is gene_id up to its first "." (presumably dropping a version
# suffix - confirm against the input file).
gtex_variant_fields = gtex_df['variant_id'].str.split("_")
gtex_chrom = gtex_variant_fields.str[0]
gtex_position = gtex_variant_fields.str[1]
gtex_gene = gtex_df['gene_id'].str.split(".").str[0]

gtex_df = gtex_df.assign(
    chrom=gtex_chrom,
    position=gtex_position,
    genei=gtex_gene,
    snp=gtex_chrom + ":" + gtex_position + "_" + gtex_gene)

# TOPCHEF SNP-gene key: "<variant_id>_<phenotype_id>", to be matched against
# the GTEx 'snp' column
chef_df = chef_df.assign(
    snp=chef_df['variant_id'] + "_" + chef_df['phenotype_id'])

# Boolean per TOPCHEF row: is its key present among the GTEx keys?
prop_table = chef_df['snp'].isin(gtex_df['snp'])

# Percentage of True / False rows, rounded to two decimals
proportion = prop_table.value_counts(normalize=True).mul(100).round(2)

# Flag each TOPCHEF row whose SNP-gene key also appears in the GTEx table
chef_df['overlaps_gtex'] = chef_df['snp'].isin(gtex_df['snp'])

# Tissue label: the loaded GTEx file's base name up to its first "."
gtex_tissue_label = os.path.basename(r[0]).split(".")[0]
gtex_df['gtex_tissue'] = gtex_tissue_label

# GTEx columns carried into the merged table; the statistics are prefixed
# with "gtex_" so they do not clash with TOPCHEF columns of the same name
gtex_keep = ['snp', 'slope', 'slope_se', 'pval_perm', 'gtex_tissue']
gtex_renames = {'slope': 'gtex_slope',
                'slope_se': 'gtex_slope_se',
                'pval_perm': 'gtex_pval_perm'}
gtex_metrics = gtex_df[gtex_keep].rename(columns=gtex_renames)

# Left join: every TOPCHEF row is kept, GTEx fields are filled where keys match
chef_df1 = pd.merge(chef_df, gtex_metrics, how='left', on='snp')
print(chef_df1)

# Write the merged table as tab-separated text (the DataFrame index is
# written as the first column, since to_csv is called with its default index)
chef_df1.to_csv("topchef_eqtl_maxPC70_gtex_lv.txt", sep="\t")

# Percent of TOPCHEF keys found / not found in GTEx; 0.0 when a category
# is absent from the value counts
true_prop, false_prop = (proportion.get(flag, 0.0) for flag in (True, False))

# Bare expression: displays the overlap percentage when run as a notebook cell
true_prop