In [1]:
import os
import time
import dxpy
import logging
import pandas as pd
import re


In [2]:
# Execution switches for the expensive stages of this notebook.
# RERUN_NEARBY_VARIANTS: start Spark/Hail and re-extract the per-gene nearby variants.
# RERUN_GWAS_QC: redo the GWAS summary-statistics QC; when False the saved QC file is loaded instead.
RERUN_NEARBY_VARIANTS = True
RERUN_GWAS_QC = False

In [3]:
if RERUN_NEARBY_VARIANTS:
    # Spark and Hail are only imported when the variant extraction is actually re-run.
    from pyspark.sql import SparkSession
    import hail as hl

    # The session is built explicitly (rather than letting Hail create one) to
    # navigate an RDD partition error.
    # Tuning options that were tried and are currently disabled:
    #   spark.driver.memory = "96g", spark.executor.memory = "108g",
    #   spark.executor.cores = "30"
    builder = SparkSession.builder.appName("Autosome QC").enableHiveSupport()
    spark = builder.getOrCreate()

    # Attach Hail to the existing Spark context; idempotent=True allows this
    # cell to be executed again without raising.
    hl.init(sc=spark.sparkContext, idempotent=True)


pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.5.2
SparkUI available at http://ip-10-60-105-207.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.132-678e1f52b999
LOGGING: writing to /opt/notebooks/hail-20250214-0238-0.2.132-678e1f52b999.log


In [4]:
def save_in_hail_format(hail_obj, db_name, hail_obj_name, rerun):
    """Ensure a DNAnexus database exists and return the ``dnax://`` URL of a Hail object in it.

    Args:
        hail_obj: Object exposing ``write(url, overwrite=True)``; only used when
            ``rerun`` is truthy.
        db_name: Database name. It is interpolated into the CREATE DATABASE
            statement as given and lower-cased for the dxpy lookup.
        hail_obj_name: Name of the object inside the database (last URL component).
        rerun: When truthy, ``hail_obj`` is written to the URL, overwriting any
            existing object. When falsy, nothing is written.

    Returns:
        The ``dnax://<database id>/<hail_obj_name>`` URL.

    Note:
        Relies on the module-level ``spark`` session. The database is created
        (if missing) regardless of ``rerun``.
    """
    # IF NOT EXISTS keeps this safe to repeat; .show() prints the statement result.
    create_stmt = f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION 'dnax://'"
    spark.sql(create_stmt).show()

    # Resolve the database ID through dxpy using the lower-cased name.
    database = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")
    url = f"dnax://{database['id']}/{hail_obj_name}"

    if rerun:
        hail_obj.write(url, overwrite=True)
    return url

def get_url(db_name, hail_obj_name):
    """Return the ``dnax://`` URL of an object inside an existing DNAnexus database.

    This is a pure lookup: nothing is created or written.

    Args:
        db_name: Database name; lower-cased before the dxpy lookup.
        hail_obj_name: Name of the object inside the database (last URL component).

    Returns:
        The ``dnax://<database id>/<hail_obj_name>`` URL.
    """
    database = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")
    return f"dnax://{database['id']}/{hail_obj_name}"

def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder, then delete the local copy.

    Args:
        filename: Path of the local file to upload.
        proj_dir: Destination folder in the project; ``parents=True`` is passed
            to dxpy so missing parent folders are created.

    Returns:
        None. The local file is removed only after the upload call returns.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    print(f"*********{filename} uploaded!!*********")
    # The upload is the durable copy; drop the local scratch file.
    os.remove(filename)

def get_nearby_variants(mt, interval):
    """Collect the common variants of ``mt`` that fall inside one genomic interval.

    Args:
        mt: Hail MatrixTable with a ``variant_qc`` row field providing ``AF``
            and ``AC`` arrays.
        interval: Interval string parsed against GRCh38, e.g.
            ``"chr14:73763316-75763316"``.

    Returns:
        pandas DataFrame of the retained rows. ``maf`` and ``mac`` are filled
        from ``variant_qc.AF[1]`` and ``variant_qc.AC[1]``; ``alleles`` is the
        allele list joined with ``"_"``.
    """
    region = hl.parse_locus_interval(interval, reference_genome='GRCh38')
    region_mt = hl.filter_intervals(mt, [region])

    # Keep rows whose index-1 allele has frequency above 1% and a non-zero count.
    is_common = region_mt.variant_qc.AF[1] > 0.01
    is_observed = region_mt.variant_qc.AC[1] > 0
    region_mt = region_mt.filter_rows(is_common & is_observed)

    # Reduce the row fields to the two statistics needed downstream.
    region_mt = region_mt.select_rows(
        maf=region_mt.variant_qc.AF[1],
        mac=region_mt.variant_qc.AC[1],
    )

    variants_df = region_mt.rows().to_pandas()
    # Flatten the allele list into a single "<a>_<b>" string so it survives a TSV round trip.
    variants_df["alleles"] = variants_df.alleles.apply(lambda allele_list: "_".join(allele_list))
    return variants_df


# Filter autosomal variants to gene intervals

Each interval spans 1 Mb on either side of the gene start.

In [None]:
# Gene symbol -> interval string (parsed against GRCh38 by get_nearby_variants).
gene_interval_dict = {
    "YLPM1": "chr14:73763316-75763316",
    "RIF1": "chr2:150409883-152409883",
    "GIGYF1": "chr7:99679507-101679507",
    "SLC5A3": "chr21:33073578-35073578",
    "GRM7": "chr3:5770001-7770001"
}

if RERUN_NEARBY_VARIANTS:
    url = get_url("exomes", "autosomes_vqc.mt")
    mt = hl.read_matrix_table(url)
    print(mt.n_partitions())

    # One TSV per gene: written locally, uploaded to the gene's project folder,
    # then removed locally by upload_file_to_project, so the shared file name
    # never collides between genes.
    for gene, interval in gene_interval_dict.items():
        nearby_locus_df = get_nearby_variants(mt, interval)

        proj_dir = f"/notebooks/bmi/data/downstream/shadow_effect/{gene}/"
        filename = "nearby_common_variants.tsv"
        nearby_locus_df.to_csv(filename, index=False, sep="\t")
        upload_file_to_project(filename, proj_dir)

    # Release Hail and Spark once every gene has been processed.
    hl.stop()
    spark.sparkContext.stop()
    spark.stop()


63594
*********nearby_common_variants.tsv uploaded!!*********
*********nearby_common_variants.tsv uploaded!!*********


# Process BMI GWAS summary statistics from Locke et al.

In [None]:
# Build (or load) the QC'd GWAS table `gwas_df`.
# Both branches leave `chrom` as "chrN" text and `pos` as text, so downstream
# cells can merge on these keys and concatenate them without caring which
# branch ran.
if RERUN_GWAS_QC:
    gwas_file = "/mnt/project/notebooks/bmi/data/downstream/shadow_effect/locke_harmonised.tsv.gz"
    # Read chromosome labels as text so the "chr" prefix can be prepended below
    # even when pandas would otherwise infer a numeric column.
    gwas_df = pd.read_csv(gwas_file, sep="\t", low_memory=False, dtype={"hm_chrom": str})
    print(len(gwas_df))

    # Keep and arrange the required fields.
    # INFO is not available in this file, so hm_code is carried in its place;
    # it is not used afterwards.
    required_cols = [
        "hm_chrom", "hm_pos", "hm_rsid", "hm_effect_allele", "hm_other_allele",
        "n", "standard_error", "p_value", "hm_beta", "hm_code", "hm_effect_allele_frequency"]

    gwas_df = gwas_df.loc[:, required_cols]
    # Drop rows without a harmonised effect size.
    gwas_df = gwas_df.loc[~gwas_df.hm_beta.isna()].reset_index(drop=True)
    # Drop every row that shares a chrom/pos with another row (keep=False removes all copies).
    gwas_df = gwas_df.loc[~gwas_df.duplicated(["hm_chrom", "hm_pos"], keep=False)]

    # Remove strand-ambiguous SNPs (A/T and C/G pairs, in either orientation).
    # .copy() gives an independent frame so the column assignments below do not
    # hit pandas' SettingWithCopyWarning.
    gwas_df = gwas_df.loc[~
        ((gwas_df.hm_effect_allele=="A")&(gwas_df.hm_other_allele=="T")|
        (gwas_df.hm_effect_allele=="T")&(gwas_df.hm_other_allele=="A")|
        (gwas_df.hm_effect_allele=="C")&(gwas_df.hm_other_allele=="G")|
        (gwas_df.hm_effect_allele=="G")&(gwas_df.hm_other_allele=="C"))
    ].copy()

    # Rename columns and only keep autosomes.
    gwas_df["hm_chrom"] = "chr" + gwas_df.hm_chrom
    # Validate positions as integers, then store them as text: this matches the
    # dtype used when the QC file is re-read in the else-branch.
    gwas_df["hm_pos"] = gwas_df.hm_pos.astype(int).astype(str)
    # Remove the literal "hm_" prefix. str.lstrip("hm_") would instead strip any
    # leading run of the characters 'h', 'm' and '_'.
    gwas_df.columns = [re.sub(r"^hm_", "", c) for c in gwas_df.columns]
    gwas_df = gwas_df.loc[gwas_df.chrom.isin([f"chr{i}" for i in range(1, 23)])]

    proj_dir = "/notebooks/bmi/data/downstream/shadow_effect/"
    filename = "locke_harmonised.qc.tsv.gz"
    gwas_df.to_csv(filename, sep="\t", index=False)
    upload_file_to_project(filename, proj_dir)
else:
    # Reuse the previously saved QC output; positions are loaded as text.
    gwas_file = "/mnt/project/notebooks/bmi/data/downstream/shadow_effect/locke_harmonised.qc.tsv.gz"
    gwas_df = pd.read_csv(gwas_file, sep="\t", low_memory=False, dtype={"pos": str})


# Overlap nearby variants with significant GWAS hits (p < 0.01)

In [None]:
# Positions parsed out of the locus string below are text, and pandas refuses
# to merge text keys with integer keys. Use a text view of the GWAS positions so
# the merge works whatever dtype gwas_df.pos currently has.
gwas_merge_df = gwas_df.assign(pos=gwas_df["pos"].astype(str))

for gene in gene_interval_dict:
    nearby_locus_file = f"/mnt/project/notebooks/bmi/data/downstream/shadow_effect/{gene}/nearby_common_variants.tsv"
    nearby_locus_df = pd.read_csv(nearby_locus_file,  sep="\t")

    # locus is "<chrom>:<pos>"; alleles is "<first>_<second>", where the first
    # allele is matched to the GWAS other allele and the second to the effect allele.
    nearby_locus_df[["chrom", "pos"]] = nearby_locus_df.locus.astype(str).str.split(":", expand=True)
    nearby_locus_df[["other_allele", "effect_allele"]] = nearby_locus_df.alleles.str.split("_", expand=True)

    # Inner join: keep only variants present in the GWAS with the same allele orientation.
    overlapped_df = nearby_locus_df.merge(gwas_merge_df, on=["chrom", "pos", "effect_allele", "other_allele"])
    # .copy() so adding the "variant" column does not trigger SettingWithCopyWarning.
    overlapped_df = overlapped_df.loc[overlapped_df.p_value<0.01].copy()

    # Variant ID is "<chrom without 'chr'>:<pos>:<other>:<effect>". The prefix is
    # removed with an anchored regex; str.lstrip("chr") would strip any leading
    # run of the characters 'c', 'h' and 'r' rather than the literal prefix.
    overlapped_df["variant"] = (
        overlapped_df.chrom.str.replace(r"^chr", "", regex=True)
        + ":" + overlapped_df.pos
        + ":" + overlapped_df.other_allele
        + ":" + overlapped_df.effect_allele
    )

    proj_dir = f"/notebooks/bmi/data/downstream/shadow_effect/{gene}/"
    filename = "conditional_variants.txt"
    # One variant ID per line, no header.
    overlapped_df.loc[:, "variant"].to_csv(filename, sep="\t", index=False, header=False)
    upload_file_to_project(filename, proj_dir)
