# In [None]:
# By: Connor S. Murray
# Started: 12.8.2024
# This script gets SNP/gene Ensembl metadata from chromosome and position information
# Example input: ["chr1:1026225", "chr1:907986"]

# Libraries
import functools
import os
import time
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import pyreadr as pr
import requests

# Run from the project directory so the relative data/ paths used below resolve
os.chdir("/standard/vol185/cphg_Manichaikul/users/csm6hg")

# Size of the thread pool that issues the API requests
cores = 10

# Define retry logic for requests
def retry_request(func=None, *, retries=3, delay=2):
    """
    Decorator that re-issues a request while the response is not OK.

    Can be applied bare (``@retry_request``) or with options
    (``@retry_request(retries=5, delay=1)``).

    Args:
        func: Function that returns a response object exposing
            ``status_code`` and ``ok``.
        retries: Total number of attempts, keyword-only (default 3).
        delay: Seconds to wait between attempts, keyword-only (default 2).

    Returns:
        The wrapped function. It returns the first OK response, or the last
        response received when every attempt failed, so callers still need
        to check ``response.ok``.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")
    if func is None:
        # Called as @retry_request(...): return a decorator carrying the options
        return functools.partial(retry_request, retries=retries, delay=delay)

    @functools.wraps(func)  # Keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        response = None
        for attempt in range(retries):
            response = func(*args, **kwargs)
            if response.ok:
                if attempt > 0:  # If it worked after retries
                    print(f"Request successful after {attempt} retries.")
                return response  # Successful response
            if response.status_code == 429:
                print(f"Rate limit hit. Retrying... Attempt {attempt + 1} of {retries}.")
            else:
                print(f"Unexpected error (status {response.status_code}). Retrying...")
            # Only wait when another attempt follows; sleeping after the
            # final failure would just delay the caller
            if attempt < retries - 1:
                time.sleep(delay)
        print(f"Request failed after {retries} retries. Last status code: {response.status_code}.")
        return response  # Final response, even if unsuccessful
    return wrapper

# Apply retry decorator only to requests.get calls
@retry_request
def make_request(url, timeout=30):
    """
    Send a GET request to `url`, retried by the `retry_request` decorator.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up
            (default 30). Without a timeout, requests.get can block a
            worker thread indefinitely on a stalled connection.

    Returns:
        The response object produced by requests.get.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    return requests.get(url, timeout=timeout)

# Query for metadata function with retry logic
def get_rsid_and_extra_info(chrom, pos):
    """
    Query the Ensembl REST API for the variants overlapping one position
    and for their allele information.

    Args:
        chrom: Chromosome name placed in the Ensembl region string.
        pos: Position, used as both the start and end of the region.

    Returns:
        A tuple (rsid_list, ancestral_allele, derived_allele, maf).
        rsid_list holds one (rsid, consequence) tuple per variant parsed
        from the GFF3 response. The three allele values come from the last
        variant whose lookup succeeded ("N/A" when the field is absent from
        the response) and stay None when no lookup succeeded.
    """
    variation_url = f"https://rest.ensembl.org/overlap/region/homo_sapiens/{chrom}:{pos}-{pos}?feature=variation;content-type=text/x-gff3"
    variation_response = make_request(variation_url)

    rsid_list = []
    ancestral_allele = None
    derived_allele = None
    maf = None

    if not variation_response.ok:
        print(f"Error: {variation_response.status_code} for variation query at {chrom}:{pos}")
        return rsid_list, ancestral_allele, derived_allele, maf

    for line in variation_response.text.splitlines():
        if line.startswith("#"):  # Skip comment lines
            continue
        fields = line.split("\t")
        if len(fields) < 9:  # The attributes live in the ninth GFF3 column
            continue
        rsid = None
        consequence = None
        for attr in fields[8].split(";"):
            if attr.startswith("ID="):
                rsid = attr.split("=", 1)[1].replace("sequence_variant:", "")
            if attr.startswith("consequence_type="):
                consequence = attr.split("=", 1)[1].replace("consequence_type:", "")
        if rsid:
            rsid_list.append((rsid, consequence))

    # Query for allele frequency and ancestral/derived alleles.
    # Each successful lookup overwrites the previous values, so the
    # returned alleles belong to the last variant that could be fetched.
    for rsid, _ in rsid_list:
        allele_url = f"https://rest.ensembl.org/variation/homo_sapiens/{rsid}?content-type=application/json"
        allele_response = make_request(allele_url)
        if not allele_response.ok:
            continue

        allele_data = allele_response.json()
        # A variant may come back without any mapping; fall back to an
        # empty one instead of failing on the [0] index
        mappings = allele_data.get("mappings") or [{}]
        mapping = mappings[0]  # Use the first mapping
        ancestral_allele = mapping.get("ancestral_allele", "N/A")
        derived_allele = allele_data.get("minor_allele", "N/A")
        maf = allele_data.get("MAF", "N/A")

    return rsid_list, ancestral_allele, derived_allele, maf

def get_gene_description(gene_id):
    """
    Look up the description of an Ensembl gene ID through the REST API.

    Returns the fallback text when the request fails or when the response
    carries no 'description' field.
    """
    fallback = 'No description available'
    lookup_url = f"https://rest.ensembl.org/lookup/id/{gene_id}?content-type=application/json"
    reply = make_request(lookup_url)
    if not reply.ok:
        return fallback
    return reply.json().get('description', fallback)

def get_rsid_gene_and_info(chrom, pos):
    """
    Combines RSID, gene, gene description, and allele information into a single function.

    Args:
        chrom: Chromosome name placed in the Ensembl region string.
        pos: Position, used as both the start and end of the region.

    Returns:
        A tuple (rsid_list, gene_id, gene_name, gene_description,
        ancestral, derived, maf). The gene fields describe the last gene
        listed in the overlap response and are None when no gene overlaps
        or the gene query fails; gene_name is None for a gene without a
        Name attribute.
    """
    rsid_list, ancestral, derived, maf = get_rsid_and_extra_info(chrom, pos)

    # Query for overlapping gene information
    gene_url = f"https://rest.ensembl.org/overlap/region/homo_sapiens/{chrom}:{pos}-{pos}?feature=gene;content-type=text/x-gff3"
    gene_response = make_request(gene_url)

    gene_id = None
    gene_name = None
    gene_description = None

    if gene_response.ok:
        for line in gene_response.text.splitlines():
            if line.startswith("#"):
                continue
            fields = line.split("\t")
            if len(fields) < 9:  # The attributes live in the ninth GFF3 column
                continue
            line_id = None
            line_name = None
            for attr in fields[8].split(";"):
                if attr.startswith("ID="):
                    line_id = attr.split("=", 1)[1].replace("gene:", "")
                elif attr.startswith("Name="):
                    line_name = attr.split("=", 1)[1]
            # Update ID and name together so a gene without a Name
            # attribute never inherits the name of an earlier gene
            if line_id:
                gene_id, gene_name = line_id, line_name
        if gene_id:
            gene_description = get_gene_description(gene_id)  # Get gene description
    return rsid_list, gene_id, gene_name, gene_description, ancestral, derived, maf

def process_variant(variant):
    """
    Build the output rows for one variant ID.

    Args:
        variant: Variant ID of the form "chr1:1026225"; anything from a "["
            onwards in the position part is dropped.

    Returns:
        A list with one dict per rsid found at the position, keyed by the
        output column names. The list is empty when no rsid was found, so
        such variants contribute no rows.
    """
    chrom, pos = variant.split(":")
    pos = pos.split("[")[0]  # Remove brackets and annotations

    print(f"Processing {chrom}:{pos}...")
    rsid_data, gene_id, gene_name, gene_description, ancestral, derived, maf = get_rsid_gene_and_info(chrom.replace("chr", ""), pos)

    if not rsid_data:
        return []

    # The permutation p-value depends only on the variant, so scan the
    # table once here instead of once per rsid
    pval_perm = filtered_sig_genes.loc[filtered_sig_genes['variant_id'] == variant, 'pval_perm'].values[0]

    return [
        {
            "variant_id": variant,
            "chromosome": chrom,
            "position": pos,
            "rsid": rsid,
            "consequence": consequence,
            "ensembl_gene_id": gene_id,
            "gene_name": gene_name,
            "gene_description": gene_description,
            "ancestral_allele": ancestral,
            "derived_allele": derived,
            "maf": maf,
            "pval_perm": pval_perm,
        }
        for rsid, consequence in rsid_data
    ]

# Load sentinel variant list
sig_genes_file = "data/qtl.rna.saturation.maf01.11.12.24.rds"
output_file = "data/rsid_all_eGenes1.txt"

# Read the RDS file
sig_genes = pr.read_r(sig_genes_file)
sig_genes = sig_genes[None]  # Extract the DataFrame from the RDS content

# Keep only the rows with maxPC == 11; the stricter variant that also
# requires pval_perm < 0.05 is kept below but currently disabled
#filtered_sig_genes = sig_genes[(sig_genes['maxPC'] == 11) & (sig_genes['pval_perm'] < 0.05)]
filtered_sig_genes = sig_genes[(sig_genes['maxPC'] == 11)]

# Extract unique variants (first occurrence kept, original order preserved)
# so a variant shared by several rows is queried and written only once
unique_variants = filtered_sig_genes['variant_id'].drop_duplicates().to_list()

# Use ThreadPoolExecutor to parallelize API requests
def main(unique_variants):
    """
    Process every variant in a thread pool and write the rows to `output_file`.

    The file is tab-separated with a header line. Rows are written in the
    order of `unique_variants`, because executor.map yields results in
    input order.

    Args:
        unique_variants: Iterable of variant IDs passed to process_variant.
    """
    # Single source of truth for the header and the per-row field order
    columns = [
        "variant_id", "chromosome", "position", "rsid", "consequence",
        "ensembl_gene_id", "gene_name", "gene_description", "ancestral_allele",
        "derived_allele", "maf", "pval_perm",
    ]

    with open(output_file, "w") as file:
        # Write headers to the output file initially
        file.write("\t".join(columns) + "\n")

        with ThreadPoolExecutor(max_workers=cores) as executor:
            for result in executor.map(process_variant, unique_variants):
                file.writelines(
                    "\t".join(str(item.get(col, '')) for col in columns) + "\n"
                    for item in result
                )
                # Push each variant's rows out right away so finished work
                # is not lost if a later request fails
                file.flush()

# Run the pipeline only when executed directly (script or notebook cell),
# not when this file is imported
if __name__ == "__main__":
    main(unique_variants)