In [33]:
# Connor Murray
# Extract RSID for each SNP
# 12.10.2024

# Load packages
import requests
import pandas as pd
import pyreadr as pr
import os

# Working directory: the input/output paths used further down ("data/...")
# are relative, so they are resolved against this project directory.
os.chdir("/standard/vol185/cphg_Manichaikul/users/csm6hg")

# Query for metadata function
def get_rsid_and_extra_info(chrom, pos):
    """
    Query the Ensembl REST API for the variants overlapping a single position
    and for their allele annotation.

    Two kinds of request are made: one GFF3 "overlap/region" query to list the
    variants at chrom:pos, then one "variation" query per variant found.

    Args:
        chrom: Chromosome name as interpolated into the Ensembl region string
            (the caller in this file strips the "chr" prefix first).
        pos: Position on the chromosome; used as both region start and end.

    Returns:
        A tuple (rsid_list, ancestral_allele, derived_allele, maf).
        rsid_list is a list of (rsid, consequence) tuples, one per GFF3 record
        that carries an ID attribute; consequence is None when the record has
        no consequence_type attribute.
        The three allele values are shared by the whole position: they come
        from the last variant in rsid_list whose allele query succeeded, and
        are None when no allele query succeeded. Missing fields in a
        successful reply are reported as "N/A".
        On a failed overlap query the result is ([], None, None, None).
    """
    # Without a timeout a stalled connection would block the batch forever.
    request_timeout = 30  # seconds

    # Query for variations (RSIDs and consequence types)
    variation_url = f"https://rest.ensembl.org/overlap/region/homo_sapiens/{chrom}:{pos}-{pos}?feature=variation;content-type=text/x-gff3"
    try:
        variation_response = requests.get(variation_url, timeout=request_timeout)
    except requests.RequestException as err:
        # Report network failures the same way as HTTP errors so that one bad
        # request does not abort the caller's loop.
        print(f"Error: {err} for variation query at {chrom}:{pos}")
        return [], None, None, None

    if not variation_response.ok:
        print(f"Error: {variation_response.status_code} for variation query at {chrom}:{pos}")
        return [], None, None, None

    # Parse variations (RSIDs): column 9 of each GFF3 record holds
    # semicolon-separated key=value attributes.
    rsid_list = []  # Store RSIDs and consequence
    for line in variation_response.text.splitlines():
        if line.startswith("#"):  # Skip comment lines
            continue
        fields = line.split("\t")
        if len(fields) < 9:
            continue
        rsid = None
        consequence = None
        for attr in fields[8].split(";"):
            # Split on the first "=" only so values containing "=" stay whole.
            if attr.startswith("ID="):
                rsid = attr.split("=", 1)[1].replace("sequence_variant:", "")
            if attr.startswith("consequence_type="):
                consequence = attr.split("=", 1)[1].replace("consequence_type:", "")
        if rsid:
            rsid_list.append((rsid, consequence))

    # Query for allele frequency and ancestral/derived alleles
    ancestral_allele = None
    derived_allele = None
    maf = None

    for rsid, _ in rsid_list:
        allele_url = f"https://rest.ensembl.org/variation/homo_sapiens/{rsid}?content-type=application/json"
        try:
            allele_response = requests.get(allele_url, timeout=request_timeout)
        except requests.RequestException as err:
            print(f"Error: {err} for allele query on {rsid}")
            continue

        if not allele_response.ok:
            continue

        allele_data = allele_response.json()
        # A variant may come back with no mappings at all; fall back to an
        # empty mapping instead of raising KeyError/IndexError.
        mappings = allele_data.get("mappings") or []
        mapping = mappings[0] if mappings else {}  # Use the first mapping
        ancestral_allele = mapping.get("ancestral_allele", "N/A")
        derived_allele = allele_data.get("minor_allele", "N/A")
        maf = allele_data.get("MAF", "N/A")

    return rsid_list, ancestral_allele, derived_allele, maf

def get_gene_description(gene_id):
    """
    Query the Ensembl REST API "lookup/id" endpoint for a gene's description.

    Args:
        gene_id: Ensembl gene ID interpolated into the lookup URL.

    Returns:
        The "description" field of the JSON reply, or None when the field is
        absent, the server answers with an error status, or the request
        itself fails (timeout, connection error).
    """
    url = f"https://rest.ensembl.org/lookup/id/{gene_id}?content-type=application/json"
    try:
        # Bound the wait so a stalled connection cannot hang the batch.
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # Treat network failures like HTTP errors: report and return None.
        print(f"Error: {err} for description query on {gene_id}")
        return None

    if not response.ok:
        print(f"Error: {response.status_code} for description query on {gene_id}")
        return None

    return response.json().get("description", None)  # Gene description

def get_rsid_gene_and_info(chrom, pos):
    """
    Combines RSID, gene, gene description, and allele information for one
    position into a single result.

    Args:
        chrom: Chromosome name as interpolated into the Ensembl region string.
        pos: Position on the chromosome; used as both region start and end.

    Returns:
        A tuple (rsid_list, gene_id, gene_name, gene_description, ancestral,
        derived, maf). rsid_list, ancestral, derived and maf are passed
        through from get_rsid_and_extra_info. The gene fields describe the
        last gene record in the overlap reply and are None when no gene
        overlaps the position or the gene query fails.
    """
    rsid_list, ancestral, derived, maf = get_rsid_and_extra_info(chrom, pos)

    # Query for overlapping gene information
    gene_url = f"https://rest.ensembl.org/overlap/region/homo_sapiens/{chrom}:{pos}-{pos}?feature=gene;content-type=text/x-gff3"

    gene_id = None
    gene_name = None
    gene_description = None

    try:
        # Bound the wait so a stalled connection cannot hang the batch.
        gene_response = requests.get(gene_url, timeout=30)
    except requests.RequestException as err:
        # Keep the variant information already collected; only the gene
        # fields stay empty.
        print(f"Error: {err} for gene query at {chrom}:{pos}")
        return rsid_list, gene_id, gene_name, gene_description, ancestral, derived, maf

    if gene_response.ok:
        for line in gene_response.text.splitlines():
            if line.startswith("#"):
                continue
            fields = line.split("\t")
            if len(fields) < 9:
                continue
            # Parse column 9 into a dict so ID and Name are always taken from
            # the same record, whatever order the attributes appear in.
            attributes = dict(
                attr.split("=", 1) for attr in fields[8].split(";") if "=" in attr
            )
            if "ID" not in attributes:
                continue
            gene_id = attributes["ID"].replace("gene:", "")
            # Reset the name for every record: a gene without a Name must not
            # inherit the name of a previously parsed gene.
            gene_name = attributes.get("Name")
        if gene_id:
            gene_description = get_gene_description(gene_id)  # Get gene description
    return rsid_list, gene_id, gene_name, gene_description, ancestral, derived, maf

# Load sentinel variant list
sig_genes_file = "data/qtl.rna.saturation.maf01.11.12.24.rds"
output_file = "data/rsid_output_new.txt"  # Output as tab-delimited TXT

# Read the RDS file
sig_genes = pr.read_r(sig_genes_file)
sig_genes = sig_genes[None]  # Extract the DataFrame from the RDS content

# Filter the DataFrame based on the conditions for maxPC and pval_perm
filtered_sig_genes = sig_genes[(sig_genes['maxPC'] == 11) & (sig_genes['pval_perm'] < 0.05)]

# Extract unique variants. drop_duplicates keeps first-seen order and avoids
# querying (and writing) the same variant more than once.
unique_variants = filtered_sig_genes['variant_id'].drop_duplicates().to_list()

# Initialize results list (accumulates every row written during this run)
results = []

# Iterate through each variant and extract RSID
for variant in unique_variants:
    chrom, pos = variant.split(":")
    pos = pos.split("[")[0]  # Remove brackets and annotations

    print(f"Processing {chrom}:{pos}...")
    rsid_data, gene_id, gene_name, gene_description, ancestral, derived, maf = get_rsid_gene_and_info(chrom.replace("chr", ""), pos)

    # One output row per RSID found at this position
    variant_rows = [
        {
            "variant_id": variant,
            "chromosome": chrom,
            "position": pos,
            "rsid": rsid,
            "consequence": consequence,
            "ensembl_gene_id": gene_id,
            "gene_name": gene_name,
            "gene_description": gene_description,
            "ancestral_allele": ancestral,
            "derived_allele": derived,
            "maf": maf,
        }
        for rsid, consequence in rsid_data
    ]

    # Nothing found for this variant: write nothing, rather than re-appending
    # a row that belongs to an earlier variant.
    if not variant_rows:
        continue
    results.extend(variant_rows)

    # Save intermediate results to output file: append every row of the
    # current variant, emitting the header only while the file is still
    # missing or empty.
    write_header = not os.path.exists(output_file) or os.stat(output_file).st_size == 0
    pd.DataFrame(variant_rows).to_csv(output_file, sep="\t", mode='a', header=write_header, index=False)

Processing chr1:1026225...
Processing chr1:907986...
Processing chr1:1503422...
Processing chr1:1071101...
Processing chr1:1127765...
Processing chr1:788418...
Processing chr1:841166...
Processing chr1:825532...
Processing chr1:982112...
Processing chr1:995512...
Processing chr1:984475...
Processing chr1:1022518...
Processing chr1:914061...
Processing chr1:1188054...
Processing chr1:1231710...
Processing chr1:1217251...
Processing chr1:1276846...
Processing chr1:1307327...
Processing chr1:1963312...
Processing chr1:1312114...
Processing chr1:2287876...
Processing chr1:1366183...
Processing chr1:1380867...
Processing chr1:1410590...
Processing chr1:1440430...
Processing chr1:1434243...
Processing chr1:1140858...
Processing chr1:1648636...
Processing chr1:1613421...
Processing chr1:2287876...
Processing chr1:1619986...
Processing chr1:1669576...
Processing chr1:1851793...
Processing chr1:1658830...
Processing chr1:1755713...
Processing chr1:1786996...
Processing chr1:1881065...
Processin

KeyboardInterrupt: 

In [30]:
# Scratch check: fetch the Ensembl variation record for a single known RSID
allele_url = (
    "https://rest.ensembl.org/variation/homo_sapiens/"
    "rs112555755?content-type=application/json"
)
allele_response = requests.get(allele_url)


C T 0.036303
