In [None]:
import pandas as pd
import os
import mygene
import json
from tqdm.notebook import tqdm

In [None]:
# Significance cutoff shared by all disease sections: associations whose
# P-value is above this value are discarded right after each CSV is loaded.
# 5e-8 is the conventional genome-wide significance level.
PVAL_THRESHOLD = 5e-8
# Single MyGene.info client reused by every querymany call in this notebook.
mg = mygene.MyGeneInfo()

# Asthma

In [None]:
# Read the GWAS Catalog asthma export, convert the textual "P-value" column to
# floats by rewriting " x 10" as the "e" of scientific notation (presumably the
# raw values look like "5 x 10-8" -- confirm against the CSV), and keep only
# the associations at or below PVAL_THRESHOLD.
asthma_targets = (
    pd.read_csv("data/GWAS_catalog_26-01/efotraits_MONDO_0004979-associations-2023-01-26_asthma.csv")
    .assign(**{"P-value": lambda frame: frame["P-value"].str.replace(" x 10", "e").astype(float)})
    .loc[lambda frame: frame["P-value"] <= PVAL_THRESHOLD]
)
asthma_targets

In [None]:
# Build the sorted list of unique gene names mentioned in the "Mapped gene"
# column. A cell may list several genes separated by ", ".
# Rows with no mapped gene are read by pandas as NaN (a float), which has no
# .split method, so they are dropped before splitting instead of crashing.
asthma_genes = sorted({
    gene
    for mapped in asthma_targets["Mapped gene"].dropna()
    for gene in mapped.split(", ")
})

In [None]:
# Look up every asthma gene name through the MyGene.info client, matching the
# names against both official symbols and aliases and restricting to human.
# The result is requested with returnall=True; the hits are read from its
# "out" entry by the cells below.
# NOTE: `gene_mapping` is overwritten by the autism and schizophrenia sections,
# so the asthma resolution cell must run before those sections do.
gene_mapping = mg.querymany(asthma_genes, scopes=["symbol", "alias"], species="human", returnall=True)
# Preview the first three hits as a sanity check of the response.
gene_mapping["out"][:3]

In [None]:
# Resolve each asthma gene name to a single Entrez (NCBI) gene ID.
#
# Selection rule per gene name:
#   1. only hits that carry both "_score" and "entrezgene" are considered;
#   2. a hit whose "symbol" equals the queried name beats any alias-only hit;
#   3. within the same group the highest "_score" wins (first hit on ties).
# Names without any usable hit are skipped.

# Index the usable hits by query term once, instead of rescanning the whole
# response for every gene.
hits_by_query = {}
for hit in gene_mapping["out"]:
    if "_score" in hit and "entrezgene" in hit:
        hits_by_query.setdefault(hit["query"], []).append(hit)

ncbi_ids = []
gene2ncbi = {}
for gn in tqdm(asthma_genes):
    candidates = hits_by_query.get(gn)
    if not candidates:
        continue

    # .get() keeps a hit without a "symbol" field from raising KeyError; such a
    # hit simply never counts as an exact symbol match.
    best = max(candidates, key=lambda hit: (hit.get("symbol") == gn, hit["_score"]))
    # ncbi_ids is appended unconditionally, so it can contain the same ID twice
    # when two gene names resolve to the same entry.
    ncbi_ids.append(best["entrezgene"])
    gene2ncbi[gn] = best["entrezgene"]
len(ncbi_ids)

In [None]:
# Persist the asthma results: the flat list of Entrez IDs and the
# gene-name -> Entrez ID mapping, both as JSON.
# open(..., "w") does not create parent folders, so make sure the output
# directory exists first (a fresh checkout may not have it).
os.makedirs("processed_data/gwas_catalog_targets", exist_ok=True)
with open("processed_data/gwas_catalog_targets/asthma_targets_ncbi_gwas_catalog.json", "w") as f:
    json.dump(ncbi_ids, f)
with open("processed_data/gwas_catalog_targets/asthma_targets_gene2ncbi.json", "w") as f:
    json.dump(gene2ncbi, f, indent=2)

# Autism

In [None]:
# Read the GWAS Catalog autism (ASD) export, convert the textual "P-value"
# column to floats by rewriting " x 10" as the "e" of scientific notation
# (presumably the raw values look like "5 x 10-8" -- confirm against the CSV),
# and keep only the associations at or below PVAL_THRESHOLD.
autism_targets = (
    pd.read_csv("data/GWAS_catalog_26-01/efotraits_EFO_0003756-associations-2023-01-26_asd.csv")
    .assign(**{"P-value": lambda frame: frame["P-value"].str.replace(" x 10", "e").astype(float)})
    .loc[lambda frame: frame["P-value"] <= PVAL_THRESHOLD]
)
autism_targets

In [None]:
# Build the sorted list of unique gene names mentioned in the "Mapped gene"
# column. A cell may list several genes separated by ", ".
# Rows with no mapped gene are read by pandas as NaN (a float), which has no
# .split method, so they are dropped before splitting instead of crashing.
autism_genes = sorted({
    gene
    for mapped in autism_targets["Mapped gene"].dropna()
    for gene in mapped.split(", ")
})

In [None]:
# Look up every autism gene name through the MyGene.info client, matching the
# names against both official symbols and aliases and restricting to human.
# The result is requested with returnall=True; the hits are read from its
# "out" entry by the cells below.
# NOTE: this rebinds `gene_mapping`, replacing the asthma lookup result, and is
# itself replaced by the schizophrenia section -- run the cells in order.
gene_mapping = mg.querymany(autism_genes, scopes=["symbol", "alias"], species="human", returnall=True)
# Preview the first three hits as a sanity check of the response.
gene_mapping["out"][:3]

In [None]:
# Resolve each autism gene name to a single Entrez (NCBI) gene ID.
#
# Selection rule per gene name:
#   1. only hits that carry both "_score" and "entrezgene" are considered;
#   2. a hit whose "symbol" equals the queried name beats any alias-only hit;
#   3. within the same group the highest "_score" wins (first hit on ties).
# Names without any usable hit are skipped.

# Index the usable hits by query term once, instead of rescanning the whole
# response for every gene.
hits_by_query = {}
for hit in gene_mapping["out"]:
    if "_score" in hit and "entrezgene" in hit:
        hits_by_query.setdefault(hit["query"], []).append(hit)

ncbi_ids = []
gene2ncbi = {}
for gn in tqdm(autism_genes):
    candidates = hits_by_query.get(gn)
    if not candidates:
        continue

    # .get() keeps a hit without a "symbol" field from raising KeyError; such a
    # hit simply never counts as an exact symbol match.
    best = max(candidates, key=lambda hit: (hit.get("symbol") == gn, hit["_score"]))
    # ncbi_ids is appended unconditionally, so it can contain the same ID twice
    # when two gene names resolve to the same entry.
    ncbi_ids.append(best["entrezgene"])
    gene2ncbi[gn] = best["entrezgene"]
len(ncbi_ids)

In [None]:
# Persist the autism results: the flat list of Entrez IDs and the
# gene-name -> Entrez ID mapping, both as JSON.
# open(..., "w") does not create parent folders, so make sure the output
# directory exists first (a fresh checkout may not have it).
os.makedirs("processed_data/gwas_catalog_targets", exist_ok=True)
with open("processed_data/gwas_catalog_targets/autism_targets_ncbi_gwas_catalog.json", "w") as f:
    json.dump(ncbi_ids, f)
with open("processed_data/gwas_catalog_targets/autism_targets_gene2ncbi.json", "w") as f:
    json.dump(gene2ncbi, f, indent=2)

# Schizophrenia

In [None]:
# Read the GWAS Catalog schizophrenia export, convert the textual "P-value"
# column to floats by rewriting " x 10" as the "e" of scientific notation
# (presumably the raw values look like "5 x 10-8" -- confirm against the CSV),
# and keep only the associations at or below PVAL_THRESHOLD.
sch_targets = (
    pd.read_csv("data/GWAS_catalog_26-01/efotraits_MONDO_0005090-associations-2023-01-26_schizophrenia.csv")
    .assign(**{"P-value": lambda frame: frame["P-value"].str.replace(" x 10", "e").astype(float)})
    .loc[lambda frame: frame["P-value"] <= PVAL_THRESHOLD]
)
sch_targets

In [None]:
# Build the sorted list of unique gene names mentioned in the "Mapped gene"
# column. A cell may list several genes separated by ", ".
# Rows with no mapped gene are read by pandas as NaN (a float), which has no
# .split method, so they are dropped before splitting instead of crashing.
sch_genes = sorted({
    gene
    for mapped in sch_targets["Mapped gene"].dropna()
    for gene in mapped.split(", ")
})

In [None]:
# Look up every schizophrenia gene name through the MyGene.info client,
# matching the names against both official symbols and aliases and restricting
# to human. The result is requested with returnall=True; the hits are read
# from its "out" entry by the cells below.
# NOTE: this rebinds `gene_mapping`, replacing the lookup result of the
# previous disease section -- run the cells in order.
gene_mapping = mg.querymany(sch_genes, scopes=["symbol", "alias"], species="human", returnall=True)
# Preview the first three hits as a sanity check of the response.
gene_mapping["out"][:3]

In [None]:
# Resolve each schizophrenia gene name to a single Entrez (NCBI) gene ID.
#
# Selection rule per gene name:
#   1. only hits that carry both "_score" and "entrezgene" are considered;
#   2. a hit whose "symbol" equals the queried name beats any alias-only hit;
#   3. within the same group the highest "_score" wins (first hit on ties).
# Names without any usable hit are skipped.

# Index the usable hits by query term once, instead of rescanning the whole
# response for every gene.
hits_by_query = {}
for hit in gene_mapping["out"]:
    if "_score" in hit and "entrezgene" in hit:
        hits_by_query.setdefault(hit["query"], []).append(hit)

ncbi_ids = []
gene2ncbi = {}
for gn in tqdm(sch_genes):
    candidates = hits_by_query.get(gn)
    if not candidates:
        continue

    # .get() keeps a hit without a "symbol" field from raising KeyError; such a
    # hit simply never counts as an exact symbol match.
    best = max(candidates, key=lambda hit: (hit.get("symbol") == gn, hit["_score"]))
    # ncbi_ids is appended unconditionally, so it can contain the same ID twice
    # when two gene names resolve to the same entry.
    ncbi_ids.append(best["entrezgene"])
    gene2ncbi[gn] = best["entrezgene"]
len(ncbi_ids)

In [None]:
# Persist the schizophrenia results: the flat list of Entrez IDs and the
# gene-name -> Entrez ID mapping, both as JSON.
# open(..., "w") does not create parent folders, so make sure the output
# directory exists first (a fresh checkout may not have it).
os.makedirs("processed_data/gwas_catalog_targets", exist_ok=True)
with open("processed_data/gwas_catalog_targets/schizophrenia_targets_ncbi_gwas_catalog.json", "w") as f:
    json.dump(ncbi_ids, f)
with open("processed_data/gwas_catalog_targets/schizophrenia_targets_gene2ncbi.json", "w") as f:
    json.dump(gene2ncbi, f, indent=2)