In [1]:
import os
import re
import pandas as pd
import numpy as np
import requests
import time

In [2]:
sig_meta_res_df = pd.read_excel("../data/meta/monogenic_meta.xlsx")


In [38]:
def get_enrich_type(ser):
    enrich_type = 0
    if ser.p_value<0.05:
        if ser.OR>=1:
            enrich_type = 1
        elif ser.OR<1:
            enrich_type = 2
        else:
            raise ValueError
    return enrich_type

# def get_enrich_type(ser):
#     enrich_type = 0
    
#     if ser.OR>=1:
#         enrich_type = "higher"
#         if ser.p_value<0.05:
#             enrich_type = "enriched"
#     elif ser.OR<1:
#         enrich_type = "lower"
#         if ser.p_value<0.05:
#             enrich_type = "depleted"
#     else:
#         raise ValueError
#     return enrich_type

def get_highest_effect(beta):
    if all(beta<0):
        return min(beta)
    elif all(beta>0):
        return max(beta)
    else:
        raise ValueError
    return

def get_enrich_status(etype):
    etype = np.array([e for e in etype if e!=""])
    if len(np.unique(etype))>1:
        raise ValueError
    if len(etype)==0:
        return ""
    return etype[0]

def get_genes(ID):
    pattern = re.compile("(.+)\.(PTV.*)\.0\.001")
    m = re.match(pattern, ID)
    if not m:
        print(ID)
    gene = m.group(1)
    return gene

bmi_cat_df = pd.read_excel("../data/bmi_cat_enrichment/monogenic_bmi_categories.xlsx")
bmi_cat_df["p_sig"] = bmi_cat_df.apply(get_enrich_type, axis=1)
bmi_cat_df = bmi_cat_df.pivot_table(index="ID", columns="comorbidity", values="p_sig", aggfunc='first').reset_index().replace({0: "", 1: "enriched", 2: "depleted"})
# bmi_cat_df.columns = ["ID"] + list(bmi_cat_df.columns.get_level_values(1)[1:])
bmi_cat_df = sig_meta_res_df.loc[:, ["ID", "beta"]].merge(bmi_cat_df.loc[:, ["ID", "nu", "ovw", "ob", "sob"]], on="ID")
bmi_cat_df["gene"] = bmi_cat_df.ID.apply(get_genes)
bmi_cat_df = bmi_cat_df.groupby("gene").agg({"beta": get_highest_effect, "nu": get_enrich_status, "ovw": get_enrich_status, "ob": get_enrich_status, "sob": get_enrich_status}).sort_values("beta").reset_index()


In [39]:
bmi_cat_df.head()

Unnamed: 0,gene,beta,nu,ovw,ob,sob
0,DCUN1D3,-3.842606,enriched,,,
1,NEUROD6,-3.024546,enriched,,,
2,SH3GL2,-2.339171,enriched,depleted,,
3,AQP3,-2.055961,enriched,,depleted,depleted
4,RABEP1,-1.709539,,,depleted,


In [11]:
bmi_cat_df = pd.read_excel("../data/manual_lit/monogenic_lit_review.xlsx")

In [14]:
bmi_cat_df = pd.read_excel("../data/manual_lit/monogenic_lit_review.xlsx")

known_obesity_genes = [
    "ZBTB7B","ACHE","RAPGEF3","PRKAG1","RAB21","KSR2","HIP1R","ZFHX3","GIPR", "MC4R", "ENTPD6", "ZFR2","ZNF169","SLC6A17","MAP1A","ALDH3A1","ANGPTL7","ZNF169", #turcot
    "PHIP", "DGKI", "ZMYM4", #marenne
    # "ADCY3", "AGRP", "BDNF", "KSR2", "LEP", "LEPR", "MRAP2", "NTRK2", "PCSK1", "PHIP", "SH2B1"," POMC", "SIM1", #loos 
    "UHMK1", "GPR75", "ROBO1", "KIAA1109","PCSK1","GPR151","SPARC", "UBR2", "CALCR","PDE3B","ANO4","KIAA0586", "MC4R", "DPP9","ANKRD27", "UBR2", "GIPR", #akbari 
    "SLTM", "MC4R", "PCSK1", "UBR2", "KIAA1109", "BSN", "APBA1", "TOX4", "ATP13A1", # zhao 
    "DIDO1", "KIAA1109", "MC4R", "PTPRG", "SLC12A5", "MC4R", "SLTM" # Kaisinger
]

bmi_cat_df["rvas"] = bmi_cat_df.gene.isin(set(known_obesity_genes))

In [22]:
gwas_genes = pd.read_csv("../data/known_genes/gwas_genes.txt", header=None)
gwas_genes = set(gwas_genes.iloc[:, 0])

In [23]:
bmi_cat_df["rvas"] = bmi_cat_df.gene.isin(known_obesity_genes)
bmi_cat_df["gwas"] = bmi_cat_df.gene.isin(gwas_genes)

In [79]:
def get_gtex_top_tissue_exp(gene_symbol):
    time.sleep(2)
    # Step 1: Convert gene symbol to Ensembl gene ID
    gene_symbol_url = f"https://gtexportal.org/api/v2/reference/gene?geneId={gene_symbol}&datasetId=gtex_v8"
    response = requests.get(gene_symbol_url)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        if data['data']:
            gene_id = data['data'][0]['gencodeId']
            # print(f"Ensembl Gene ID for {gene_symbol}: {gene_id}")

            # Step 2: Use Ensembl gene ID to get expression data
            expression_url = f"https://gtexportal.org/api/v2/expression/medianGeneExpression?datasetId=gtex_v8&gencodeId={gene_id}"
            expression_response = requests.get(expression_url)

            if expression_response.status_code == 200:
                expression_data = expression_response.json()
                data_dict = {"median": [], "tissueSiteDetailId": [], "ontologyId": [], "datasetId": [], "gencodeId": [], "geneSymbol": [], "unit": []}
                if "data" in expression_data:
                    for ddict in expression_data["data"]:
                        for k,v in ddict.items():
                            data_dict[k].append(v)
                df = pd.DataFrame(data_dict)
                top_tissues = "|".join(df.sort_values("median", ascending=False).head().tissueSiteDetailId)
            else:
                print(f"Error retrieving expression data: {expression_response.status_code}")
                top_tissues = "Exp data retrieval error"
        else:
            print(f"No gene found for symbol {gene_symbol}")
            top_tissues = "No gene symbol data"
    else:
        print(f"Error retrieving gene ID: {response.status_code}")
        top_tissues = "Gene data retrieval error"
    return top_tissues


def get_pathway_ncbi_api(gene_symbol):
    time.sleep(5)
    # Step 1: Get gene pathways
    response = requests.get(f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/gene/genesymbol/{gene_symbol}/pwaccs/json")
    if response.status_code == 200:
        gene_response = response.json()
        if "InformationList" in gene_response:
            gene_pathways = gene_response["InformationList"]["Information"][0]["PathwayAccession"]
            pathway_names = []
            for pathway in gene_pathways:
                pathway_response = requests.get(f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/pathway/{pathway}/json")
                if pathway_response.status_code==200:
                    pathway_response = pathway_response.json()
                    pathway_name = pathway_response["Record"]["RecordTitle"]
                    pathway_names.append(pathway_name)
                else:
                    print(gene_symbol, pathway)
                time.sleep(3)
            pathway_names = "|".join(pathway_names)
        else:
            print(gene_symbol)
            pathway_names = "Gene symbol invalid response"
    else:
        print(gene_symbol)
        pathway_names = "Gene symbol not in pug"
    return pathway_names

In [47]:
bmi_cat_df["gtex"] = bmi_cat_df.gene.apply(get_gtex_top_tissue_exp)

No gene found for symbol VIRMA


In [80]:
bmi_cat_df["pathways"] = bmi_cat_df.gene.apply(get_pathway_ncbi_api)

NEUROD6
VIRMA
TSPAN4
IQCF6
KCTD7
CDC42EP4
ZNF627
GLOD4
SCYL2
KLHL36
MTFP1
ALG1L2
GRXCR1
BAIAP3
MICU2
ZNF189
PROM2
ADAL
TMEM128
LRRC31
AHNAK
GHDC
PRR14
KRT23
CPA6
OR13G1
OR5J2
NWD1
MINDY4
MAMDC4
PCDHA3
SNAP47
PAM
JPH3
ATN1
ZFHX2
ODF3L2
ZG16
C1orf174
NACC2
MACROD1
LONRF2
ADNP
FAM171A2
NTNG2
TMEM229A
ZFC3H1


In [3]:
bmi_cat_df.to_excel("../data/manual_lit/monogenic_lit_review.xlsx", index=False)

In [2]:
bmi_cat_df = pd.read_excel("../data/manual_lit/monogenic_lit_review.xlsx")

In [22]:
#IMPC mouse API: https://www.mousephenotype.org/help/programmatic-data-access/

def get_mouse_phenotypes(gene_symbol):
    time.sleep(5)
    gene_symbol=gene_symbol.lower()
    gene_symbol=gene_symbol[0].upper()+gene_symbol[1:]

    response = requests.get(f"https://www.ebi.ac.uk/mi/impc/solr/genotype-phenotype/select?q=marker_symbol:{gene_symbol}")
    if response.status_code == 200:
        gene_response = response.json()
        if gene_response['response']['numFound']>0:
            mp_terms = []
            for rdict in gene_response['response']['docs']:
                mp_terms.append(rdict["mp_term_name"])
            mp_terms = set(mp_terms)
            mp_terms = "|".join(mp_terms)
        elif gene_response['response']['numFound']==0:
            mp_terms = "Mouse not phenotyped"
    else:
        print(gene_symbol)
        mp_terms = "Invalid response"
    return mp_terms





In [24]:
bmi_cat_df["mouse_pheno"] = bmi_cat_df.gene.apply(get_mouse_phenotypes)

In [25]:
bmi_cat_df.to_excel("../data/manual_lit/monogenic_lit_review.xlsx")

In [2]:
bmi_cat_df = pd.read_excel("../data/manual_lit/monogenic_lit_review.xlsx")

In [25]:
#IMPC cardio_metabolic API: https://md.hugeamp.org/project.html?project=lunaris

def get_cardio_metabolic_phenotypes(gene_symbol):
    time.sleep(1.5)

    response = requests.get(f"https://bioindex.hugeamp.org/api/bio/query/huge?q={gene_symbol}")
    if response.status_code == 200:
        gene_response = response.json()
        if gene_response['count']>0:
            mp_terms = {"gene": [], "phenotype": [], "bf_common": [], "bf_rare": [], "huge": []}
            for rdict in gene_response['data']:
                mp_terms["gene"].append(rdict["gene"])
                mp_terms["bf_common"].append(rdict["bf_common"])
                mp_terms["bf_rare"].append(rdict["bf_rare"])
                mp_terms["huge"].append(rdict["huge"])
                mp_terms["phenotype"].append(rdict["phenotype"])

            assert len(mp_terms["phenotype"])==gene_response['count']
        elif gene_response['count']==0:
            print(gene_symbol)
            mp_terms = {"gene": [gene_symbol], "bf_common": [0], "bf_rare": [0], "huge": [0]}
    else:
        print(gene_symbol)
        mp_terms = {"gene": [gene_symbol], "bf_common": [pd.NA], "bf_rare": [pd.NA], "huge": [pd.NA]}
    
    mp_terms = pd.DataFrame(mp_terms)
    return mp_terms



In [30]:
dfs = list(map(get_cardio_metabolic_phenotypes, bmi_cat_df.gene.values))

ODF3L2


In [31]:
cmd_df = pd.concat(dfs)

In [32]:
cmd_df.to_csv("../data/functional_enrichment/cmd/all_hige_scores.csv.gz")

In [4]:
cmd_df = pd.read_csv("../data/functional_enrichment/cmd/all_hige_scores.csv.gz", index_col=0)

In [8]:
cmd_df.loc[cmd_df.gene=="SNAP47"].head(10)

Unnamed: 0,gene,phenotype,bf_common,bf_rare,huge
0,SNAP47,TB-LM,45,1.389135,62.511068
1,SNAP47,CTS,45,1.0,45.0
2,SNAP47,NeutroPerc,45,1.0,45.0
3,SNAP47,TL,45,1.0,45.0
4,SNAP47,ThumbOA,45,1.0,45.0
5,SNAP47,HEIGHT,20,1.0,20.0
6,SNAP47,eBMD,20,1.0,20.0
7,SNAP47,TGtoHDL,3,1.0,3.0
8,SNAP47,SHBGadjBMI,3,1.0,3.0
9,SNAP47,HDL,3,1.0,3.0


In [41]:
bmi_huge_scores = cmd_df.loc[(cmd_df.phenotype=="BMI"), ["gene", "huge"]].set_index("gene").huge.to_dict()

In [42]:
bmi_cat_df["BMI_HuGE_score"] = bmi_cat_df.gene.map(bmi_huge_scores)

In [46]:
cmd_dict = cmd_df.loc[cmd_df.huge>=10].groupby("gene").agg({"phenotype": lambda x: "|".join(x)}).phenotype.to_dict()

In [47]:
bmi_cat_df["HuGE_Strong_phenotypes"] = bmi_cat_df.gene.map(cmd_dict)

In [21]:
bmi_cat_df

Unnamed: 0,gene,beta,nu,ovw,ob,sob,rvas,gwas,gtex,pathways,mouse_pheno,BMI_HuGE_score,HuGE_Strong_phenotypes,CMDKP
0,DCUN1D3,-3.842606,enriched,,,,False,0,Adipose_Visceral_Omentum|Nerve_Tibial|Adipose_...,Metabolism of proteins|Post-translational prot...,thrombocytopenia|decreased circulating HDL cho...,1.000000,eGFRcreaNoDiabetes,False
1,NEUROD6,-3.024546,enriched,,,,False,0,Brain_Frontal_Cortex_BA9|Brain_Cortex|Brain_An...,Gene symbol not in pug,Mouse not phenotyped,1.000000,,False
2,SH3GL2,-2.339171,enriched,depleted,,,False,0,Brain_Cerebellar_Hemisphere|Brain_Cerebellum|B...,EPHB forward signaling|Internalization of ErbB...,decreased grip strength|decreased exploration ...,1.000000,Parkinsons|HIPKNEEOA,False
3,AQP3,-2.055961,enriched,,depleted,depleted,False,0,Esophagus_Mucosa|Vagina|Skin_Not_Sun_Exposed_S...,Kidney Function|Kidney Function - Collecting D...,increased circulating alkaline phosphatase lev...,1.000000,Eczema,False
4,RABEP1,-1.709539,,,depleted,,False,1,Cells_EBV-transformed_lymphocytes|Brain_Spinal...,Membrane Trafficking|Vesicle-mediated transpor...,Mouse not phenotyped,3.000000,HEIGHT|SHBGadjBMI|CHOL|TG|nonHDL|MeanSpheredVo...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,UBR2,2.003448,depleted,,enriched,enriched,True,1,Nerve_Tibial|Uterus|Thyroid|Brain_Cerebellar_H...,Primary ovarian insufficiency,"preweaning lethality, incomplete penetrance|sp...",429.884397,BMI|HYPERTENSION|MI|BFP|WEIGHT|CRP|Thyroid|Neu...,True
117,APBA1,2.268723,depleted,,enriched,enriched,True,0,Brain_Cerebellum|Brain_Cerebellar_Hemisphere|B...,Neurotransmitter release cycle|Neurotransmitte...,Mouse not phenotyped,1.000000,Ap-LM|CogPerformance|ImReticuloFrac,False
118,BSN,2.689351,depleted,depleted,enriched,enriched,True,1,Brain_Cerebellum|Brain_Cerebellar_Hemisphere|B...,Sensory processing of sound|Sensory processing...,Mouse not phenotyped,20.000000,HighScatReticuloPerc|ReticuloPerc|HYPERTENSION...,True
119,MC4R,3.032234,depleted,depleted,enriched,enriched,True,1,Brain_Hypothalamus|Brain_Caudate_basal_ganglia...,Syndecan-3-mediated signaling events|Antipsych...,Mouse not phenotyped,6960.000000,BFP|WEIGHT|T2D|TB-LM|BMI|Ap-LM|Stroke_hemorrha...,True


In [18]:
bmi_cat_df["CMDKP"] = bmi_cat_df.gene.isin(bmi_cat_df.loc[((bmi_cat_df.BMI_HuGE_score>=10)|(bmi_cat_df.HuGE_Strong_phenotypes.str.contains("BMI|Obesity"))), "gene"])

In [24]:
bmi_cat_df

Unnamed: 0,gene,beta,nu,ovw,ob,sob,rvas,gwas,gtex,pathways,mouse_pheno,BMI_HuGE_score,HuGE_Strong_phenotypes,CMDKP
0,DCUN1D3,-3.842606,enriched,,,,False,False,Adipose_Visceral_Omentum|Nerve_Tibial|Adipose_...,Metabolism of proteins|Post-translational prot...,thrombocytopenia|decreased circulating HDL cho...,1.000000,eGFRcreaNoDiabetes,False
1,NEUROD6,-3.024546,enriched,,,,False,False,Brain_Frontal_Cortex_BA9|Brain_Cortex|Brain_An...,Gene symbol not in pug,Mouse not phenotyped,1.000000,,False
2,SH3GL2,-2.339171,enriched,depleted,,,False,False,Brain_Cerebellar_Hemisphere|Brain_Cerebellum|B...,EPHB forward signaling|Internalization of ErbB...,decreased grip strength|decreased exploration ...,1.000000,Parkinsons|HIPKNEEOA,False
3,AQP3,-2.055961,enriched,,depleted,depleted,False,False,Esophagus_Mucosa|Vagina|Skin_Not_Sun_Exposed_S...,Kidney Function|Kidney Function - Collecting D...,increased circulating alkaline phosphatase lev...,1.000000,Eczema,False
4,RABEP1,-1.709539,,,depleted,,False,True,Cells_EBV-transformed_lymphocytes|Brain_Spinal...,Membrane Trafficking|Vesicle-mediated transpor...,Mouse not phenotyped,3.000000,HEIGHT|SHBGadjBMI|CHOL|TG|nonHDL|MeanSpheredVo...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,UBR2,2.003448,depleted,,enriched,enriched,True,True,Nerve_Tibial|Uterus|Thyroid|Brain_Cerebellar_H...,Primary ovarian insufficiency,"preweaning lethality, incomplete penetrance|sp...",429.884397,BMI|HYPERTENSION|MI|BFP|WEIGHT|CRP|Thyroid|Neu...,True
117,APBA1,2.268723,depleted,,enriched,enriched,True,False,Brain_Cerebellum|Brain_Cerebellar_Hemisphere|B...,Neurotransmitter release cycle|Neurotransmitte...,Mouse not phenotyped,1.000000,Ap-LM|CogPerformance|ImReticuloFrac,False
118,BSN,2.689351,depleted,depleted,enriched,enriched,True,True,Brain_Cerebellum|Brain_Cerebellar_Hemisphere|B...,Sensory processing of sound|Sensory processing...,Mouse not phenotyped,20.000000,HighScatReticuloPerc|ReticuloPerc|HYPERTENSION...,True
119,MC4R,3.032234,depleted,depleted,enriched,enriched,True,True,Brain_Hypothalamus|Brain_Caudate_basal_ganglia...,Syndecan-3-mediated signaling events|Antipsych...,Mouse not phenotyped,6960.000000,BFP|WEIGHT|T2D|TB-LM|BMI|Ap-LM|Stroke_hemorrha...,True


In [25]:
bmi_cat_df.to_excel("../data/manual_lit/monogenic_lit_review.xlsx", index=False)

In [26]:
pubmed_df = pd.read_excel("../data/manual_lit/pubmed_search.xlsx")

In [27]:
pubmed_num_dict = pubmed_df.set_index("gene").nrefs.to_dict()

In [29]:
bmi_cat_df["pubmed_hits_max50"] =  bmi_cat_df.gene.map(pubmed_num_dict)

In [None]:
bmi_cat_df["impc"] =  bmi_cat_df.gene.isin(bmi_cat_df.loc[bmi_cat_df.mouse_pheno.str.contains("glucose tolerance|lean body mass|body fat amount"), "gene"])

In [32]:
bmi_cat_df.to_excel("../data/manual_lit/monogenic_lit_review.xlsx", index=False)