# Description

This notebook creates a text file that maps all traits in PhenomeXcan (many of them are from UK Biobank, and a small set of 42 traits are from other studies) to EFO labels. It also adds a category to each trait; for now the only category is `disease` (the field is left empty for traits that are not categorized).

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
from pathlib import Path
from shutil import copyfile

from IPython.display import display
import pandas as pd
import obonet

import conf
from data.cache import read_data
from entity import Trait, GTEXGWASTrait

# Functions

In [3]:
def get_parents(graph, node):
    for t in graph.successors(node):
        yield t


def _is_disease_single_node(node):
    return node == "EFO:0000408"


def is_disease(graph, node):
    if node not in graph.nodes:
        return False

    if _is_disease_single_node(node):
        return True

    for parent_node in get_parents(graph, node):
        if is_disease(graph, parent_node):
            return True

    return False

# Load EFO Ontology

In [5]:
# Location of the EFO ontology in OBO format, taken from the project configuration
url = conf.GENERAL["EFO_ONTOLOGY_OBO_FILE"]
# Parse the OBO file into a graph; later cells query it through `graph.nodes`
# and `graph.successors` (see `is_disease`)
graph = obonet.read_obo(url)

In [6]:
# Number of nodes
len(graph)

26727

In [7]:
# Number of edges
graph.number_of_edges()

52865

In [8]:
assert graph.nodes["EFO:0000270"].get("name") == "asthma"

# Load PhenomeXcan traits

In [9]:
# The trait names are the column labels of the S-MultiXcan z-scores file.
# NOTE(review): `phenomexan` in the variable name looks like a typo of
# `phenomexcan`; the name is kept because later cells reference it.
phenomexan_traits_names = read_data(
    conf.PHENOMEXCAN["SMULTIXCAN_MASHR_ZSCORES_FILE"]
).columns.tolist()

In [10]:
len(phenomexan_traits_names)

4091

In [11]:
phenomexcan_traits = [Trait.get_trait(full_code=t) for t in phenomexan_traits_names]

In [12]:
assert len(phenomexcan_traits) == len(phenomexan_traits_names)

In [13]:
phenomexcan_code_to_full_code = {t.code: t.get_plain_name() for t in phenomexcan_traits}

In [14]:
assert phenomexcan_code_to_full_code["50_raw"] == "50_raw-Standing_height"

# Load UKB to EFO mappings

In [15]:
ukb_to_efo = read_data(conf.UK_BIOBANK["UKBCODE_TO_EFO_MAP_FILE"])

In [16]:
ukb_to_efo.shape

(1565, 7)

In [17]:
ukb_to_efo

Unnamed: 0,ZOOMA QUERY,MAPPED_TERM_LABEL,MAPPED_TERM_URI,MAPPING_TYPE,ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE,COMMENTS/TICKET,AI
0,Vascular disorders of intestine,vascular disease,"EFO_0004264, EFO_0009431",Broad,K55,DONE,
1,Gonarthrosis,osteoarthritis || knee,EFO_0004616,Broad,M17,DONE,
2,Psoriatic and enteropathic arthropathies,psoriatic arthritis,EFO_0003778,? Broad,M07,DONE,
3,Pain associated with micturition,dysuria,EFO_0003901,? Broad,R30,DONE,
4,Other mood,mood disorder,EFO_0004247,? Broad,F38,DONE,
...,...,...,...,...,...,...,...
1560,Candidiasis,"Candidiasis, Invasive",EFO_1001283,Narrow,B37,DONE,
1561,Other bacterial intestinal infections,intestinal disease||bacterial disease,EFO_0009431||EFO_0000771,Broad,A04,DONE,
1562,Viral and other specified intestinal infections,intestinal disease||viral disease,EFO_0009431||EFO_0000763,Broad,A08,DONE,
1563,Other predominantly sexually transmitted disea...,bacterial sexually transmitted disease,EFO_0003955,Narrow,A63,DONE,


In [18]:
# Use short, uniform column names and keep only the columns needed downstream
_column_renames = {
    "ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE": "ukb_code",
    "MAPPED_TERM_LABEL": "term_label",
    "MAPPED_TERM_URI": "term_codes",
    "MAPPING_TYPE": "mapping_type",
}
ukb_to_efo = ukb_to_efo.rename(columns=_column_renames)
ukb_to_efo = ukb_to_efo[["ukb_code", "term_label", "term_codes", "mapping_type"]]

## Add GTEx GWAS EFO terms

In [20]:
all_gtex_gwas_phenos = [
    p for p in phenomexcan_traits if GTEXGWASTrait.is_phenotype_from_study(p.full_code)
]

In [21]:
_tmp = len(all_gtex_gwas_phenos)
display(_tmp)
assert _tmp == 42

42

In [22]:
all_gtex_gwas_phenos[:10]

[MAGNETIC_LDL.C,
 BCAC_Overall_BreastCancer_EUR,
 Astle_et_al_2016_Sum_neutrophil_eosinophil_counts,
 BCAC_ER_negative_BreastCancer_EUR,
 MAGNETIC_IDL.TG,
 Astle_et_al_2016_Eosinophil_counts,
 Astle_et_al_2016_Reticulocyte_count,
 Astle_et_al_2016_Lymphocyte_counts,
 MAGIC_ln_FastingInsulin,
 EAGLE_Eczema]

In [23]:
_old_shape = ukb_to_efo.shape

In [24]:
# Add one row per GTEx GWAS trait, using the trait's full code as `ukb_code`
# and its original EFO id as `term_codes`; the remaining columns are left
# empty (NaN) for these rows.
# `pd.concat` is used instead of `DataFrame.append`, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the result is the same.
# NOTE: this cell is not idempotent (re-running it appends the rows again);
# the row-count assertion a few cells below catches that.
ukb_to_efo = pd.concat(
    [
        ukb_to_efo,
        pd.DataFrame(
            {
                "ukb_code": [ggp.full_code for ggp in all_gtex_gwas_phenos],
                "term_codes": [ggp.orig_efo_id for ggp in all_gtex_gwas_phenos],
            }
        ),
    ],
    ignore_index=True,
)

In [25]:
# Fix wrong EFO codes: overwrite `term_codes` for specific traits.
# Codes are written with "_" here; they are converted to the "EFO:..." form
# later, together with the rest of the column.
for _ukb_code, _fixed_efo_code in (
    ("BCAC_ER_negative_BreastCancer_EUR", "EFO_1000650"),
    ("CARDIoGRAM_C4D_CAD_ADDITIVE", "EFO_0001645"),
):
    idx = ukb_to_efo.loc[ukb_to_efo["ukb_code"] == _ukb_code].index
    ukb_to_efo.loc[idx, "term_codes"] = _fixed_efo_code

In [26]:
# NOTE: rows without EFO codes are intentionally kept at this point; they are
# dropped below with `ukb_to_efo.dropna(subset=["term_codes"])`.

In [27]:
_tmp = ukb_to_efo.shape
display(_tmp)
assert _tmp[0] == _old_shape[0] + 42

(1607, 4)

## Replace values and remove nans

In [28]:
# Normalize term identifiers to the "PREFIX:ID" form used as node ids in the
# ontology graph (e.g. "EFO_0004264" -> "EFO:0004264").
# Keys are applied as regular expressions (regex=True), so "_" replaces every
# underscore found inside a value, not only whole-cell matches.
ukb_to_efo = ukb_to_efo.replace(
    {
        "term_codes": {
            # separators between multiple codes ("||" or ", ") are left as they
            # are (a rule rewriting "||" to ", " used to be here, disabled)
            "_": ":",
            # presumably a code that is missing its separator in the mapping
            # file -- TODO confirm
            "HP0011106": "HP:0011106",
        },
    },
    regex=True,
)

In [29]:
ukb_to_efo = ukb_to_efo.dropna(how="all")

In [30]:
ukb_to_efo = ukb_to_efo.dropna(subset=["term_codes"])

In [31]:
assert ukb_to_efo[ukb_to_efo["term_codes"].isna()].shape[0] == 0

In [32]:
assert ukb_to_efo[ukb_to_efo["term_codes"].str.contains("EFO:")].shape[0] > 0

In [33]:
assert ukb_to_efo[ukb_to_efo["term_codes"].str.contains("HP:")].shape[0] > 0

In [34]:
ukb_to_efo.shape

(1599, 4)

In [35]:
ukb_to_efo.head()

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type
0,K55,vascular disease,"EFO:0004264, EFO:0009431",Broad
1,M17,osteoarthritis || knee,EFO:0004616,Broad
2,M07,psoriatic arthritis,EFO:0003778,? Broad
3,R30,dysuria,EFO:0003901,? Broad
4,F38,mood disorder,EFO:0004247,? Broad


# Add PhenomeXcan code/full code

In [36]:
def _get_fullcode(code):
    """Return the PhenomeXcan full code for `code`, or None if it is unknown.

    The lookup uses the module-level `phenomexcan_code_to_full_code` dictionary.
    """
    return phenomexcan_code_to_full_code.get(code)

In [37]:
ukb_to_efo = ukb_to_efo.assign(ukb_fullcode=ukb_to_efo["ukb_code"].apply(_get_fullcode))

In [38]:
ukb_to_efo.shape

(1599, 5)

In [39]:
ukb_to_efo.head()

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode
0,K55,vascular disease,"EFO:0004264, EFO:0009431",Broad,K55-Diagnoses_main_ICD10_K55_Vascular_disorder...
1,M17,osteoarthritis || knee,EFO:0004616,Broad,M17-Diagnoses_main_ICD10_M17_Gonarthrosis_arth...
2,M07,psoriatic arthritis,EFO:0003778,? Broad,
3,R30,dysuria,EFO:0003901,? Broad,R30-Diagnoses_main_ICD10_R30_Pain_associated_w...
4,F38,mood disorder,EFO:0004247,? Broad,


In [40]:
# remove entries for which we couldn't map a ukb full code
ukb_to_efo = ukb_to_efo.dropna(subset=["ukb_fullcode"])

In [41]:
ukb_to_efo.shape

(1087, 5)

In [42]:
ukb_to_efo.isna().sum()

ukb_code         0
term_label      43
term_codes       0
mapping_type    36
ukb_fullcode     0
dtype: int64

In [43]:
# for these ones we need to query the original EFO ontology
ukb_to_efo[ukb_to_efo["term_label"].isna()]

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode
452,R94,,EFO:0009628,Exact,R94-Diagnoses_main_ICD10_R94_Abnormal_results_...
1031,1170,,EFO:0009817,Exact,1170-Getting_up_in_morning
1123,M75,,EFO:0009470,Broad,M75-Diagnoses_main_ICD10_M75_Shoulder_lesions
1124,O32,,EFO:0009682,Broad,O32-Diagnoses_main_ICD10_O32_Maternal_care_for...
1126,O34,,EFO:0009682,Broad,O34-Diagnoses_main_ICD10_O34_Maternal_care_for...
1141,Z76,,EFO:0009786,Broad,Z76-Diagnoses_main_ICD10_Z76_Persons_encounter...
1142,Z71,,EFO:0009786,Broad,Z71-Diagnoses_main_ICD10_Z71_Persons_encounter...
1156,Z53,,EFO:0009786,Broad,Z53-Diagnoses_main_ICD10_Z53_Persons_encounter...
1566,BCAC_Overall_BreastCancer_EUR,,EFO:0005606,,BCAC_Overall_BreastCancer_EUR
1567,Astle_et_al_2016_Sum_neutrophil_eosinophil_counts,,EFO:0004842,,Astle_et_al_2016_Sum_neutrophil_eosinophil_counts


# Load EFO labels and xrefs

In [44]:
# Dictionary from term id (e.g. "EFO:0009628") to its label. Rows with a
# missing term id or label are dropped before the dictionary is built, so
# every key maps to a non-null label.
term_id_to_label = (
    read_data(conf.GENERAL["TERM_ID_LABEL_FILE"])[["term_id", "label"]]
    .dropna()
    .set_index("term_id")["label"]
    .to_dict()
)

In [45]:
len(term_id_to_label)

26698

In [46]:
# see if efo code with missing label in term_id_to_label is here
assert term_id_to_label["EFO:0009628"] == "abnormal result of function studies"
assert term_id_to_label["EFO:0005606"] == "family history of breast cancer"

In [47]:
assert term_id_to_label["EFO:0004616"] == "osteoarthritis, knee"

In [48]:
# Cross-references of each term: the index holds the term id and `target_id`
# the identifier it refers to. It is used to get the current label for an
# old EFO code (look the old code up in `target_id`, then take the index).
term_id_xrefs = read_data(
    conf.GENERAL["TERM_ID_XREFS_FILE"]
)

In [49]:
term_id_xrefs.dtypes

target_id_type    category
target_id         category
dtype: object

In [50]:
term_id_xrefs.shape

(104094, 2)

In [51]:
term_id_xrefs.head()

Unnamed: 0_level_0,target_id_type,target_id
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
EFO:1001070,DOID,DOID:233
EFO:1001070,ICD9,ICD9:017.30
EFO:1001070,ICD9,ICD9:017.32
EFO:1001070,MESH,MESH:D014392
EFO:1001070,MONDO,MONDO:0006876


In [52]:
# see if for an old efo code we get the current efo label
new_efo_code = term_id_xrefs[term_id_xrefs["target_id"] == "EFO:1000673"].index[0]
display(new_efo_code)
assert term_id_to_label[new_efo_code] == "autoimmune bullous skin disease"

'EFO:0008598'

# Add new EFO label

## Functions

In [53]:
term_pattern = re.compile(r"\w+:\w+")


def _add_term_labels(row):
    term_ids = row["term_codes"]

    matches = term_pattern.findall(term_ids)

    labels = []
    for m in matches:
        if m in term_id_to_label:
            new_label = term_id_to_label[m]
        else:
            other_xrefs = term_id_xrefs[term_id_xrefs["target_id"] == m]
            if other_xrefs.shape[0] == 1:
                new_label = term_id_to_label[other_xrefs.index[0]]
            elif other_xrefs.shape[0] > 1:
                new_label = term_id_to_label[other_xrefs.index[0]]
            elif not pd.isnull(row["term_label"]):
                new_label = row["term_label"]
            else:
                continue

        labels.append(new_label.lower())

    return " AND ".join(labels)


## Apply

In [56]:
ukb_to_efo = ukb_to_efo.assign(
    current_term_label=ukb_to_efo.apply(_add_term_labels, axis=1)
)

In [57]:
ukb_to_efo.shape

(1087, 6)

In [58]:
ukb_to_efo.head()

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label
0,K55,vascular disease,"EFO:0004264, EFO:0009431",Broad,K55-Diagnoses_main_ICD10_K55_Vascular_disorder...,vascular disease AND intestinal disease
1,M17,osteoarthritis || knee,EFO:0004616,Broad,M17-Diagnoses_main_ICD10_M17_Gonarthrosis_arth...,"osteoarthritis, knee"
3,R30,dysuria,EFO:0003901,? Broad,R30-Diagnoses_main_ICD10_R30_Pain_associated_w...,dysuria
5,O60,premature birth,EFO:0003917,? Exact,O60-Diagnoses_main_ICD10_O60_Preterm_delivery,premature birth
7,S64,carpal tunnel syndrome,EFO:0004143,? Narrow,S64-Diagnoses_main_ICD10_S64_Injury_of_nerves_...,carpal tunnel syndrome


# Add categories

Only the `disease` category is added for now.

In [59]:
def _get_disease_category(row):
    """Return "disease" if any term code of the row is a disease, else None.

    Term codes are extracted from `row["term_codes"]` with `term_pattern` and
    checked against the ontology graph with `is_disease`.
    """
    found_terms = term_pattern.findall(row["term_codes"])

    if any(is_disease(graph, term_id) for term_id in found_terms):
        return "disease"

    return None

In [60]:
ukb_to_efo = ukb_to_efo.assign(category=ukb_to_efo.apply(_get_disease_category, axis=1))

In [61]:
ukb_to_efo.head()

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
0,K55,vascular disease,"EFO:0004264, EFO:0009431",Broad,K55-Diagnoses_main_ICD10_K55_Vascular_disorder...,vascular disease AND intestinal disease,disease
1,M17,osteoarthritis || knee,EFO:0004616,Broad,M17-Diagnoses_main_ICD10_M17_Gonarthrosis_arth...,"osteoarthritis, knee",disease
3,R30,dysuria,EFO:0003901,? Broad,R30-Diagnoses_main_ICD10_R30_Pain_associated_w...,dysuria,
5,O60,premature birth,EFO:0003917,? Exact,O60-Diagnoses_main_ICD10_O60_Preterm_delivery,premature birth,
7,S64,carpal tunnel syndrome,EFO:0004143,? Narrow,S64-Diagnoses_main_ICD10_S64_Injury_of_nerves_...,carpal tunnel syndrome,disease


In [62]:
ukb_to_efo[ukb_to_efo["category"] == "disease"].shape

(810, 7)

In [63]:
_tmp = ukb_to_efo[ukb_to_efo["category"] == "disease"]
_tmp["current_term_label"].value_counts()

injury                           9
emotional symptom measurement    8
pregnancy disorder               8
skin disease                     6
bone disease                     6
                                ..
nervous system disease           1
seasonal allergic rhinitis       1
osteonecrosis                    1
hordeolum                        1
jaw disease                      1
Name: current_term_label, Length: 539, dtype: int64

# Testing

In [64]:
# asthma exists
_tmp = ukb_to_efo[ukb_to_efo["current_term_label"].str.lower().str.contains("asthma")]
display(_tmp)
assert _tmp.shape[0] >= 4

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
118,J46,status asthmaticus,EFO:0008590,Exact,J46-Diagnoses_main_ICD10_J46_Status_asthmaticus,status asthmaticus,disease
611,22127,asthma,EFO:0000270,Exact,22127-Doctor_diagnosed_asthma,asthma,disease
681,20002_1111,asthma,EFO:0000270,Exact,20002_1111-Noncancer_illness_code_selfreported...,asthma,disease
1193,J45,asthma,EFO:0000270,Exact,J45-Diagnoses_main_ICD10_J45_Asthma,asthma,disease


In [65]:
# check that, for an old EFO code, `current_term_label` has the current EFO
# label while the original `term_label` is kept unchanged
_tmp = ukb_to_efo[ukb_to_efo["term_codes"].str.contains("EFO:1000673")]
display(_tmp)
_tmp = _tmp.iloc[0]
assert _tmp["term_label"] == "bullous skin disease"
assert _tmp["current_term_label"] == "autoimmune bullous skin disease"

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
1112,20002_1454,bullous skin disease,EFO:1000673,Narrow,20002_1454-Noncancer_illness_code_selfreported...,autoimmune bullous skin disease,


In [66]:
_tmp = ukb_to_efo.isna().sum()
assert _tmp.loc["ukb_fullcode"] == 0
assert _tmp.loc["term_codes"] == 0
assert _tmp.loc["current_term_label"] == 0

In [67]:
# check all nan term labels now have non-empty current labels
_tmp = ukb_to_efo[ukb_to_efo["term_label"].isna()]
display(_tmp)
assert _tmp[_tmp["current_term_label"].isna()].shape[0] == 0
assert _tmp[_tmp["current_term_label"].str.strip().str.len() == 0].shape[0] == 0

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
452,R94,,EFO:0009628,Exact,R94-Diagnoses_main_ICD10_R94_Abnormal_results_...,abnormal result of function studies,
1031,1170,,EFO:0009817,Exact,1170-Getting_up_in_morning,"ease of getting up in the morning, self-reported",
1123,M75,,EFO:0009470,Broad,M75-Diagnoses_main_ICD10_M75_Shoulder_lesions,soft tissue disease,disease
1124,O32,,EFO:0009682,Broad,O32-Diagnoses_main_ICD10_O32_Maternal_care_for...,pregnancy disorder,disease
1126,O34,,EFO:0009682,Broad,O34-Diagnoses_main_ICD10_O34_Maternal_care_for...,pregnancy disorder,disease
1141,Z76,,EFO:0009786,Broad,Z76-Diagnoses_main_ICD10_Z76_Persons_encounter...,encounter with health service,
1142,Z71,,EFO:0009786,Broad,Z71-Diagnoses_main_ICD10_Z71_Persons_encounter...,encounter with health service,
1156,Z53,,EFO:0009786,Broad,Z53-Diagnoses_main_ICD10_Z53_Persons_encounter...,encounter with health service,
1566,BCAC_Overall_BreastCancer_EUR,,EFO:0005606,,BCAC_Overall_BreastCancer_EUR,family history of breast cancer,disease
1567,Astle_et_al_2016_Sum_neutrophil_eosinophil_counts,,EFO:0004842,,Astle_et_al_2016_Sum_neutrophil_eosinophil_counts,eosinophil count,


In [68]:
ukb_to_efo[(ukb_to_efo["current_term_label"].str.strip().str.len() == 0)]

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category


In [69]:
# check there are no null/empty current term labels
assert (
    ukb_to_efo[
        ukb_to_efo["current_term_label"].isna()
        | (ukb_to_efo["current_term_label"].str.strip().str.len() == 0)
    ].shape[0]
    == 0
)

In [70]:
# How many entries have a term_label that differs from current_term_label?
# Note: rows with a missing (NaN) term_label are always included, since a NaN
# never compares equal to a string.
ukb_to_efo[ukb_to_efo["term_label"] != ukb_to_efo["current_term_label"]]

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
0,K55,vascular disease,"EFO:0004264, EFO:0009431",Broad,K55-Diagnoses_main_ICD10_K55_Vascular_disorder...,vascular disease AND intestinal disease,disease
1,M17,osteoarthritis || knee,EFO:0004616,Broad,M17-Diagnoses_main_ICD10_M17_Gonarthrosis_arth...,"osteoarthritis, knee",disease
21,Z85,neoplasm' (covers benign + malignant; otherwis...,EFO:0000616,Broad,Z85-Diagnoses_main_ICD10_Z85_Personal_history_...,neoplasm,disease
27,Z34,pregnancy',EFO:0002950,Broad,Z34-Diagnoses_main_ICD10_Z34_Supervision_of_no...,pregnancy,
67,H52,Abnormality of refraction,HP:0000539,Broad,H52-Diagnoses_main_ICD10_H52_Disorders_of_refr...,abnormality of refraction,
...,...,...,...,...,...,...,...
1602,Astle_et_al_2016_Red_blood_cell_count,,EFO:0004305,,Astle_et_al_2016_Red_blood_cell_count,erythrocyte count,
1603,SSGAC_Depressive_Symptoms,,EFO:0007006,,SSGAC_Depressive_Symptoms,depressive symptom measurement,disease
1604,BCAC_ER_positive_BreastCancer_EUR,,EFO:1000649,,BCAC_ER_positive_BreastCancer_EUR,estrogen-receptor positive breast cancer,disease
1605,IBD.EUR.Inflammatory_Bowel_Disease,,EFO:0003767,,IBD.EUR.Inflammatory_Bowel_Disease,inflammatory bowel disease,disease


In [71]:
# How many entries comprise more than one EFO code?
# (detected through the " AND " separator that joins their labels)
_tmp = ukb_to_efo[ukb_to_efo["current_term_label"].str.contains(" AND ")]
display(_tmp.shape)
display(_tmp)

(46, 7)

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
0,K55,vascular disease,"EFO:0004264, EFO:0009431",Broad,K55-Diagnoses_main_ICD10_K55_Vascular_disorder...,vascular disease AND intestinal disease,disease
162,E23,"hypopituitarism, pituitary gland disease","EFO:0001380, EFO:0009607",Exact,E23-Diagnoses_main_ICD10_E23_Hypofunction_and_...,hypopituitarism AND pituitary gland disease,disease
346,I80,phlebitis || Thrombophlebitis,"EFO:1001395, HP:0004418",Exact,I80-Diagnoses_main_ICD10_I80_Phlebitis_and_thr...,phlebitis AND thrombophlebitis,disease
584,3404,shoulder pain || neck pain,"HP:0030834, HP:0030833",Broad,3404-Neckshoulder_pain_for_3_months,shoulder pain AND neck pain,
590,3799,headache disorder || headache,"EFO:0009550, HP:0002315",Exact,3799-Headaches_for_3_months,headache disorder AND headache,disease
693,20002_1139,esophagitis || Barrett's esophagus,"HP:0100633, EFO:0000280",Exact,20002_1139-Noncancer_illness_code_selfreported...,esophagitis AND barrett's esophagus,disease
701,20002_1162,gallstones || cholelithiasis,"EFO:0004210, EFO:0004799",Exact,20002_1162-Noncancer_illness_code_selfreported...,gallstones AND cholelithiasis,disease
708,20002_1197,nephrolithiasis || ureterolithiasis || bladd...,"EFO:0004253, EFO:1001228, EFO:1000839",Exact,20002_1197-Noncancer_illness_code_selfreported...,nephrolithiasis AND ureterolithiasis AND bladd...,disease
715,20002_1225,hyperthyroidism || thyrotoxicosis,"EFO:0009189, EFO:0009190",Exact,20002_1225-Noncancer_illness_code_selfreported...,hyperthyroidism AND thyrotoxicosis,disease
716,20002_1226,hypothyroidism || myxedema,"EFO:0004705, EFO:1001055",Exact,20002_1226-Noncancer_illness_code_selfreported...,hypothyroidism AND myxedema,disease


In [72]:
ukb_to_efo["mapping_type"].value_counts()

Exact       623
Broad       379
Narrow       38
?             7
Narrow?       1
? Broad       1
? Narrow      1
? Exact       1
Name: mapping_type, dtype: int64

In [73]:
ukb_to_efo[ukb_to_efo["mapping_type"] == "Exact"]["current_term_label"].value_counts()

emphysema                                                    4
cardiac arrhythmia                                           4
hearing loss                                                 3
asthma                                                       3
joint disease                                                3
                                                            ..
laryngeal disease                                            1
infectious mononucleosis AND epstein-barr virus infection    1
intestinal disease                                           1
celiac disease                                               1
breast hypertrophy                                           1
Name: current_term_label, Length: 516, dtype: int64

In [74]:
ukb_to_efo["current_term_label"].value_counts()

injury                           9
emotional symptom measurement    8
pregnancy disorder               8
encounter with health service    7
checkup                          7
                                ..
smoking behavior                 1
viral meningitis                 1
bilateral oophorectomy           1
abortion                         1
breast hypertrophy               1
Name: current_term_label, Length: 743, dtype: int64

In [75]:
ukb_to_efo[ukb_to_efo["current_term_label"] == "emotional symptom measurement"]

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
914,1930,emotional symptom measurement,EFO:0007803,Broad,1930-Miserableness,emotional symptom measurement,disease
915,1950,emotional symptom measurement,EFO:0007803,Broad,1950-Sensitivity_hurt_feelings,emotional symptom measurement,disease
916,1960,emotional symptom measurement,EFO:0007803,Broad,1960-Fedup_feelings,emotional symptom measurement,disease
917,1970,emotional symptom measurement,EFO:0007803,Broad,1970-Nervous_feelings,emotional symptom measurement,disease
918,2030,emotional symptom measurement,EFO:0007803,Broad,2030-Guilty_feelings,emotional symptom measurement,disease
974,2050,emotional symptom measurement,EFO:0007803,Broad,2050-Frequency_of_depressed_mood_in_last_2_weeks,emotional symptom measurement,disease
975,2060,emotional symptom measurement,EFO:0007803,Broad,2060-Frequency_of_unenthusiasm_disinterest_in_...,emotional symptom measurement,disease
1058,4526,emotional symptom measurement,EFO:0007803,Broad,4526-Happiness,emotional symptom measurement,disease


In [76]:
ukb_to_efo[ukb_to_efo["ukb_fullcode"].duplicated(False)]

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
1311,T79,complication,EFO:0009888,Broad,T79-Diagnoses_main_ICD10_T79_Certain_early_com...,trauma complication,disease
1329,T79,complication,EFO:0009518,Broad,T79-Diagnoses_main_ICD10_T79_Certain_early_com...,complication,disease
1441,M48,-,EFO:0004260,Broad,M48-Diagnoses_main_ICD10_M48_Other_spondylopat...,bone disease,disease
1442,M53,-,EFO:0009676,Broad,M53-Diagnoses_main_ICD10_M53_Other_dorsopathie...,musculoskeletal system disease,disease
1443,M48,spondyloarthropathy,EFO:0004260,Broad,M48-Diagnoses_main_ICD10_M48_Other_spondylopat...,bone disease,disease
1445,M53,back pain,EFO:0009676,Broad,M53-Diagnoses_main_ICD10_M53_Other_dorsopathie...,musculoskeletal system disease,disease


In [77]:
ukb_to_efo[ukb_to_efo["ukb_fullcode"].duplicated(False)]["ukb_fullcode"].tolist()

['T79-Diagnoses_main_ICD10_T79_Certain_early_complications_of_trauma_not_elsewhere_classified',
 'T79-Diagnoses_main_ICD10_T79_Certain_early_complications_of_trauma_not_elsewhere_classified',
 'M48-Diagnoses_main_ICD10_M48_Other_spondylopathies',
 'M53-Diagnoses_main_ICD10_M53_Other_dorsopathies_not_elsewhere_classified',
 'M48-Diagnoses_main_ICD10_M48_Other_spondylopathies',
 'M53-Diagnoses_main_ICD10_M53_Other_dorsopathies_not_elsewhere_classified']

In [78]:
# Fix duplicated ukb_fullcode entries with different current_term_labels:
# the T79 trait is listed twice, with the labels "trauma complication" and
# "complication"; both rows get the same label so that this ukb_fullcode
# maps to a single current_term_label.
idx = ukb_to_efo[
    ukb_to_efo["ukb_fullcode"]
    == "T79-Diagnoses_main_ICD10_T79_Certain_early_complications_of_trauma_not_elsewhere_classified"
].index
ukb_to_efo.loc[idx, "current_term_label"] = "complication"

In [79]:
assert ukb_to_efo[ukb_to_efo["mapping_type"] == "Exact"]["ukb_fullcode"].is_unique

In [80]:
# Each trait (ukb_fullcode) must map to a single current_term_label.
# `_tmp` holds the distinct (ukb_fullcode, current_term_label) pairs; since
# group keys are unique by construction, checking the pairs for duplicates
# could never fail, so the check is done on `ukb_fullcode` alone: a repeated
# full code here means the trait has more than one label.
_tmp = (
    ukb_to_efo.groupby(["ukb_fullcode", "current_term_label"])
    .count()
    .reset_index()[["ukb_fullcode", "current_term_label"]]
)
assert not _tmp[
    "ukb_fullcode"
].duplicated().any(), "some ukb_fullcode values map to more than one current_term_label"

# Save

## In main data folder

In [81]:
outfile = conf.PHENOMEXCAN["TRAITS_FULLCODE_TO_EFO_MAP_FILE"]
display(outfile)

ukb_to_efo.to_csv(outfile, sep="\t", index=False)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/phenomexcan_traits_fullcode_to_efo.tsv')

## In libs/data folder

Since this file (`outfile`) is used by the `Trait` class to return EFO codes/labels for PhenomeXcan traits, it is also copied to a source code folder, where it is supposed to be kept under version control.

In [None]:
display(Trait.UKB_TO_EFO_MAP_FILE)

In [None]:
copyfile(
    outfile,
    Trait.UKB_TO_EFO_MAP_FILE,
)