# DrugBank and MeSH Normalization
In this notebook, we evaluate our normalizer with the two reference databases DrugBank and NCBI MeSH.

In [1]:
import sys
# Extend the module search path with the directory two levels above this notebook,
# presumably the repository root holding the `preon` package — confirm against the repo layout.
sys.path.append('../../')

In [2]:
from preon.normalization import PrecisionOncologyNormalizer
from preon.drug import load_drugbank_drugs, load_charite_drug_goldstandard, load_database_drug_goldstandard, load_ctg_drug_goldstandard
from preon.cancer import download_or_load_mesh_cancers, load_database_cancer_goldstandard, load_ncbi_cancer_goldstandard
from preon.tests.utils import precision_score, recall_score, f1_score

['Abdominal Neoplasms', 'Plasmablastic Lymphoma', 'Mammary Analogue Secretory Carcinoma', 'Unilateral Breast Neoplasms', 'Giant Cell Tumor of Tendon Sheath']
['MESH:D000008', 'MESH:D000069293', 'MESH:D000069295', 'MESH:D000069584', 'MESH:D000070779']


In [3]:
import numpy as np
import pandas as pd
import daproli as dp

Let's first load the reference drug names from DrugBank and fit the normalizer.

In [4]:
# Load the DrugBank reference data: the loader yields two values, the drug names
# and their matching ids (presumably DrugBank identifiers).
drugbank_reference = load_drugbank_drugs()
drug_names, db_ids = drugbank_reference

# Create a fresh normalizer and keep whatever `fit` returns as the fitted instance.
normalizer = PrecisionOncologyNormalizer()
normalizer = normalizer.fit(drug_names, db_ids)

Now, we can evaluate it using the provided gold standards.

In [5]:
# Drug gold standards as (label, loader) pairs. The loaders are stored uncalled,
# so no data is read until the evaluation loop invokes them.
goldstandards = list({
    "charite": load_charite_drug_goldstandard,
    "database": load_database_drug_goldstandard,
    "ctg": load_ctg_drug_goldstandard,
}.items())

In [6]:
# Evaluate the fitted normalizer on every drug gold standard and print one
# summary line (precision / recall / F1, rounded to two decimals) per dataset.
# The gold-standard data is bound to loop-local names so the reference
# `drug_names` / `db_ids` used for fitting are not silently overwritten.
for dataset_name, load_dataset in goldstandards:
    # Each loader returns three values; the middle one is not needed here.
    gold_names, _, gold_ids = load_dataset()
    df_eval = normalizer.evaluate(gold_names, gold_ids)

    precision = np.round(precision_score(df_eval), 2)
    recall = np.round(recall_score(df_eval), 2)
    f1 = np.round(f1_score(df_eval), 2)
    print(f"{dataset_name}: precision_score={precision} recall_score={recall} f1_score={f1}")

charite: precision_score=0.97 recall_score=0.76 f1_score=0.85
database: precision_score=1.0 recall_score=0.94 f1_score=0.97
ctg: precision_score=0.97 recall_score=0.88 f1_score=0.92


Let's now load the reference cancer types from NCBI MeSH and fit the normalizer.

In [7]:
# Load the MeSH cancer reference data: the loader yields two values, the cancer
# type names and their matching ids (presumably MeSH identifiers).
mesh_reference = download_or_load_mesh_cancers()
cancer_types, mesh_ids = mesh_reference

# Rebind `normalizer` to a new instance fitted on the cancer vocabulary;
# the previously fitted normalizer is no longer reachable under this name.
normalizer = PrecisionOncologyNormalizer()
normalizer = normalizer.fit(cancer_types, mesh_ids)

Now, we can evaluate it using the provided gold standards.

In [8]:
# Cancer gold standards as (label, loader) pairs. This rebinds `goldstandards`;
# the loaders are stored uncalled and invoked later by the evaluation loop.
goldstandards = list({
    "database": load_database_cancer_goldstandard,
    "ncbi": load_ncbi_cancer_goldstandard,
}.items())

In [9]:
# Evaluate the fitted normalizer on every cancer gold standard and print one
# summary line (precision / recall / F1, rounded to two decimals) per dataset.
# The gold-standard data is bound to loop-local names so the reference
# `cancer_types` / `mesh_ids` used for fitting are not silently overwritten.
for dataset_name, load_dataset in goldstandards:
    # Each loader returns three values; the middle one is not needed here.
    gold_types, _, gold_ids = load_dataset()
    # NOTE(review): n_grams=3 is passed only for the cancer evaluation; the
    # rationale for this value is not documented in this notebook.
    df_eval = normalizer.evaluate(gold_types, gold_ids, n_grams=3)

    precision = np.round(precision_score(df_eval), 2)
    recall = np.round(recall_score(df_eval), 2)
    f1 = np.round(f1_score(df_eval), 2)
    print(f"{dataset_name}: precision_score={precision} recall_score={recall} f1_score={f1}")

database: precision_score=0.36 recall_score=0.57 f1_score=0.44
ncbi: precision_score=0.45 recall_score=0.34 f1_score=0.39
