# Description

This notebook builds the gold standard for drug-disease prediction using [PharmacotherapyDB](https://dx.doi.org/10.7554%2FeLife.26726).

Instead of using all drug-disease pairs in PharmacotherapyDB, we only use disease-modifying (DM) pairs as positive cases and non-indications (NOT) as negative ones. We exclude symptomatic (SYM) pairs because those treatments might not have an important effect on the disease itself.

# Modules loading

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import pandas as pd

import conf

# Settings

In [None]:
# Directory where this notebook writes its results, taken from the project
# configuration (assumes the configured value is a pathlib.Path, since
# `.mkdir` is called on it directly).
OUTPUT_DIR = conf.RESULTS["DRUG_DISEASE_ANALYSES"]
display(OUTPUT_DIR)
# Create the directory (and any missing parents); a no-op if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# PharmacotherapyDB: load gold standard

## Read data

In [21]:
# FIXME: add download of this data in setup data
# Absolute location of the PharmacotherapyDB indications table inside the
# project's data directory.
input_file = (
    Path(conf.DATA_DIR)
    .joinpath("hetionet", "pharmacotherapydb-v1.0", "indications.tsv")
    .resolve()
)
display(input_file)

# The indications file is tab-separated.
pharmadb_gold_standard = pd.read_csv(input_file, sep="\t")

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/hetionet/pharmacotherapydb-v1.0/indications.tsv')

In [None]:
# Dimensions (rows, columns) of the raw indications table.
pharmadb_gold_standard.shape

In [None]:
# Preview the first rows to check which columns were read from the file.
pharmadb_gold_standard.head()

In [None]:
# Number of distinct values in `doid_id` (the disease/trait identifier column).
pharmadb_gold_standard["doid_id"].unique().shape

In [None]:
# Number of distinct values in `drugbank_id` (the drug identifier column).
pharmadb_gold_standard["drugbank_id"].unique().shape

## Build gold standard

In [None]:
# Number of drug-disease pairs per `category` label; only the "DM" and "NOT"
# categories are kept when the gold standard is built.
pharmadb_gold_standard["category"].value_counts()

In [None]:
# Build the binary gold standard from PharmacotherapyDB:
#   * keep only disease-modifying ("DM") and non-indication ("NOT") pairs;
#     rows with any other category are dropped;
#   * label "DM" pairs as 1 (positive class) and "NOT" pairs as 0 (negative).
# The label is computed with a vectorized comparison and assigned to an
# explicitly named column, so the result always has the columns
# `trait`, `drug`, `true_class` (in that order), even if no row matches.
gold_standard = (
    pharmadb_gold_standard.loc[
        lambda df: df["category"].isin(("DM", "NOT")),
        ["doid_id", "drugbank_id", "category"],
    ]
    .assign(true_class=lambda df: df["category"].eq("DM").astype("int64"))
    # `category` was only needed to derive the label
    .drop(columns="category")
    .rename(columns={"doid_id": "trait", "drugbank_id": "drug"})
    # renumber rows 0..n-1 after filtering
    .reset_index(drop=True)
)

In [None]:
# Dimensions (rows, columns) of the filtered gold standard.
gold_standard.shape

In [None]:
# Sanity check on the number of DM + NOT pairs. The expected count (998) is
# presumably specific to the PharmacotherapyDB v1.0 file read by this
# notebook; update it if the input data changes.
assert gold_standard.shape[0] == 998

In [None]:
# Preview the first rows of the gold standard (trait, drug, true_class).
gold_standard.head()

In [None]:
# Number of distinct traits (diseases) in the gold standard.
gold_standard["trait"].unique().shape

In [None]:
# Number of distinct drugs in the gold standard.
gold_standard["drug"].unique().shape

In [None]:
# Class balance: number of pairs labelled 1 ("DM") versus 0 ("NOT").
gold_standard["true_class"].value_counts()

In [None]:
# Shape after dropping rows that contain any missing value; a row count lower
# than the full shape would reveal NaNs in the gold standard.
gold_standard.dropna().shape

In [None]:
# Set of trait (disease) identifiers present in the gold standard.
# NOTE(review): not referenced by the other cells shown here — presumably kept
# for interactive use or later cells; confirm before removing.
doids_in_gold_standard = set(gold_standard["trait"].values)

# Save

In [None]:
# Absolute path of the pickled gold standard inside the results directory.
output_file = (Path(OUTPUT_DIR) / "gold_standard.pkl").resolve()
display(output_file)

In [None]:
# Persist the gold standard DataFrame as a pickle at `output_file`.
gold_standard.to_pickle(output_file)