# Description

This notebook analyzes the drug-disease pairs discussed in the manuscript.

# Modules loading

In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

from entity import Gene
import conf

# Settings

In [3]:
# OUTPUT_DIR = conf.RESULTS["DRUG_DISEASE_ANALYSES"]
# display(OUTPUT_DIR)

# assert OUTPUT_DIR.exists()
# # OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# INPUT_DATA_DIR = Path(
#     conf.RESULTS["DRUG_DISEASE_ANALYSES"],
#     "data",
# )
# display(INPUT_DATA_DIR)

In [5]:
# OUTPUT_PREDICTIONS_DIR = Path(
#     conf.RESULTS["DRUG_DISEASE_ANALYSES"], "predictions", "dotprod_neg"
# )
# display(OUTPUT_PREDICTIONS_DIR)

# Data loading

## PharmacotherapyDB

### Gold standard set

In [6]:
# Load the gold standard set (trait/drug pairs with their true class) from the
# drug-disease analyses results directory.
_gold_standard_file = Path(conf.RESULTS["DRUG_DISEASE_ANALYSES"]) / "gold_standard.pkl"
gold_standard = pd.read_pickle(_gold_standard_file)

In [7]:
gold_standard.shape

(998, 3)

In [8]:
gold_standard.head()

Unnamed: 0,trait,drug,true_class
0,DOID:10652,DB00843,1
1,DOID:10652,DB00674,1
2,DOID:10652,DB01043,1
3,DOID:10652,DB00989,1
4,DOID:10652,DB00810,0


### Info

In [9]:
input_file = conf.PHARMACOTHERAPYDB["INDICATIONS_FILE"]
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/data/hetionet/pharmacotherapydb-v1.0/indications.tsv')

In [10]:
gold_standard_info = pd.read_csv(input_file, sep="\t")

In [11]:
gold_standard_info = gold_standard_info.rename(columns={"drug": "drug_name"})

In [12]:
gold_standard_info.shape

(1388, 7)

In [13]:
gold_standard_info.head()

Unnamed: 0,doid_id,drugbank_id,disease,drug_name,category,n_curators,n_resources
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM,2,1
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM,1,4
2,DOID:10652,DB01043,Alzheimer's disease,Memantine,DM,1,3
3,DOID:10652,DB00989,Alzheimer's disease,Rivastigmine,DM,1,3
4,DOID:10652,DB00245,Alzheimer's disease,Benzatropine,SYM,3,1


In [14]:
# Attach the PharmacotherapyDB indication details to every gold standard pair.
# Both tables are indexed by (trait, drug); `join` defaults to a left join, so
# the rows of `gold_standard` drive the result.
_pair_keys = ["trait", "drug"]
_info_by_pair = gold_standard_info.rename(
    columns={"doid_id": "trait", "drugbank_id": "drug"}
).set_index(_pair_keys)
_gold_by_pair = gold_standard.set_index(_pair_keys)
gold_standard_info = _gold_by_pair.join(_info_by_pair).reset_index()

In [15]:
gold_standard_info.shape

(998, 8)

In [16]:
gold_standard_info.head()

Unnamed: 0,trait,drug,true_class,disease,drug_name,category,n_curators,n_resources
0,DOID:10652,DB00843,1,Alzheimer's disease,Donepezil,DM,2,1
1,DOID:10652,DB00674,1,Alzheimer's disease,Galantamine,DM,1,4
2,DOID:10652,DB01043,1,Alzheimer's disease,Memantine,DM,1,3
3,DOID:10652,DB00989,1,Alzheimer's disease,Rivastigmine,DM,1,3
4,DOID:10652,DB00810,0,Alzheimer's disease,Biperiden,NOT,2,1


## LINCS data

In [17]:
# Absolute path of the pickled LINCS data inside the results directory.
_lincs_dir = Path(conf.RESULTS["DRUG_DISEASE_ANALYSES"]) / "lincs"
input_file = (_lincs_dir / "lincs-data.pkl").resolve()

display(input_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/lincs-data.pkl')

In [18]:
# Read the pickled table, transpose it, and relabel the columns through
# Gene.GENE_ID_TO_NAME_MAP.
lincs_data = (
    pd.read_pickle(input_file)
    .T
    .rename(columns=Gene.GENE_ID_TO_NAME_MAP)
)

In [19]:
display(lincs_data.shape)

(1170, 7120)

In [20]:
display(lincs_data.head())

Unnamed: 0_level_0,ADA,CDH2,AKT3,MED6,ACOT8,ABI1,GNPDA1,CDH3,TANK,TOPORS-AS1,...,RBX1,CDC42,THOC1,RCE1,HNRNPDL,DMTF1,PPP4R1,CDH1,PTBP3,CASP8AP2
perturbagen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DB00014,-1.001,1.146,-0.693,-0.037,0.162,-0.38,0.649,-1.952,0.294,0.274,...,-0.592,0.178,0.27,-0.013,0.351,-0.128,0.274,-0.734,-0.048,0.036
DB00091,-1.835,-1.863,1.694,0.383,-0.899,0.628,-4.878,2.527,1.709,-0.981,...,-0.668,-2.795,-0.333,-0.027,0.578,6.926,-1.875,4.382,0.266,-4.995
DB00121,1.391,0.011,-0.804,0.269,0.105,-0.588,-1.899,0.306,-1.178,0.12,...,-0.962,0.45,-0.999,1.358,-1.476,0.423,-1.356,-1.897,-0.299,-0.732
DB00130,1.132,-1.02,-0.164,-0.997,-0.09,0.195,-2.341,0.494,-0.813,-1.14,...,-0.553,-0.528,0.308,0.534,-0.32,3.001,-0.414,0.134,0.147,2.158
DB00131,0.257,1.143,1.145,0.185,-1.291,-0.457,0.038,1.442,-1.692,0.593,...,-0.556,-0.6,0.092,-0.564,0.174,-1.203,-1.08,-0.575,-0.792,-0.095


## Prediction results (aggregated)

In [21]:
# NOTE: despite its name, `output_file` is used as an *input* here; it is the
# path handed to pd.read_pickle when the aggregated predictions are loaded.
_predictions_dir = (
    Path(conf.RESULTS["DRUG_DISEASE_ANALYSES"]) / "lincs" / "predictions"
)
output_file = (_predictions_dir / "predictions_results_aggregated.pkl").resolve()
display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/predictions/predictions_results_aggregated.pkl')

In [22]:
predictions_avg = pd.read_pickle(output_file)

In [23]:
predictions_avg.shape

(1292, 5)

In [24]:
predictions_avg.head()

Unnamed: 0,trait,drug,method,score,true_class
0,DOID:0050741,DB00215,Gene-based,359.0,1.0
1,DOID:0050741,DB00215,Module-based,437.8,1.0
2,DOID:0050741,DB00704,Gene-based,395.8,1.0
3,DOID:0050741,DB00704,Module-based,562.8,1.0
4,DOID:0050741,DB00822,Gene-based,556.8,1.0


### Merge with gold standard set

In [25]:
# Inner-merge the annotated gold standard with the aggregated predictions on
# the (trait, drug) pair. Both frames carry a `true_class` column, so the
# result holds `true_class_x` (gold standard) and `true_class_y` (predictions).
pharmadb_predictions = gold_standard_info.merge(
    predictions_avg,
    how="inner",
    on=["trait", "drug"],
)

In [26]:
pharmadb_predictions

Unnamed: 0,trait,drug,true_class_x,disease,drug_name,category,n_curators,n_resources,method,score,true_class_y
0,DOID:10652,DB00843,1,Alzheimer's disease,Donepezil,DM,2,1,Gene-based,311.6,1.0
1,DOID:10652,DB00843,1,Alzheimer's disease,Donepezil,DM,2,1,Module-based,475.2,1.0
2,DOID:10652,DB00674,1,Alzheimer's disease,Galantamine,DM,1,4,Gene-based,509.4,1.0
3,DOID:10652,DB00674,1,Alzheimer's disease,Galantamine,DM,1,4,Module-based,508.0,1.0
4,DOID:10652,DB01043,1,Alzheimer's disease,Memantine,DM,1,3,Gene-based,452.4,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1287,DOID:363,DB00917,0,uterine cancer,Dinoprostone,NOT,3,1,Module-based,476.0,0.0
1288,DOID:363,DB00783,0,uterine cancer,Estradiol,NOT,2,1,Gene-based,556.0,0.0
1289,DOID:363,DB00783,0,uterine cancer,Estradiol,NOT,2,1,Module-based,603.0,0.0
1290,DOID:12306,DB00553,1,vitiligo,Methoxsalen,DM,3,1,Gene-based,550.8,1.0


In [27]:
# Keep the descriptive columns plus the prediction score. `true_class_x` is the
# gold standard label (left side of the merge) and is renamed back to
# `true_class`. "trait" and "drug" are the merge keys, so they never receive a
# `_x`/`_y` suffix; the former `"drug_x": "drug"` rename entry matched no
# column and has been dropped.
pharmadb_predictions = pharmadb_predictions[
    ["trait", "drug", "disease", "drug_name", "method", "score", "true_class_x"]
].rename(columns={"true_class_x": "true_class"})

In [28]:
display(pharmadb_predictions.shape)
# Sanity check: the merged table must have exactly as many rows as
# `predictions_avg`, i.e. the merge neither dropped nor duplicated predictions.
assert pharmadb_predictions.shape[0] == predictions_avg.shape[0]

(1292, 7)

In [29]:
pharmadb_predictions.head()

Unnamed: 0,trait,drug,disease,drug_name,method,score,true_class
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,Gene-based,311.6,1
1,DOID:10652,DB00843,Alzheimer's disease,Donepezil,Module-based,475.2,1
2,DOID:10652,DB00674,Alzheimer's disease,Galantamine,Gene-based,509.4,1
3,DOID:10652,DB00674,Alzheimer's disease,Galantamine,Module-based,508.0,1
4,DOID:10652,DB01043,Alzheimer's disease,Memantine,Gene-based,452.4,1


In [30]:
pharmadb_predictions["trait"].unique().shape

(53,)

In [31]:
pharmadb_predictions["drug"].unique().shape

(322,)

In [32]:
# Per-method summary statistics of the raw scores (count, mean, std, quantiles).
_scores_by_method = pharmadb_predictions.groupby("method")["score"]
data_stats = _scores_by_method.describe()
display(data_stats)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gene-based,646.0,517.657585,93.950154,87.3,458.95,535.4,591.75,646.0
Module-based,646.0,530.429721,73.472425,220.4,484.1,535.0,587.9,646.0


# Standardize scores for each method

In [33]:
# Standardize scores by method
def _standardize(x):
    return (x["score"] - data_stats.loc[x["method"], "mean"]) / data_stats.loc[
        x["method"], "std"
    ]

In [34]:
# Compute the standardized score row by row, then attach it as `score_std`.
_score_std = pharmadb_predictions.apply(_standardize, axis=1)
pharmadb_predictions = pharmadb_predictions.assign(score_std=_score_std)

In [35]:
pharmadb_predictions

Unnamed: 0,trait,drug,disease,drug_name,method,score,true_class,score_std
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,Gene-based,311.6,1,-2.193265
1,DOID:10652,DB00843,Alzheimer's disease,Donepezil,Module-based,475.2,1,-0.751707
2,DOID:10652,DB00674,Alzheimer's disease,Galantamine,Gene-based,509.4,1,-0.087893
3,DOID:10652,DB00674,Alzheimer's disease,Galantamine,Module-based,508.0,1,-0.305281
4,DOID:10652,DB01043,Alzheimer's disease,Memantine,Gene-based,452.4,1,-0.694598
...,...,...,...,...,...,...,...,...
1287,DOID:363,DB00917,uterine cancer,Dinoprostone,Module-based,476.0,0,-0.740818
1288,DOID:363,DB00783,uterine cancer,Estradiol,Gene-based,556.0,0,0.408114
1289,DOID:363,DB00783,uterine cancer,Estradiol,Module-based,603.0,0,0.987721
1290,DOID:12306,DB00553,vitiligo,Methoxsalen,Gene-based,550.8,1,0.352766


### Testing

In [36]:
_tmp = pharmadb_predictions.groupby("method")[["score", "score_std"]].describe()
display(_tmp)

Unnamed: 0_level_0,score,score,score,score,score,score,score,score,score_std,score_std,score_std,score_std,score_std,score_std,score_std,score_std
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Gene-based,646.0,517.657585,93.950154,87.3,458.95,535.4,591.75,646.0,646.0,-5.609548e-16,1.0,-4.580701,-0.62488,0.188849,0.788635,1.366069
Module-based,646.0,530.429721,73.472425,220.4,484.1,535.0,587.9,646.0,646.0,2.034836e-16,1.0,-4.219675,-0.630573,0.062204,0.782202,1.572975


In [37]:
# Raw and standardized scores of the Gene-based method only.
_is_gene_based = pharmadb_predictions["method"] == "Gene-based"
_tmp0 = pharmadb_predictions.loc[_is_gene_based, ["score", "score_std"]]

In [38]:
# Standardization is a linear rescaling, so score and score_std must be
# (almost) perfectly correlated. The builtin `all()` applied to a DataFrame
# iterates over its column labels (always-truthy strings), so the previous
# check could never fail; reduce over the boolean values instead.
assert (_tmp0.corr() > 0.99999).values.all()

In [39]:
# Raw and standardized scores of the Module-based method only.
_is_module_based = pharmadb_predictions["method"] == "Module-based"
_tmp0 = pharmadb_predictions.loc[_is_module_based, ["score", "score_std"]]

In [40]:
# Standardization is a linear rescaling, so score and score_std must be
# (almost) perfectly correlated. The builtin `all()` applied to a DataFrame
# iterates over its column labels (always-truthy strings), so the previous
# check could never fail; reduce over the boolean values instead.
assert (_tmp0.corr() > 0.99999).values.all()

# List diseases

In [41]:
pharmadb_predictions["disease"].unique()

array(["Alzheimer's disease", "Crohn's disease", "Parkinson's disease",
       'alcohol dependence', 'allergic rhinitis', 'anemia',
       'ankylosing spondylitis', 'asthma', 'atherosclerosis',
       'bone cancer', 'brain cancer', 'breast cancer', 'cervical cancer',
       'chronic kidney failure', 'chronic obstructive pulmonary disease',
       'coronary artery disease', 'epilepsy syndrome',
       'esophageal cancer', 'gestational diabetes', 'glaucoma', 'gout',
       'hematologic cancer', 'hypertension', 'hypothyroidism',
       'kidney cancer', 'liver cancer', 'lung cancer', 'malaria',
       'melanoma', 'migraine', 'multiple sclerosis', 'nephrolithiasis',
       'obesity', 'osteoarthritis', 'osteoporosis', 'pancreatic cancer',
       'pancreatitis', 'periodontitis', 'prostate cancer', 'psoriasis',
       'psoriatic arthritis', 'rheumatoid arthritis', 'skin cancer',
       'stomach cancer', 'systemic lupus erythematosus',
       'testicular cancer', 'thyroid cancer', 'type 1 diabe

# Look for differences between the scores of both methods

In [42]:
def _compare(x):
    assert x.shape[0] == 2
    x_sign = np.sign(x["score_std"].values)
    x0 = x.iloc[0]["score_std"]
    x1 = x.iloc[1]["score_std"]

    return pd.Series(
        {"different_sign": x_sign[0] != x_sign[1], "score_difference": np.abs(x0 - x1)}
    )

In [43]:
# Compute the per-(trait, drug) comparison of the two methods and attach it to
# every row of that pair; the table ends up indexed by (trait, drug).
_pair_index = ["trait", "drug"]
_pair_comparison = pharmadb_predictions.groupby(_pair_index).apply(_compare)
pharmadb_predictions = pharmadb_predictions.set_index(_pair_index).join(
    _pair_comparison
)

In [44]:
pharmadb_predictions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,disease,drug_name,method,score,true_class,score_std,different_sign,score_difference
trait,drug,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DOID:0050741,DB00215,alcohol dependence,Citalopram,Gene-based,359.0,1,-1.688742,False,0.428001
DOID:0050741,DB00215,alcohol dependence,Citalopram,Module-based,437.8,1,-1.260741,False,0.428001
DOID:0050741,DB00704,alcohol dependence,Naltrexone,Gene-based,395.8,1,-1.297045,True,1.737622
DOID:0050741,DB00704,alcohol dependence,Naltrexone,Module-based,562.8,1,0.440577,True,1.737622
DOID:0050741,DB00822,alcohol dependence,Disulfiram,Gene-based,556.8,1,0.41663,False,0.508483


## any disease

In [45]:
# All pairs whose standardized scores have opposite signs across methods,
# with the largest disagreements first.
_sign_flips = pharmadb_predictions[pharmadb_predictions["different_sign"]]
_tmp = _sign_flips.sort_values(
    ["score_difference", "drug_name", "method"], ascending=[False, False, False]
)

# Lift pandas' display limits so the full table is shown.
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "max_colwidth", None
):
    display(_tmp.shape)
    display(_tmp)

Unnamed: 0_level_0,Unnamed: 1_level_0,disease,drug_name,method,score,true_class,score_std,different_sign,score_difference
trait,drug,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DOID:2998,DB00445,testicular cancer,Epirubicin,Module-based,618.9,1,1.204129,True,5.78483
DOID:2998,DB00445,testicular cancer,Epirubicin,Gene-based,87.3,1,-4.580701,True,5.78483
DOID:2998,DB00997,testicular cancer,Doxorubicin,Module-based,618.9,1,1.204129,True,5.78483
DOID:2998,DB00997,testicular cancer,Doxorubicin,Gene-based,87.3,1,-4.580701,True,5.78483
DOID:2998,DB00970,testicular cancer,Dactinomycin,Module-based,532.2,1,0.024094,True,4.188618
DOID:2998,DB00970,testicular cancer,Dactinomycin,Gene-based,126.4,1,-4.164523,True,4.188618
DOID:10763,DB01136,hypertension,Carvedilol,Module-based,289.2,1,-3.283269,True,3.64455
DOID:10763,DB01136,hypertension,Carvedilol,Gene-based,551.6,1,0.361281,True,3.64455
DOID:4481,DB00959,allergic rhinitis,Methylprednisolone,Module-based,538.4,1,0.10848,True,3.395942
DOID:4481,DB00959,allergic rhinitis,Methylprednisolone,Gene-based,208.8,1,-3.287462,True,3.395942


In [46]:
def find_differences(trait_name):
    """Display the rows of one disease where both methods disagree in sign.

    Filters the module-level `pharmadb_predictions` table to rows whose
    "disease" equals `trait_name` and whose "different_sign" flag is set,
    sorts them by score difference, drug name and method (all descending),
    and displays the result without pandas' row/column truncation.

    Args:
        trait_name: value to match against the "disease" column.
    """
    matches_trait = pharmadb_predictions["disease"] == trait_name
    sign_flipped = pharmadb_predictions["different_sign"]

    selected = pharmadb_predictions[matches_trait & sign_flipped].sort_values(
        ["score_difference", "drug_name", "method"], ascending=[False, False, False]
    )

    with pd.option_context(
        "display.max_rows", None, "display.max_columns", None, "max_colwidth", None
    ):
        display(selected)

## coronary artery disease

In [47]:
# Niacin rows for coronary artery disease only (the original inline note on
# the disease filter read "avoid cancer").
_is_niacin = pharmadb_predictions["drug_name"] == "Niacin"
_is_cad = pharmadb_predictions["disease"] == "coronary artery disease"
_tmp = pharmadb_predictions[_is_niacin & _is_cad].sort_values(
    ["score_difference", "drug_name", "method"], ascending=[False, False, False]
)

with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "max_colwidth", None
):
    display(_tmp.head(50))

Unnamed: 0_level_0,Unnamed: 1_level_0,disease,drug_name,method,score,true_class,score_std,different_sign,score_difference
trait,drug,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DOID:3393,DB00627,coronary artery disease,Niacin,Module-based,601.0,1,0.9605,False,0.452333
DOID:3393,DB00627,coronary artery disease,Niacin,Gene-based,565.4,1,0.508167,False,0.452333


In [48]:
find_differences("coronary artery disease")

Unnamed: 0_level_0,Unnamed: 1_level_0,disease,drug_name,method,score,true_class,score_std,different_sign,score_difference
trait,drug,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DOID:3393,DB01197,coronary artery disease,Captopril,Module-based,580.2,1,0.677401,True,1.418832
DOID:3393,DB01197,coronary artery disease,Captopril,Gene-based,448.0,1,-0.741431,True,1.418832
DOID:3393,DB00451,coronary artery disease,Levothyroxine,Module-based,483.5,0,-0.638739,True,0.996827
DOID:3393,DB00451,coronary artery disease,Levothyroxine,Gene-based,551.3,0,0.358088,True,0.996827
DOID:3393,DB00509,coronary artery disease,Dextrothyroxine,Module-based,483.5,0,-0.638739,True,0.996827
DOID:3393,DB00509,coronary artery disease,Dextrothyroxine,Gene-based,551.3,0,0.358088,True,0.996827
DOID:3393,DB00700,coronary artery disease,Eplerenone,Module-based,538.6,1,0.111202,True,0.967588
DOID:3393,DB00700,coronary artery disease,Eplerenone,Gene-based,437.2,1,-0.856386,True,0.967588
DOID:3393,DB00790,coronary artery disease,Perindopril,Module-based,555.8,1,0.345303,True,0.554538
DOID:3393,DB00790,coronary artery disease,Perindopril,Gene-based,498.0,1,-0.209234,True,0.554538


## atherosclerosis

In [49]:
# Niacin rows for atherosclerosis only (the original inline note on the
# disease filter read "avoid cancer").
_is_niacin = pharmadb_predictions["drug_name"] == "Niacin"
_is_atherosclerosis = pharmadb_predictions["disease"] == "atherosclerosis"
_tmp = pharmadb_predictions[_is_niacin & _is_atherosclerosis].sort_values(
    ["score_difference", "drug_name", "method"], ascending=[False, False, False]
)

with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "max_colwidth", None
):
    display(_tmp.head(50))

Unnamed: 0_level_0,Unnamed: 1_level_0,disease,drug_name,method,score,true_class,score_std,different_sign,score_difference
trait,drug,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DOID:1936,DB00627,atherosclerosis,Niacin,Module-based,568.6,1,0.519518,True,0.530775
DOID:1936,DB00627,atherosclerosis,Niacin,Gene-based,516.6,1,-0.011257,True,0.530775


In [50]:
find_differences("atherosclerosis")

Unnamed: 0_level_0,Unnamed: 1_level_0,disease,drug_name,method,score,true_class,score_std,different_sign,score_difference
trait,drug,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DOID:1936,DB00227,atherosclerosis,Lovastatin,Module-based,439.0,1,-1.244409,True,1.629106
DOID:1936,DB00227,atherosclerosis,Lovastatin,Gene-based,553.8,1,0.384698,True,1.629106
DOID:1936,DB00175,atherosclerosis,Pravastatin,Module-based,470.0,1,-0.822482,True,1.000687
DOID:1936,DB00175,atherosclerosis,Pravastatin,Gene-based,534.4,1,0.178205,True,1.000687
DOID:1936,DB00758,atherosclerosis,Clopidogrel,Module-based,503.2,0,-0.370611,True,0.921355
DOID:1936,DB00758,atherosclerosis,Clopidogrel,Gene-based,569.4,0,0.550743,True,0.921355
DOID:1936,DB00627,atherosclerosis,Niacin,Module-based,568.6,1,0.519518,True,0.530775
DOID:1936,DB00627,atherosclerosis,Niacin,Gene-based,516.6,1,-0.011257,True,0.530775
