# Evaluating the manual curation of the indication catalog

In [1]:
import os

import pandas

In [2]:
# Read each curator's spreadsheet into `raw`, keyed by the curator's
# lowercase initials.
raw = dict()
for initials in ['csh']:
    # Each file is expected at <initials>/curation-<INITIALS>.csv
    filename = 'curation-{}.csv'.format(initials.upper())
    path = os.path.join(initials, filename)
    # Discard rows in which every column is missing
    df = pandas.read_csv(path).dropna(how='all')
    raw[initials] = df

In [3]:
# Work with the curation table stored under the 'csh' key
csh_df = raw['csh']
# Preview the last two rows
csh_df.tail(n=2)

Unnamed: 0,drug,disease,classification,notes,drug_url,disease_url
1386,Zopiclone,multiple sclerosis,SYM,insomnia,http://www.drugbank.ca/drugs/DB01198,http://www.disease-ontology.org/term/DOID%3A2377
1387,Zopiclone,schizophrenia,SYM,,http://www.drugbank.ca/drugs/DB01198,http://www.disease-ontology.org/term/DOID%3A5419


In [4]:
# Number of rows assigned to each classification value
csh_df['classification'].value_counts()

DM     593
SYM    517
NOT    278
Name: classification, dtype: int64

In [5]:
# Classification counts as a fraction of all rows in csh_df
# (values are on a 0-1 scale; multiply by 100 for percentages)
csh_df['classification'].value_counts().div(len(csh_df))

DM     0.427233
SYM    0.372478
NOT    0.200288
Name: classification, dtype: float64

In [6]:
# Restrict to the rows whose classification is "DM"
csh_DM_df = csh_df.loc[csh_df['classification'] == "DM"]

In [7]:
# Count the DM rows per disease and show the ten diseases
# with the most indications
disease_counts = csh_DM_df['disease'].value_counts()
disease_counts.head(n=10)

hypertension                          61
hematologic cancer                    47
breast cancer                         33
coronary artery disease               22
glaucoma                              21
asthma                                21
prostate cancer                       19
psoriasis                             18
acquired immunodeficiency syndrome    18
lung cancer                           16
Name: disease, dtype: int64

In [8]:
# Fraction (0-1 scale) of diseases that have exactly one indication
disease_counts.eq(1).mean()

0.18055555555555555

In [9]:
# Count the DM rows per drug and show the ten compounds
# with the most indications
compound_counts = csh_DM_df['drug'].value_counts()
compound_counts.head(n=10)

Methotrexate     19
Doxorubicin      18
Epirubicin       15
Etoposide        11
Triamcinolone    10
Dactinomycin     10
Carboplatin       9
Docetaxel         9
Cisplatin         8
Fluorouracil      7
Name: drug, dtype: int64

In [10]:
# Fraction (0-1 scale) of compounds that have exactly one indication
compound_counts.eq(1).mean()

0.68138801261829651