# Create validation indication sets

In [1]:
import pandas

## Add DrugCentral novel indications

In [2]:
# Load the DrugCentral indication table from a URL that names a specific commit.
url = 'https://github.com/dhimmel/drugcentral/blob/e80a0c966a53ce48650d98069b126801c2793517/rephetio/indications.tsv?raw=true'
drugcentral_df = (
    pandas.read_table(url)
    # Keep only the rows that have no `category` value.
    .loc[lambda df: df.category.isnull()]
    # Switch to the compound_id / disease_id names used by the other tables here.
    .rename(columns={'doid_id': 'disease_id', 'drugbank_id': 'compound_id'})
    [['compound_id', 'disease_id']]
    # One row per pair: a repeated pair would multiply the matching rows of any
    # table this frame is later left-merged into.
    .drop_duplicates()
    # Constant flag marking that the pair is present in this table.
    .assign(status_drugcentral=1)
)
drugcentral_df.head(2)

Unnamed: 0,compound_id,disease_id,status_drugcentral
5,DB00389,DOID:12361,1
19,DB00988,DOID:14330,1


## Add ClinicalTrials.gov novel indications

In [3]:
# Load the DrugBank / Disease Ontology table from a URL that names a specific commit.
url = 'https://github.com/dhimmel/clintrials/blob/1ee4b912b51bf90a7455ff3b01b965975b06d421/data/DrugBank-DO.tsv?raw=true'
trial_df = (
    pandas.read_table(url)
    # Count the rows for each (drugbank_id, doid_code) pair. `size()` gives the
    # same counts as `apply(len)` without calling a Python function per group.
    .groupby(['drugbank_id', 'doid_code'])
    .size()
    .reset_index(name='n_trials')
    # Rename by name rather than by position so a column-order change cannot
    # silently mislabel the data.
    .rename(columns={'drugbank_id': 'compound_id', 'doid_code': 'disease_id'})
    # Constant flag marking that the pair is present in this table.
    .assign(status_trials=1)
)
trial_df.head(2)

Unnamed: 0,compound_id,disease_id,n_trials,status_trials
0,DB00001,DOID:1324,1,1
1,DB00001,DOID:1588,2,1


## Apply to indications with an unknown category

In [4]:
# Load every compound-disease pair and keep only those with no `category` value.
pair_df = pandas.read_table('./../prediction/features/compound-disease-pairs.tsv.bz2')
pair_df = pair_df[pair_df.category.isnull()]
pair_df = pair_df.drop(columns=['category', 'status'])

# Name the join keys explicitly instead of relying on whichever columns the two
# frames happen to share; an extra common column would otherwise change the join.
join_keys = ['compound_id', 'disease_id']
pair_df = pair_df.merge(trial_df, how='left', on=join_keys)
pair_df = pair_df.merge(drugcentral_df, how='left', on=join_keys)

# Pairs with no match in a validation table come back from the left merges as
# NaN; store them as 0 and convert the columns back to integers.
for column in 'status_drugcentral', 'status_trials', 'n_trials':
    pair_df[column] = pair_df[column].fillna(0).astype(int)

pair_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,n_trials,status_trials,status_drugcentral
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,0,0,0
1,DB01048,Abacavir,DOID:9206,Barrett's esophagus,0,0,0


In [5]:
# Tally how many pairs carry each value of the status_trials flag.
pair_df['status_trials'].value_counts()

0    203045
1      4735
Name: status_trials, dtype: int64

In [6]:
# Tally how many pairs carry each value of the status_drugcentral flag.
pair_df['status_drugcentral'].value_counts()

0    207572
1       208
Name: status_drugcentral, dtype: int64

In [7]:
# Write the annotated pairs as a tab-separated file, leaving out the row index.
pair_df.to_csv('validation-statuses.tsv', index=False, sep='\t')