# Create Project Rephetio Prediction Browser Tables

Tables are written as JSON files formatted for import into DataTables.

In [1]:
import os

import pandas
import sklearn.metrics
import json
import requests

In [2]:
# All three tables come from the same revision-pinned base URL.
base_url = 'https://github.com/dhimmel/learn/raw/e5e7459532944cafcf542ef6ddfe9184548224d9/'
compound_df, disease_df, prob_df = (
    pandas.read_table(base_url + relative_path)
    for relative_path in (
        'summary/compounds.tsv',
        'summary/diseases.tsv',
        'prediction/predictions/probabilities.tsv',
    )
)

In [3]:
url = 'https://github.com/dhimmel/disease-ontology/raw/75050ea2d4f60e745d3f3578ae03560a2cc0e444/data/term-names.tsv'
doid_synonym_df = pandas.read_table(url)
# Collapse the names listed for each `doid` into one sorted, de-duplicated,
# ' | '-delimited string.
disease_to_names = {
    doid: ' | '.join(sorted(set(names)))
    for doid, names in doid_synonym_df.groupby('doid')['name']
}
# Diseases without an entry in the mapping get a missing value.
disease_df['synonyms'] = disease_df['disease_id'].map(disease_to_names)
disease_df.head(2)

Unnamed: 0,disease_id,disease_name,treats,palliates,total_edges,synonyms
0,DOID:10652,Alzheimer's disease,4,5,772,"AD | Alzheimer disease | Alzheimer disease, fa..."
1,DOID:9206,Barrett's esophagus,2,0,541,(ulcerative esophagitis) or (Barrett's esophag...


In [4]:
url = 'https://github.com/dhimmel/drugbank/raw/421a06305e9452deb72a3327a98bb4d07ac94a6c/data/aliases.json'
response = requests.get(url)
# Fail loudly on an HTTP error rather than trying to parse an error page as JSON.
response.raise_for_status()
# Each value of the downloaded mapping is joined into a single
# ' | '-delimited string of aliases, keyed by compound id.
compound_to_aliases = {
    compound_id: ' | '.join(aliases)
    for compound_id, aliases in response.json().items()
}
# Compounds without an entry in the mapping get a missing value.
compound_df['synonyms'] = compound_df.compound_id.map(compound_to_aliases)
compound_df.head(2)

Unnamed: 0,compound_id,compound_name,treats,palliates,total_edges,synonyms
0,DB01048,Abacavir,1,0,144,Abacavir | Abacavir Sulfate | Ziagen
1,DB05812,Abiraterone,1,0,81,Abiraterone | Zytiga


In [5]:
url = 'https://github.com/dhimmel/clintrials/raw/4d63098c79042b7048f546720e727bc94e232182/data/DrugBank-DO-slim-counts.tsv'
# Keep only the identifier columns and the trial count from the download.
clintrial_df = pandas.read_table(url)[['compound_id', 'disease_id', 'n_trials']]
# Replace the existing `n_trials` column with the freshly downloaded counts.
# The left merge joins on the columns the two frames share.
prob_df = (
    prob_df
    .drop('n_trials', axis='columns')
    .merge(clintrial_df, how='left')
)
# Rows with no match in the download have a missing count; treat them as zero trials.
prob_df['n_trials'] = prob_df['n_trials'].fillna(0).astype(int)

In [6]:
# Preview the first two rows of the predictions table after refreshing `n_trials`.
prob_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,prediction,training_prediction,compound_percentile,disease_percentile,status_trials,status_drugcentral,n_trials
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0,0.004753,0.00093,0.001129,0.125,0.154746,0.0,0.0,0
1,DB05812,Abiraterone,DOID:10652,Alzheimer's disease,,0,0.004753,0.003795,0.004604,0.757353,0.842653,0.0,0.0,0


In [7]:
def get_auroc(df):
    """
    Compute the AUROC of `prediction` against `status` for a group of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows providing a `status` column (true labels) and a `prediction`
        column (scores).

    Returns
    -------
    pandas.Series
        A one-element float series labelled 'auroc'. The value is NaN when
        `roc_auc_score` raises ValueError for the group.
    """
    try:
        auroc = sklearn.metrics.roc_auc_score(y_true=df.status, y_score=df.prediction)
    except ValueError:
        # NOTE(review): presumably raised when a group contains a single class
        # in `status`; confirm against the scikit-learn documentation.
        # NaN (rather than None) keeps the resulting column float-typed.
        auroc = float('nan')
    # An explicit dtype avoids the version-dependent default of an empty
    # `pandas.Series()` and the deprecation warning it triggers.
    return pandas.Series({'auroc': auroc}, dtype=float)
    
# One AUROC per compound, computed across that compound's predictions.
compound_auroc_df = prob_df.groupby('compound_id').apply(get_auroc).reset_index()
compound_df = compound_df.merge(compound_auroc_df)

# One AUROC per disease, computed across that disease's predictions.
disease_auroc_df = prob_df.groupby('disease_id').apply(get_auroc).reset_index()
disease_df = disease_df.merge(disease_auroc_df)

In [8]:
def df_to_json(df, path, double_precision=6):
    """
    Write a pandas dataframe to a JSON text file formatted as datatables input.

    The output is a JSON object with a `columns` list and a `data` list of
    rows; the dataframe index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export.
    path : str
        Destination file. Missing parent directories are created.
    double_precision : int
        Number of decimal places kept for floating point values.
    """
    # Serialize through pandas so missing values and numpy types become
    # plain JSON, then reload the result to drop the index.
    obj = json.loads(df.to_json(orient='split', double_precision=double_precision))
    del obj['index']
    # A bare filename has an empty dirname, which makedirs would reject.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Explicit encoding keeps the output independent of the platform locale.
    with open(path, 'wt', encoding='utf-8') as fp:
        json.dump(obj, fp, sort_keys=True)

In [9]:
# Export the compound and disease summary tables side by side.
for filename, table in ('compounds.json', compound_df), ('diseases.json', disease_df):
    path = os.path.join('browser-tables', filename)
    df_to_json(table, path)

## Create probability datasets

In [10]:
# Columns kept for the per-compound and per-disease tables, in output order.
browser_columns = [
    'compound_name',
    'disease_name',
    'prediction',
    'compound_percentile',
    'disease_percentile',
    'category',
    'n_trials',
    'compound_id',
    'disease_id',
]
prob_df = prob_df[browser_columns]

In [11]:
# One file per compound: its predictions across diseases, with disease synonyms.
for compound_id, df in prob_df.groupby('compound_id'):
    path = os.path.join('browser-tables', 'compound', '{}.json'.format(compound_id))
    # The compound columns are constant within the group, so they are dropped.
    df = df.drop(['compound_id', 'compound_name'], axis='columns')
    df = df.assign(synonyms=df['disease_id'].map(disease_to_names))
    df_to_json(df, path)

# One file per disease: its predictions across compounds, with compound aliases.
for disease_id, df in prob_df.groupby('disease_id'):
    # Replace the colon in the id before using it as a file name.
    disease_id = disease_id.replace(':', '_')
    path = os.path.join('browser-tables', 'disease', '{}.json'.format(disease_id))
    # The disease columns are constant within the group, so they are dropped.
    df = df.drop(['disease_id', 'disease_name'], axis='columns')
    df = df.assign(synonyms=df['compound_id'].map(compound_to_aliases))
    df_to_json(df, path)

In [12]:
# `df` is the table left over from the final iteration of the disease loop above;
# preview its first two rows as an example of a per-disease table.
df.head(2)

Unnamed: 0,compound_name,prediction,compound_percentile,disease_percentile,category,n_trials,compound_id,synonyms
133806,Abacavir,0.000803,0.036765,0.052016,,0,DB01048,Abacavir | Abacavir Sulfate | Ziagen
133807,Abiraterone,0.007132,0.948529,0.870611,,0,DB05812,Abiraterone | Zytiga


## Create info tables

In [13]:
# Map each compound/disease id to a small summary record.
info = {}

for kind, df in ('compound', compound_df), ('disease', disease_df):
    for row in df.itertuples():
        # Colons are replaced with underscores, the same substitution used
        # for the per-disease file names.
        row_id = getattr(row, kind + '_id').replace(':', '_')
        item = {
            'name': getattr(row, kind + '_name'),
            'type': kind,
            # Cast to built-in int so the values are JSON serializable.
            'treats': int(row.treats),
            'palliates': int(row.palliates),
            'edges': int(row.total_edges),
        }
        # Leave out the 'auroc' key entirely when no AUROC is available.
        if pandas.notnull(row.auroc):
            item['auroc'] = round(row.auroc, 4)
        info[row_id] = item

In [14]:
# Write the info records with sorted keys and a one-space indent.
with open('./browser-tables/info.json', 'w') as info_file:
    json.dump(
        info,
        info_file,
        indent=1,
        sort_keys=True,
    )