In [1]:
import os
import json
import subprocess

import titlecase
import pandas

In [2]:
# Load both guide templates up front. create_guide later selects one of them
# and fills its placeholders with str.format.
with open('template-full.txt') as full_file, open('template-mini.txt') as mini_file:
    template_full = full_file.read()
    template_mini = mini_file.read()

In [3]:
def format_df(df):
    """
    Return a display-ready copy of `df`.

    The columns `percent_of_prediction` and `percent_of_DWPC`, when present,
    are converted to percentage strings with two decimals (e.g. 0.1234
    becomes '12.34%'). Every column label then has its underscores replaced
    by spaces and is passed through `titlecase.titlecase`.

    The input frame is left untouched; a formatted copy is returned, so the
    function is safe to call on a frame that is still needed afterwards.
    """
    formatted = df.copy()
    for key in 'percent_of_prediction', 'percent_of_DWPC':
        if key not in formatted.columns:
            continue
        formatted[key] = formatted[key].map('{:.2%}'.format)
    formatted.columns = formatted.columns.map(
        lambda x: titlecase.titlecase(x.replace('_', ' '))
    )
    return formatted

In [4]:
# Map each (compound_id, disease_id) pair to the list of clinical trial IDs
# (nct_id) recorded for it. The URL is pinned to a specific commit.
url = 'https://github.com/dhimmel/clintrials/raw/7c65dec7b69322ca2f8ba2b170c1b3dbd92ebff8/data/DrugBank-DO-slim.tsv'
clintrial_df = pandas.read_table(url)
cd_to_trials = (
    clintrial_df
    .groupby(['compound_id', 'disease_id'])['nct_id']
    .apply(list)
    .to_dict()
)

In [13]:
# Every row of the predictions table yields one (compound_id, disease_id) pair
# for which a guide is generated below.
predictions_df = pandas.read_table('../learn/prediction/predictions/probabilities.tsv')
pairs = [
    (compound, disease)
    for compound, disease in zip(predictions_df['compound_id'], predictions_df['disease_id'])
]
len(pairs)

1388

In [16]:
def create_guide(compound_id, disease_id):
    """
    Render the HTML guide for one compound-disease pair.

    Reads `info.json` from the pair's directory under
    `../het.io-rep-data/browser/` and, when `highlights.cyp` exists there,
    also `highlights.cyp` and `paths.tsv`. The collected values fill
    `template_full` (when path data is available) or `template_mini`; the
    result is written to `./guides/temp.adoc` and converted by
    `neo4j-guides/run.sh` into
    `guides/<compound_id>/<disease_id with ':' replaced by '_'>.html`.

    Parameters
    ----------
    compound_id : str
        Compound identifier; used as a directory name and as part of the
        clinical-trials lookup key.
    disease_id : str
        Disease identifier; ':' is replaced by '_' wherever it is used in a
        file-system path.

    Raises
    ------
    OSError
        If `info.json` (or `paths.tsv` when `highlights.cyp` exists) cannot
        be read.
    KeyError
        If `info.json` lacks `prediction`, a percentile,
        `metapath_contribution`, or a name used by the sentences/template,
        or if `category` is not one of 'DM', 'SYM', 'NOT'.
    subprocess.CalledProcessError
        If `run.sh` exits with a non-zero status.
    """
    params = {}
    disease_slug = disease_id.replace(':', '_')
    directory = '../het.io-rep-data/browser/{}/{}/'.format(compound_id, disease_slug)

    with open(os.path.join(directory, 'info.json')) as read_file:
        info = json.load(read_file)
    for key in 'compound_id', 'disease_id', 'compound_name', 'disease_name', 'prediction', 'compound_percentile', 'disease_percentile', 'category':
        if key in info:
            params[key] = info[key]
    # NOTE(review): 0.00361 is presumably the baseline (prior) probability the
    # prediction is compared against -- confirm and consider naming it.
    params['fold_change'] = params['prediction'] / 0.00361 - 1
    # Percentiles are scaled by 100 for display (presumably fraction -> percent).
    for key in 'compound_percentile', 'disease_percentile':
        params[key] *= 100

    # Per-metapath contributions, largest first, as CSV text for the template.
    metapath_df = (
        pandas.Series(info['metapath_contribution'])
        .reset_index()
        .rename(columns = {'index': 'metapath', 0: 'percent_of_prediction'})
        .sort_values('percent_of_prediction', ascending=False)
        .pipe(format_df)
    )
    params['metapath_csv'] = metapath_df.to_csv(index=False).rstrip('\n')

    # Path-level details are optional: they are only loaded when the
    # highlights query file exists for this pair.
    path = os.path.join(directory, 'highlights.cyp')
    if os.path.exists(path):
        with open(path) as read_file:
            params['path_query'] = read_file.read()

        path_df = (
            pandas.read_table(os.path.join(directory, 'paths.tsv'))
            .pipe(format_df)
            .head(25)
        )
        params['path_csv'] = path_df.to_csv(index=False).rstrip('\n')

    # Clinical Trials param
    trials = cd_to_trials.get((compound_id, disease_id), [])
    if trials:
        links = ', '.join('link:https://clinicaltrials.gov/ct2/show/{0}[[small]#{0}#]'.format(nct_id) for nct_id in trials)
        params['trial_sentence'] = 'ClinicalTrials.gov contains {n_trials} clinical trials investigating whether {compound_name} treats {disease_name} ({links}).'.format(links=links, n_trials=len(trials), **params)
    else:
        params['trial_sentence'] = 'No matching clinical trials were found in ClinicalTrials.gov.'

    # PharmacotherapyDB
    category = params.get('category')
    if category:
        indication_type = {'DM': 'disease-modifying ', 'SYM': 'symptomatic ', 'NOT': 'non-'}[category]
        params['phmcdb_sentence'] = 'In link:https://doi.org/10.15363/thinklab.d182[PharmacotherapyDB v1.0], {compound_name} is classified as a {indication_type}indication for {disease_name}.'.format(indication_type=indication_type, **params)
    else:
        params['phmcdb_sentence'] = 'link:https://doi.org/10.15363/thinklab.d182[PharmacotherapyDB v1.0] does not contain an indication between {compound_name} and {disease_name}.'.format(**params)

    # The full template needs the path table; fall back to the mini template
    # when no path data was loaded.
    template = template_full if 'path_csv' in params else template_mini
    adoc = template.format(**params)

    # Create the output directory relative to *this* process's working
    # directory. The '../guides/...' form below is only valid from inside
    # neo4j-guides/, so it must not be used for makedirs here.
    os.makedirs(os.path.join('guides', compound_id), exist_ok=True)
    with open('./guides/temp.adoc', 'wt') as write_file:
        write_file.write(adoc)

    # run.sh executes with cwd='neo4j-guides/', hence the '../guides' prefix.
    # Passing an argument list (no shell) keeps IDs containing spaces or shell
    # metacharacters from being re-interpreted.
    path = os.path.join('..', 'guides', compound_id, '{}.html'.format(disease_slug))
    subprocess.check_call(
        ['bash', './run.sh', '../guides/temp.adoc', path],
        cwd='neo4j-guides/',
    )
    

In [None]:
%%time
# Build one guide per predicted compound-disease pair; each call shells out to
# run.sh, so this cell dominates the notebook's run time.
for cid, did in pairs:
    create_guide(compound_id=cid, disease_id=did)

In [8]:
#cp guides/example.html /home/dhimmel/neo4j/hetionet-data/guides/example.html

In [18]:
# Remove the scratch AsciiDoc file left behind by create_guide so it is not
# included in the tarball built below.
!rm ./guides/temp.adoc

In [19]:
%%time
# Bundle the generated guides directory into a bzip2-compressed tarball
# (-c create, -j bzip2, -f output file).
!tar -cjf guides.tar.bz2 guides

CPU times: user 6.33 s, sys: 836 ms, total: 7.16 s
Wall time: 7min 14s
