In [49]:
import collections
import json

import matplotlib.pyplot
import numpy
import pandas
import seaborn
from matplotlib.backends.backend_pdf import PdfPages

In [119]:
# Load the four tab-separated inputs: transformed features, untransformed
# features, model coefficients and predicted probabilities.
feature_df = pandas.read_csv('features/transformed-features.tsv.bz2', sep='\t')
untran_df = pandas.read_csv('features/features.tsv.bz2', sep='\t')
coef_df = pandas.read_csv('model/coefficient.tsv', sep='\t')
predict_df = pandas.read_csv('predictions/probabilities.tsv', sep='\t')

In [17]:
# Terms left out of the per-feature breakdown.
drop = {'prior_logit', 'intercept'}

# Keep only the remaining features whose coefficient is non-zero,
# ordered from most negative to most positive coefficient.
is_kept = ~coef_df.feature.isin(drop) & (coef_df.coef != 0)
coef_df = coef_df[is_kept].sort_values('coef')
features = coef_df.feature.tolist()

# Subset of 'dwpc_' features that have a positive coefficient.
is_positive_dwpc = coef_df.feature.str.startswith('dwpc_') & (coef_df.coef > 0)
subcoef_df = coef_df[is_positive_dwpc]
subcoef_df.head(2)

Unnamed: 0,feature,coef,zcoef
41,dwpc_CbGuDtCpD,0.000345,0.000441
112,dwpc_CuGr>GcGdD,0.001188,0.001769


In [5]:
# Coefficients indexed by feature name so they align with the feature columns.
coef_series = pandas.Series(coef_df.coef.values, index=coef_df.feature)

# Each term of the linear predictor: transformed feature value times its coefficient.
weighted_df = feature_df[coef_df.feature].mul(coef_series, axis='columns')

# Re-attach the compound/disease identifiers, add the prediction columns,
# and list the highest predictions first.
id_df = feature_df[['compound_id', 'disease_id']]
contrib_df = (
    predict_df
    .merge(id_df.join(weighted_df))
    .sort_values('prediction', ascending=False)
)

In [6]:
# Preview the two highest-ranked rows of the contribution table.
contrib_df.iloc[:2]

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prediction,training_prediction,dwpc_CtDaGiGaD,dwpc_CtDaGaDrD,...,dwpc_CtDrDrDrD,degree_DrD,dwpc_CbGaDrDrD,dwpc_CbGuAlDrD,dwpc_CtDtCtD,dwpc_CbGpPWpGaD,dwpc_CtDrDrD,dwpc_CtDaGaD,degree_DaG,degree_CtD
91496,DB00808,Indapamide,DOID:10763,hypertension,DM,1,0.994276,0.999878,0.519654,0.27547,...,-0.050888,0.014677,0.138279,0.284136,-0.041448,0.594393,-0.119877,-0.259058,2.255991,1.862382
90882,DB00542,Benazepril,DOID:10763,hypertension,DM,1,0.993767,0.999867,0.519654,0.27547,...,-0.050888,0.014677,0.456586,0.136071,-0.041448,0.539915,-0.119877,-0.259058,2.255991,1.862382


In [108]:
# Draw one bar chart per observation (the first 10 rows of each `status` group)
# showing every feature's contribution, and collect the charts into one PDF.
plot_df = contrib_df.groupby('status').head(10)

# These do not depend on the row, so compute them once rather than per figure.
feature_names = list(coef_df.feature)
coef_signs = list(numpy.sign(coef_df.coef))

# The context manager closes the PDF even if one of the plots raises.
with PdfPages('./figure/linear-predictor-plots.pdf') as pdf:
    for _, row in plot_df.iterrows():
        fig, ax = matplotlib.pyplot.subplots(figsize=(5, 15))
        try:
            seaborn.barplot(
                x=list(row[feature_names]),
                y=feature_names,
                hue=coef_signs,  # color bars by the sign of the coefficient
                ax=ax,
            )
            ax.set_title('{}–{} · {:.2%} · {}'.format(
                row['compound_name'], row['disease_name'], row['prediction'], row['category']))
            pdf.savefig(fig, bbox_inches='tight')
        finally:
            # Close this specific figure so figures do not pile up in memory.
            matplotlib.pyplot.close(fig)

## Extra: metapath and path contributions to each prediction

In [181]:
# Estimate, for each observation, the fraction of its total positive contribution
# that comes from each DWPC feature with a positive coefficient.

# Negative contributions are floored at zero so they do not count toward the total.
pos_contrib_df = contrib_df[subcoef_df.feature].clip(lower=0)
observation_total = pos_contrib_df.sum(axis='columns')

# An observation without any positive contribution has a zero total; dividing by it
# would give 0/0 = NaN, which is not caught by a later `== 0` check. Such rows are
# reported as all-zero shares instead.
has_positive = observation_total > 0
pos_contrib_df = pos_contrib_df.div(observation_total.where(has_positive), axis='index')
pos_contrib_df.loc[~has_positive] = 0.0

# Re-attach identifiers and prediction columns; lowest predictions first.
pos_contrib_df = predict_df.merge(contrib_df[['compound_id', 'disease_id']].join(pos_contrib_df))
pos_contrib_df = pos_contrib_df.sort_values('prediction', ascending=True)
pos_contrib_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prediction,training_prediction,compound_rank,disease_rank,...,dwpc_CrCrCtD,dwpc_CtDrD,dwpc_CbGbCtD,dwpc_CtDrDrDrD,dwpc_CbGaDrDrD,dwpc_CbGuAlDrD,dwpc_CtDtCtD,dwpc_CbGpPWpGaD,dwpc_CtDrDrD,dwpc_CtDaGaD
160686,DB00424,Hyoscyamine,DOID:8893,psoriasis,,0,7.345773e-08,6.965431e-07,136,1538.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554422
139690,DB00740,Riluzole,DOID:11476,osteoporosis,,0,1.073588e-07,6.728824e-07,136,1538.0,...,0.0,0.0,0.16051,0.0,0.023517,0.0,0.0,0.0,0.0,0.247909


In [182]:
# Create a dictionary with (compound_id, disease_id, metapath) keys to look up
# untransformed DWPCs for the positive-coefficient DWPC features.
dwpc_melt_df = pandas.melt(
    untran_df,
    id_vars=['compound_id', 'disease_id'],
    value_vars=list(subcoef_df.feature),
    var_name='feature',
    value_name='dwpc',
)

# Split on the first underscore only, so the key is everything after the feature-type
# prefix. This is the same rule used when the metapath_contribution keys are built,
# which keeps the two sets of metapath keys identical.
untran_dwpc_map = {
    (compound_id, disease_id, feature.split('_', 1)[1]): dwpc
    for compound_id, disease_id, feature, dwpc in zip(
        dwpc_melt_df.compound_id,
        dwpc_melt_df.disease_id,
        dwpc_melt_df.feature,
        dwpc_melt_df.dwpc,
    )
}

In [183]:
# Build one ordered record per observation with prediction > 0.05, holding the
# descriptive columns plus a metapath -> share-of-positive-contribution mapping.

# Columns copied as-is from the row into the record, in this order.
observation_keys = [
    'compound_id', 'compound_name', 'disease_id', 'disease_name',
    'category', 'status', 'prediction', 'training_prediction',
]

obj = list()
for _, row in pos_contrib_df.query("prediction > 0.05").iterrows():
    observation = collections.OrderedDict((key, row[key]) for key in observation_keys)
    # Omit the category entirely when it is missing rather than storing a null.
    if pandas.isnull(observation['category']):
        del observation['category']
    contribs = collections.OrderedDict()
    for feature in subcoef_df.feature:
        percent_contrib = row[feature]
        # Shares are never negative (negatives are zeroed before normalising), so
        # `not > 0` skips exactly the zero shares and, unlike `== 0`, also NaN.
        if not percent_contrib > 0:
            continue
        # Drop the feature-type prefix; the remainder is the metapath abbreviation.
        metapath = feature.split('_', 1)[1]
        contribs[metapath] = percent_contrib
    observation['metapath_contribution'] = contribs
    obj.append(observation)
len(obj)

11734

In [184]:
# Read the metapath definitions; later cells use each entry's
# 'abbreviation' and 'dwpc_query' fields.
with open('features/metapaths.json') as metapath_file:
    metapaths = json.load(metapath_file)

In [185]:
# Clauses that replace the RETURN part of every DWPC query. For each path they
# compute its share of the DWPC (PDP_contribution = PDP / { dwpc }), scale that by
# { metapath_contribution }, and keep paths whose prediction_contribution is >= 0.005.
pdp_return_clause = '''\
    WITH
    extract(n in nodes(path)| n.name) AS nodes,
    sum(reduce(pdp = 1.0, d in degrees| pdp * d ^ -0.4)) / { dwpc } AS PDP_contribution
    WITH
    nodes,
    PDP_contribution,
    PDP_contribution * { metapath_contribution } AS prediction_contribution
    WHERE prediction_contribution >= 0.005
    RETURN nodes, prediction_contribution, PDP_contribution
    ORDER BY prediction_contribution
    '''

# Map each metapath abbreviation to its path-level query: everything in the
# DWPC query before the first 'RETURN', followed by the clauses above.
metapath_to_query = {
    metapath['abbreviation']: metapath['dwpc_query'].split('RETURN')[0] + pdp_return_clause
    for metapath in metapaths
}

In [186]:
import py2neo

# REST endpoint of the Neo4j instance that the path queries are run against.
neo4j_uri = 'http://localhost:7500/db/data/'
neo = py2neo.Graph(neo4j_uri)

In [187]:
# Keep only the observations whose disease is multiple sclerosis.
ms_obj = list()
for observation in obj:
    if observation['disease_name'] == 'multiple sclerosis':
        ms_obj.append(observation)

In [188]:
# For the first 15 multiple sclerosis observations, query the graph for the
# individual paths behind each contributing metapath and collect them in one table.
path_dfs = list()
for elem in ms_obj[:15]:
    c_id = elem['compound_id']
    d_id = elem['disease_id']
    dfs = list()
    for metapath, contribution in elem['metapath_contribution'].items():
        untran_dwpc = untran_dwpc_map[(c_id, d_id, metapath)]
        pdp_query = metapath_to_query[metapath]
        # NOTE(review): `n` is passed through as a query parameter; whether the retained
        # part of the DWPC query references it is not visible here — confirm.
        results = neo.cypher.execute(
            pdp_query,
            source=c_id,
            target=d_id,
            n=10,
            dwpc=untran_dwpc,
            metapath_contribution=contribution,
        )
        df = pandas.DataFrame(results.records, columns=results.columns)
        # Render each path's node names as a single em-dash-separated string.
        df['nodes'] = df['nodes'].map(lambda x: '—'.join(x))
        df['metapath'] = metapath
        df['compound_id'] = c_id
        df['disease_id'] = d_id
        dfs.append(df)
    # pandas.concat raises ValueError on an empty list; an observation with no
    # contributing metapaths simply has no paths to report.
    if not dfs:
        continue
    path_dfs.append(pandas.concat(dfs).sort_values('prediction_contribution', ascending=False))

path_df = pandas.concat(path_dfs)

In [189]:
# Save the path-level contributions as a tab-separated file without the index column.
paths_tsv = './predictions/paths.tsv'
path_df.to_csv(paths_tsv, sep='\t', index=False)