In [15]:
import collections
import json

import pandas
import numpy
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot
import seaborn

In [16]:
# Read the tab-separated inputs: transformed features, untransformed
# features, and model coefficients share the default parser settings.
feature_df, untran_df, coef_df = map(pandas.read_table, [
    'features/transformed-features.tsv.bz2',
    'features/features.tsv.bz2',
    'model/coefficient.tsv',
])
# The predictions table is parsed in a single pass (low_memory=False).
predict_df = pandas.read_table('predictions/probabilities.tsv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
# Terms excluded from the per-feature contribution analysis.
drop = {'prior_logit', 'intercept'}
# Keep only the remaining features with a nonzero coefficient, ordered by coefficient.
coef_df = (
    coef_df
    .query("feature not in @drop")
    .query("coef != 0")
    .sort_values('coef')
)
features = coef_df.feature.tolist()
# Subset: features named 'dwpc_*' whose coefficient is positive.
subcoef_df = coef_df.loc[
    lambda df: df.feature.str.startswith('dwpc_') & (df.coef > 0)
]
subcoef_df.head(2)

Unnamed: 0,feature,coef,zcoef
37,dwpc_CbGdCrCtD,0.000694,0.001141
101,dwpc_CrCbGaD,0.002035,0.002283


In [18]:
# Coefficients indexed by feature name, so they align with feature_df's columns.
coef_series = pandas.Series(coef_df.coef.tolist(), index=coef_df.feature)
# Scale every retained feature column by its coefficient, re-attach the
# compound/disease identifiers (index-aligned join), merge onto the
# predictions via their shared columns, and order by descending prediction.
contrib_df = (
    predict_df
    .merge(
        feature_df[['compound_id', 'disease_id']].join(
            feature_df[coef_df.feature].mul(coef_series, axis='columns')
        )
    )
    .sort_values('prediction', ascending=False)
)

In [19]:
# Preview the first two rows of the contribution table.
contrib_df.iloc[:2]

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,prediction,training_prediction,compound_percentile,...,dwpc_CbGeAlD,dwpc_CbGpPWpGaD,dwpc_CpDpCtD,dwpc_CcSEcCtD,dwpc_CrCrCtD,dwpc_CtDrD,dwpc_CrCtD,dwpc_CiPCiCtD,dwpc_CbGaD,dwpc_CbGbCtD
139518,DB00282,Pamidronate,DOID:11476,osteoporosis,DM,1,0.03893,0.886899,0.977108,1.0,...,0.052756,-0.02465,-0.005732,0.243191,1.143319,1.109366,1.689815,2.056862,1.023269,1.218499
138453,DB00630,Alendronate,DOID:11476,osteoporosis,DM,1,0.03893,0.884991,0.976682,1.0,...,0.044759,0.049125,-0.005732,0.241482,1.143319,1.109366,1.689815,2.056862,0.871453,1.253708


In [20]:
# Write one bar chart per selected compound–disease pair to a multi-page PDF.
# Each chart shows the coefficient-weighted value of every retained feature,
# colored by the sign of the feature's coefficient.

# Loop-invariant inputs: feature order and coefficient signs are the same for every row.
feature_names = list(coef_df.feature)
coef_signs = list(numpy.sign(coef_df.coef))

# First 10 rows of each `status` group, in contrib_df's current row order.
plot_df = contrib_df.groupby('status').head(10)

# The context manager closes the PDF even if plotting a row raises.
with PdfPages('./figure/linear-predictor-plots.pdf') as pdf:
    for _, row in plot_df.iterrows():
        fig, ax = matplotlib.pyplot.subplots(figsize=(5, 7))
        try:
            seaborn.barplot(
                x=list(row[feature_names]),
                y=feature_names,
                hue=coef_signs,
                ax=ax)
            ax.set_title('{}–{} · {:.2%} · {}'.format(
                row['compound_name'], row['disease_name'], row['prediction'], row['category']))
            pdf.savefig(fig, bbox_inches='tight')
        finally:
            # Always release the figure so a failure does not leak open figures.
            matplotlib.pyplot.close(fig)

## Extra

In [21]:
# Estimate each positive-coefficient DWPC feature's share of an observation's
# total positive contribution.
# Negative contributions are floored at zero before normalizing.
pos_contrib_df = contrib_df[subcoef_df.feature].clip(lower=0)
# Row-wise total of the remaining (non-negative) contributions.
observation_total = pos_contrib_df.sum(axis='columns')
# Divide each row by its total; a row whose total is 0 yields NaN shares.
pos_contrib_df = pos_contrib_df.div(observation_total, axis='index')
# Re-attach identifiers, merge onto the predictions, and order by ascending prediction.
pos_contrib_df = (
    predict_df
    .merge(contrib_df[['compound_id', 'disease_id']].join(pos_contrib_df))
    .sort_values('prediction', ascending=True)
)
pos_contrib_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,prediction,training_prediction,compound_percentile,...,dwpc_CbGeAlD,dwpc_CbGpPWpGaD,dwpc_CpDpCtD,dwpc_CcSEcCtD,dwpc_CrCrCtD,dwpc_CtDrD,dwpc_CrCtD,dwpc_CiPCiCtD,dwpc_CbGaD,dwpc_CbGbCtD
12699,DB01262,Decitabine,DOID:2986,IgA glomerulonephritis,,0,0.0,0.00021,,0.007353,...,0.081643,0.842476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13576,DB00896,Rimexolone,DOID:2986,IgA glomerulonephritis,,0,0.0,0.000217,,0.007353,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Create a dictionary of (compound_id, disease_id, metapath) keys to lookup untransformed DWPCs
dwpc_melt_df = pandas.melt(untran_df, id_vars=['compound_id', 'disease_id'],
    value_vars=list(subcoef_df.feature), var_name='feature', value_name='dwpc')
dwpc_melt_df.head()
untran_dwpc_map = dict()
for row in dwpc_melt_df.itertuples():
    key = row.compound_id, row.disease_id, row.feature.split('_')[1]
    untran_dwpc_map[key] = row.dwpc

In [23]:
# Build one ordered record per observation with prediction > 0.05, holding
# the descriptive fields plus the nonzero metapath contribution shares.
observation_keys = [
    'compound_id', 'compound_name', 'disease_id', 'disease_name',
    'category', 'status', 'prediction', 'training_prediction',
]
obj = list()
for _, row in pos_contrib_df.query("prediction > 0.05").iterrows():
    observation = collections.OrderedDict(
        (field, row[field]) for field in observation_keys
    )
    # Omit the category entry entirely when it is missing.
    if pandas.isnull(observation['category']):
        del observation['category']
    contribs = collections.OrderedDict()
    for feature in subcoef_df.feature:
        percent_contrib = row[feature]
        if percent_contrib != 0:
            # Key by the metapath, i.e. the feature name without its prefix.
            _, metapath = feature.split('_', 1)
            contribs[metapath] = percent_contrib
    observation['metapath_contribution'] = contribs
    obj.append(observation)
len(obj)

1070

In [24]:
# Load the metapath definitions from JSON.
with open('features/metapaths.json') as read_file:
    metapaths = json.loads(read_file.read())

In [25]:
# Cypher tail appended to every metapath's DWPC query. It returns, per path,
# the node names, the path's term divided by the `dwpc` parameter
# (PDP_contribution), and that value scaled by the `metapath_contribution`
# parameter (prediction_contribution), keeping only rows with
# prediction_contribution >= 0.005.
# NOTE(review): `path` and `degrees` are presumably bound by the retained
# part of each dwpc_query — confirm against features/metapaths.json.
pdp_query_suffix = '''\
    WITH
    extract(n in nodes(path)| n.name) AS nodes,
    sum(reduce(pdp = 1.0, d in degrees| pdp * d ^ -0.4)) / { dwpc } AS PDP_contribution
    WITH
    nodes,
    PDP_contribution,
    PDP_contribution * { metapath_contribution } AS prediction_contribution
    WHERE prediction_contribution >= 0.005
    RETURN nodes, prediction_contribution, PDP_contribution
    ORDER BY prediction_contribution
    '''
# For each metapath, keep the DWPC query text before its first 'RETURN' and
# append the tail above; results are keyed by metapath abbreviation.
metapath_to_query = {
    metapath['abbreviation']: metapath['dwpc_query'].split('RETURN')[0] + pdp_query_suffix
    for metapath in metapaths
}

In [26]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import py2neo
# Graph handle used to execute the Cypher path queries further down.
# Requires a server reachable at this URL — presumably the project's Neo4j
# instance; confirm host and port before running.
neo = py2neo.Graph('http://localhost:7500/db/data/')

In [27]:
# Restrict the observation records to those for multiple sclerosis.
ms_obj = [
    observation
    for observation in obj
    if observation['disease_name'] == 'multiple sclerosis'
]

In [28]:
# For the first 15 multiple-sclerosis observations, query Neo4j for the
# individual paths behind each contributing metapath and collect them into
# one table of per-path prediction contributions.
path_dfs = list()
for elem in ms_obj[:15]:
    dfs = list()
    c_id = elem['compound_id']
    d_id = elem['disease_id']
    for metapath, contribution in elem['metapath_contribution'].items():
        # Untransformed DWPC for this pair and metapath; the query divides by it.
        untran_dwpc = untran_dwpc_map[(c_id, d_id, metapath)]
        pdp_query = metapath_to_query[metapath]
        # NOTE(review): `n` is passed through unchanged from the original call;
        # whether the query text references it cannot be seen from here.
        results = neo.cypher.execute(
            pdp_query,
            source=c_id,
            target=d_id,
            n=10,
            dwpc=untran_dwpc,
            metapath_contribution=contribution,
        )
        df = pandas.DataFrame(results.records, columns=results.columns)
        # Collapse each path's list of node names into one em-dash-joined string.
        df['nodes'] = df['nodes'].map('—'.join)
        df['metapath'] = metapath
        df['compound_id'] = c_id
        df['disease_id'] = d_id
        dfs.append(df)
    if not dfs:
        # No contributing metapaths for this pair: pandas.concat would raise
        # on an empty list, so skip it instead of aborting the whole loop.
        continue
    path_dfs.append(pandas.concat(dfs).sort_values('prediction_contribution', ascending=False))

# Still raises ValueError if no observation produced any frame at all.
path_df = pandas.concat(path_dfs)

In [29]:
# Save the per-path contributions as a tab-separated file without the index column.
path_df.to_csv(
    './predictions/paths.tsv',
    sep='\t',
    index=False,
)