# Collapsed source/target edge contributions to epilepsy predictions

In [1]:
import pandas

In [2]:
# Compound summary table, pinned to a specific commit of dhimmel/learn.
url = 'https://github.com/dhimmel/learn/raw/d2251a942813015d0362a90f179c961016336e77/summary/compounds.tsv'
# Only the id and name columns are kept.
compound_df = pandas.read_csv(url, sep='\t').loc[:, ['compound_id', 'compound_name']]

In [3]:
# Read the windows table, rename its `name` column to `compound_name`, and
# merge with compound_df on the columns the two tables share (default inner
# join), which brings in `compound_id`.
top_compounds_df = pandas.merge(
    pandas.read_csv('./data/windows.tsv', sep='\t').rename(columns={'name': 'compound_name'}),
    compound_df,
)
top_compounds_df.head(2)

Unnamed: 0,compound_name,prediction,disease_pctl,phcodb,trials,category,min_pred,max_pred,freq_AIGD,freq_IGD,freq_UNKD,compound_id
0,Topiramate,0.603,1.0,DM,35,AIGD,0.46,0.603,1.0,0.0,0.0,DB00273
1,Ethotoin,0.589,0.9993,,0,AIGD,0.434,0.603,1.0,0.0,0.0,DB00754


In [4]:
# Read one paths.tsv per compound in top_compounds_df (DOID_1826 directory)
# and stack the tables. The original row labels of each file are kept, so the
# index of path_df is not unique.
path_dfs = [
    pandas.read_csv(
        '../../het.io-rep-data/prediction-info/{}/DOID_1826/paths.tsv'.format(compound_id),
        sep='\t',
    )
    for compound_id in top_compounds_df['compound_id']
]
path_df = pandas.concat(path_dfs)
path_df.head(2)

Unnamed: 0,nodes,percent_of_prediction,percent_of_DWPC,source_edge,target_edge,metapath
0,Topiramate—migraine—epilepsy syndrome,0.178,1.0,Topiramate—treats—migraine,epilepsy syndrome—resembles—migraine,CtDrD
1,Topiramate—GRIK5—epilepsy syndrome,0.0385,0.249,Topiramate—binds—GRIK5,epilepsy syndrome—associates—GRIK5,CbGaD


In [5]:
def summarize(df):
    """Summarize one group of paths as a two-entry Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the path table for a single group. Must contain a
        ``percent_of_prediction`` column.

    Returns
    -------
    pandas.Series
        float64 Series with index ``['paths', 'contribution']``: the number
        of rows in ``df`` and the sum of ``percent_of_prediction``.
    """
    # Vectorized sum instead of the Python builtin. skipna=False keeps the
    # builtin's behaviour of propagating NaN rather than silently dropping it.
    contribution = df['percent_of_prediction'].sum(skipna=False)
    # Build the Series in one step with an explicit index and dtype rather
    # than enlarging a dtype-less empty Series, whose default dtype differs
    # between pandas versions.
    return pandas.Series(
        [len(df), contribution],
        index=['paths', 'contribution'],
        dtype='float64',
    )

## Source edge contributions

In [6]:
# Replace the text before the first '—' of every source edge with the generic
# label 'Compound', then summarize the paths sharing each resulting edge and
# order the edges by total contribution, largest first.
source_df = (
    path_df
    .assign(source_edge=lambda df: df['source_edge'].map(
        lambda edge: 'Compound—{}'.format(edge.split('—', 1)[1])))
    .groupby('source_edge')
    .apply(summarize)
    .reset_index()
    .sort_values(by='contribution', ascending=False)
)
source_df.head()

Unnamed: 0,source_edge,paths,contribution
1437,Compound—includes—Decreased Central Nervous Sy...,238.0,6.3412
1429,Compound—includes—Benzodiazepines,52.0,3.8446
104,Compound—binds—GABRA1,12385.0,2.819223
1519,Compound—resembles—Diazepam,402.0,2.708075
1438,Compound—includes—General Anesthesia,6.0,2.456


In [7]:
# `paths` holds whole-number counts stored as floats. Cast it to int before
# writing so float_format='%.5g' only rounds `contribution`. Left as float,
# a count of 100000 or more would be written rounded to 5 significant digits.
(source_df
    .assign(paths=lambda df: df['paths'].astype(int))
    .to_csv('data/source-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
)
len(source_df)

1667

## Target edge contributions

In [8]:
# Summarize the paths sharing each target edge and order the edges by total
# contribution, largest first.
target_df = (
    path_df
    .groupby(by='target_edge')
    .apply(summarize)
    .reset_index()
    .sort_values(by='contribution', ascending=False)
)
target_df.head()

Unnamed: 0,target_edge,paths,contribution
355,epilepsy syndrome—treats—Diazepam,6843.0,8.123404
354,epilepsy syndrome—treats—Clonazepam,6488.0,6.27389
362,epilepsy syndrome—treats—Midazolam,4832.0,6.116992
353,epilepsy syndrome—treats—Clobazam,4159.0,5.67081
351,epilepsy syndrome—treats—Amobarbital,2002.0,4.840363


In [9]:
# `paths` holds whole-number counts stored as floats. Cast it to int before
# writing so float_format='%.5g' only rounds `contribution`. Left as float,
# a count of 100000 or more would be written rounded to 5 significant digits.
(target_df
    .assign(paths=lambda df: df['paths'].astype(int))
    .to_csv('data/target-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
)
len(target_df)

375