In [1]:
import numpy as np
import numpy
from scipy import sparse
import functools, operator, collections, itertools, pandas as pd

## Import both DWPC data-sets

In [2]:
# Load the reference DWPC table.
# DataFrame.from_csv no longer exists in current pandas; read_csv with
# index_col=0 followed by reset_index() reproduces the frame it used to build.
# NOTE(review): from_csv also tried to parse the index as dates; that is not
# carried over on the assumption that the first column is not a date - confirm.
rep_results = pd.read_csv('data/dwpc.tsv', sep='\t', index_col=0).reset_index()

# Keep only the rows belonging to the rephetio-v2.0 hetnet, then drop the
# column that was used for the filter.
rep_results = rep_results[rep_results['hetnet'] == 'rephetio-v2.0']
rep_results = rep_results.drop(columns='hetnet')

In [3]:
# Load the newly computed DWPC table.
# DataFrame.from_csv no longer exists in current pandas; read_csv with
# index_col=0 followed by reset_index() reproduces the frame it used to build.
dwpc_results = pd.read_csv('data/dwpc_data.tsv', sep='\t', index_col=0).reset_index()

###  Cut down the diseases and compounds 

In [4]:
# Distinct compound and disease identifiers that occur in the rephetio results
rep_compounds = set(rep_results['compound_id'])
rep_diseases = set(rep_results['disease_id'])

In [5]:
# Keep only the rows whose compound AND disease both occur in the rephetio
# results. Series.isin is the vectorized form of the per-element membership
# test, and one combined mask replaces two successive filters.
known_compound = dwpc_results['compound'].isin(rep_compounds)
known_disease = dwpc_results['disease'].isin(rep_diseases)
dwpc_results = dwpc_results[known_compound & known_disease]

In [6]:
# Build the list of (compound, disease) pairs present in the rephetio results
compound_ids = rep_results['compound_id']
disease_ids = rep_results['disease_id']
rep_pairs = [(compound, disease) for compound, disease in zip(compound_ids, disease_ids)]

In [7]:
# Designate the rows that are within rephetio pairs.
# A set gives constant-time membership tests; scanning the rep_pairs list for
# every row made this cell quadratic.
rep_pair_lookup = set(rep_pairs)

# The pair is taken from the first two columns by position (as before), but via
# iloc on whole columns instead of integer-indexing each row Series.
first_col = dwpc_results.iloc[:, 0]
second_col = dwpc_results.iloc[:, 1]
cond = [pair in rep_pair_lookup for pair in zip(first_col, second_col)]

In [8]:
# Apply the boolean row mask built in the previous cell
dwpc_results = dwpc_results[list(cond)]

## Re-order the columns

In [9]:
# Keep the first two columns in place and sort the remaining column names
all_columns = list(dwpc_results.columns)
leading_columns = all_columns[:2]
remaining_columns = sorted(all_columns[2:])
header = leading_columns + remaining_columns

In [10]:
# Re-select the columns in the order computed in `header`
dwpc_results = dwpc_results.loc[:, header]

In [11]:
# Preview the first five rows of the filtered, re-ordered table
dwpc_results.head(n=5)

Unnamed: 0,compound,disease,CbG<rG<rGaD,CbG<rG<rGdD,CbG<rG<rGuD,CbG<rGaD,CbG<rGaDrD,CbG<rGbCpD,CbG<rGbCtD,CbG<rGcGaD,...,CuGuDpCpD,CuGuDpCtD,CuGuDpSpD,CuGuDrD,CuGuDrDrD,CuGuDtCpD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
9,DB00014,DOID:10283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000736,0.0,0.0,0.002448,0.000273,0.001891,0.0,0.000196
10,DB00014,DOID:10534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001628,0.0,0.00123,0.0,0.0,0.001286,0.0,0.0
35,DB00014,DOID:12236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000672,0.0,0.001609,0.0,0.0,0.000478,0.0,0.0
37,DB00014,DOID:12361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000814,0.0,0.0,0.0,0.0,0.000417,0.0,0.0
58,DB00014,DOID:1612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000572,0.0,0.000784,0.001128,0.0,0.002099,0.001107,0.001325


In [12]:
# Preview the first five rows of the reference table for a visual comparison
rep_results.head(n=5)

Unnamed: 0,compound_id,disease_id,CbG<rG<rGaD,CbG<rG<rGdD,CbG<rG<rGuD,CbG<rGaD,CbG<rGaDrD,CbG<rGbCpD,CbG<rGbCtD,CbG<rGcGaD,...,CuGuDpCpD,CuGuDpCtD,CuGuDpSpD,CuGuDrD,CuGuDrDrD,CuGuDtCpD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
0,DB00014,DOID:10283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000736,0.0,0.0,0.002449,0.000273,0.001891,0.0,0.000196
1,DB00014,DOID:10534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001628,0.0,0.00123,0.0,0.0,0.001286,0.0,0.0
2,DB00014,DOID:12236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000672,0.0,0.001609,0.0,0.0,0.000478,0.0,0.0
3,DB00014,DOID:12361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000814,0.0,0.0,0.0,0.0,0.000417,0.0,0.0
4,DB00014,DOID:1612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000572,0.0,0.000784,0.001128,0.0,0.002099,0.001107,0.001325


# Comparison

---

## Import both DWPC data-sets

In [13]:
def get_both(path, rep_path='data/dwpc.tsv', hetnet='rephetio-v2.0'):
    """Load the reference and the newly computed DWPC tables in comparable form.

    Both tables get a leading ``tuples`` column holding the
    (compound, disease) pair of each row, followed by the remaining columns in
    descending name order; the separate compound/disease columns are removed.

    Parameters
    ----------
    path : str
        Tab-separated file with the new DWPC values; must contain the columns
        ``compound`` and ``disease``.
    rep_path : str, optional
        Tab-separated reference file; must contain the columns ``hetnet``,
        ``compound_id`` and ``disease_id``.
    hetnet : str, optional
        Only reference rows whose ``hetnet`` column equals this value are kept.

    Returns
    -------
    (rep_results, dwpc_results) : tuple of pandas.DataFrame
        ``rep_results`` is reduced to the columns that also exist in the new
        table; ``dwpc_results`` is reduced to the rows whose pair occurs in
        ``rep_results``. Row order and index labels are not changed, so the
        two frames are not guaranteed to be positionally aligned.
    """
    def read_table(tsv_path):
        # Replacement for the removed DataFrame.from_csv: the first column is
        # read as the index and then turned back into an ordinary column.
        return pd.read_csv(tsv_path, sep='\t', index_col=0).reset_index()

    def order_df(df, idn):
        # Return a copy of df with a 'tuples' pair column first, the other
        # columns in descending name order, and the id columns removed.
        suffix = idn * '_id'  # idn=1 -> '<name>_id' columns, idn=0 -> plain names
        compound_col = 'compound' + suffix
        disease_col = 'disease' + suffix

        # assign() builds a new frame, so neither the caller's frame nor a
        # filtered slice is written to.
        paired = df.assign(tuples=list(zip(df[compound_col], df[disease_col])))

        # 'tuples' sorts after every capitalised metapath name, so descending
        # order puts it at the front.
        ordered_cols = sorted(paired.columns, reverse=True)
        return paired[ordered_cols].drop(columns=[compound_col, disease_col])

    rep_results = read_table(rep_path)

    # Get only the rows from the requested hetnet, then drop that column
    rep_results = rep_results[rep_results['hetnet'] == hetnet].drop(columns='hetnet')
    rep_results = order_df(rep_results, 1)

    dwpc_results = order_df(read_table(path), 0)

    # Drop reference columns that the new table does not have
    dwpc_columns = set(dwpc_results.columns)
    rep_results = rep_results[[col for col in rep_results.columns if col in dwpc_columns]]

    # Keep only the new rows whose pair also occurs in the reference table.
    # A set makes each membership test constant-time; .loc keeps this a row
    # selection even when the mask is empty.
    rep_pairs = set(rep_results['tuples'])
    in_reference = [pair in rep_pairs for pair in dwpc_results['tuples']]
    dwpc_results = dwpc_results.loc[in_reference]

    return rep_results, dwpc_results

In [14]:
def order_df(df, idn):
    """Return a copy of ``df`` keyed by a leading ``tuples`` pair column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding a compound column and a disease column.
    idn : int or bool
        1/True when those columns are named ``compound_id``/``disease_id``,
        0/False when they are named ``compound``/``disease``.

    Returns
    -------
    pandas.DataFrame
        New frame whose columns are in descending name order (which puts
        ``tuples`` ahead of the capitalised metapath names), with the compound
        and disease columns removed. ``df`` itself is left unmodified.
    """
    suffix = idn * '_id'
    compound_col = 'compound' + suffix
    disease_col = 'disease' + suffix

    # Create a new column with pairs of compounds and diseases. assign()
    # returns a new frame instead of writing the column into the caller's one.
    paired = df.assign(tuples=list(zip(df[compound_col], df[disease_col])))

    # Sort the columns with 'tuples' at the front
    ordered_cols = sorted(paired.columns, reverse=True)
    return paired[ordered_cols].drop(columns=[compound_col, disease_col])

In [15]:
# Reload both tables in their comparable (tuples-first) form
rep_results, dwpc_results = get_both(path='data/dwpc_data.tsv')

In [16]:
# Renumber the filtered rows from 0. drop=True discards the old labels instead
# of inserting them as an extra 'index' column, so re-running the cell is safe.
dwpc_results = dwpc_results.reset_index(drop=True)

Now that the two dataframes have identical rows and columns in exactly the same order, subtract one from the other and find the largest difference.

In [17]:
# Align rows on the (compound, disease) pair rather than on the positional
# index, so the result does not depend on both frames sharing index labels.
# Duplicated pairs would make this alignment ambiguous.
rep_scores = rep_results.set_index('tuples').loc[:, 'CuGuCtDrD':'CbG<rG<rGaD']
dwpc_scores = dwpc_results.set_index('tuples').loc[:, 'CuGuCtDrD':'CbG<rG<rGaD']

# NOTE(review): the columns are in descending name order, so this slice starts
# at 'CuGuCtDrD' and leaves out every column that sorts after it (for example
# 'CuGuDuGuD') - confirm whether those metapaths should be compared as well.

# Largest absolute deviation over all compared cells; the signed maximum would
# hide cells where the new value is larger than the reference value.
rep_scores.sub(dwpc_scores).abs().max().max()

5.000000000032756e-06

This number is extremely small, so our new method has been successful for all metapaths!