# Combine all features into a single matrix

In [1]:
import bz2
import os
import configparser
import itertools

import pandas

## Read partitions

In [2]:
# Load the tab-separated partition table and preview its last rows
part_df = pandas.read_csv('data/partitions.tsv', sep='\t')
part_df.tail(n=2)

Unnamed: 0,hetnet,compound_id,disease_id,status
22648,rephetio-v2.0,DB08906,DOID:9970,0
22649,rephetio-v2.0_perm-2,DB08906,DOID:9970,0


## Create DWPC matrix

In [3]:
# Load the long-format DWPC results (tab-separated; compression is inferred
# from the .bz2 extension) and preview the last rows
dwpc_df = pandas.read_csv('data/dwpc.tsv.bz2', sep='\t')
dwpc_df.tail(n=2)

Unnamed: 0,hetnet,compound_id,disease_id,metapath,PC,w,DWPC,seconds
27308956,rephetio-v2.0,DB08906,DOID:9970,CdGeAeGaD,0,0.4,0.0,0.004798
27308957,rephetio-v2.0_perm-2,DB08906,DOID:9970,CdGeAeGaD,0,0.4,0.0,0.2529


In [4]:
# Spread the DWPC values into one column per metapath, with one row per
# (hetnet, compound_id, disease_id) combination, then restore the keys as columns
dwpc_spread_df = dwpc_df.pivot_table(
    values='DWPC',
    index=['hetnet', 'compound_id', 'disease_id'],
    columns='metapath',
).reset_index()
dwpc_spread_df.head(n=2)

metapath,hetnet,compound_id,disease_id,CbG<rG<rGaD,CbG<rG<rGdD,CbG<rG<rGuD,CbG<rGaD,CbG<rGaDrD,CbG<rGbCpD,CbG<rGbCtD,...,CuGuDpCpD,CuGuDpCtD,CuGuDpSpD,CuGuDrD,CuGuDrDrD,CuGuDtCpD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
0,rephetio-v2.0,DB00014,DOID:0050741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000413,0.0,0.0,0.0,0.0,0.0,0.000211,0.00116
1,rephetio-v2.0,DB00014,DOID:10283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000736,0.0,0.0,0.002448,0.000273,0.001891,0.0,0.000196


In [5]:
# Remove metapaths with missing DWPCs
# See https://github.com/dhimmel/learn/issues/1 for potential cause
# `is_complete` is a boolean Series indexed by column name: True when the
# column contains no nulls.
is_complete = pandas.isnull(dwpc_spread_df).sum() == 0

# Remove columns with missing data
dwpc_spread_df = dwpc_spread_df.loc[:, is_complete]

# The following columns have missing values and were removed.
# Invert the mask with `~` (logical NOT for boolean Series) rather than
# unary minus, which is not the idiomatic operator for booleans.
is_complete.index[~is_complete].tolist()

['CbGeAeGaD',
 'CdG<rGeAlD',
 'CdGeAeGaD',
 'CdGeAeGdD',
 'CdGeAeGuD',
 'CuG<rGeAlD',
 'CuGeAeGaD',
 'CuGeAeGdD',
 'CuGeAeGuD',
 'CuGeAuGaD']

In [6]:
# Write the DWPC matrix as a bz2-compressed, tab-separated file
path = 'data/matrix/dwpc.tsv.bz2'
# This is the first write into data/matrix, so make sure the directory exists
# (no-op when it is already there).
os.makedirs(os.path.dirname(path), exist_ok=True)
with bz2.open(path, 'wt') as wf:
    dwpc_spread_df.to_csv(wf, index=False, sep='\t', float_format='%.5g')

## Calculate Degree features

In [7]:
# Look up which commit of the integrate repository to read from
config = configparser.ConfigParser()
config.read('../config.ini')
commit = config['hetnet']['integrate_commit']

# Map each metaedge to its abbreviation; abbreviations become the degree column names
url = 'https://github.com/dhimmel/integrate/raw/{}/data/summary/metaedge-styles.tsv'.format(commit)
metaedge_style_df = pandas.read_table(url)
metaedge_to_abbreviation = dict(zip(metaedge_style_df.metaedge, metaedge_style_df.abbreviation))

url = 'https://github.com/dhimmel/integrate/raw/{}/data/summary/degrees.xlsx'.format(commit)


def read_degree_sheet(workbook_url, sheet, id_column, abbreviations):
    """
    Read one sheet of the degrees workbook as a degree-feature table.

    Parameters
    ----------
    workbook_url : str
        Location of the degrees.xlsx workbook.
    sheet : str
        Name of the sheet to read (e.g. 'Disease' or 'Compound').
    id_column : str
        New name for the sheet's `node_id` column.
    abbreviations : dict
        Mapping used to rename the remaining columns (metaedge -> abbreviation).

    Returns
    -------
    pandas.DataFrame
        The sheet with `node_id` renamed, `node_name` dropped, and columns
        renamed according to `abbreviations`.
    """
    # `sheet_name` is the keyword current pandas accepts; the older
    # `sheetname` spelling is no longer supported.
    degree_df = pandas.read_excel(workbook_url, sheet_name=sheet)
    degree_df = degree_df.rename(columns={'node_id': id_column}).drop('node_name', axis='columns')
    return degree_df.rename(columns=abbreviations)


disease_degree_df = read_degree_sheet(url, 'Disease', 'disease_id', metaedge_to_abbreviation)
compound_degree_df = read_degree_sheet(url, 'Compound', 'compound_id', metaedge_to_abbreviation)

In [8]:
# Preview the compound degree features
compound_degree_df.head(n=2)

Unnamed: 0,compound_id,CbG,CcSE,CdG,CiPC,CpD,CrC,CtD,CuG
0,DB00014,2,249,0,1,0,7,2,1
1,DB00035,5,142,0,3,0,11,0,0


In [9]:
# Preview the disease degree features
disease_degree_df.head(n=2)

Unnamed: 0,disease_id,DaG,DdG,DlA,DpC,DpS,DrD,DtC,DuG
0,DOID:0050156,18,250,4,1,8,2,0,250
1,DOID:0050425,12,0,16,10,21,6,0,0


In [10]:
# Save both degree tables as tab-separated files without the row index
for tsv_path, frame in [
    ('data/matrix/compound_degree.tsv', compound_degree_df),
    ('data/matrix/disease_degree.tsv', disease_degree_df),
]:
    frame.to_csv(tsv_path, sep='\t', index=False)

## Compute prior dataset

In [11]:
# Load the compound and disease summary tables (tab-separated)
compound_df = pandas.read_csv('../summary/compounds.tsv', sep='\t')
disease_df = pandas.read_csv('../summary/diseases.tsv', sep='\t')
# Number of possible compound-disease pairs
total_pairs = compound_df.shape[0] * disease_df.shape[0]
total_pairs

209168

In [12]:
# For every compound-disease pair, the prior is the product of the two
# `treats` values divided by the number of possible pairs
rows = [
    (compound.compound_id, disease.disease_id, compound.treats * disease.treats / total_pairs)
    for compound, disease in itertools.product(compound_df.itertuples(), disease_df.itertuples())
]

prior_df = pandas.DataFrame(rows, columns=['compound_id', 'disease_id', 'prior_prob'])
prior_df.head(n=2)

Unnamed: 0,compound_id,disease_id,prior_prob
0,DB01048,DOID:10652,1.9e-05
1,DB01048,DOID:9206,1e-05


In [13]:
# Fraction of compound-disease pairs with a nonzero prior
prior_df['prior_prob'].gt(0).value_counts(normalize=True)

False    0.857536
True     0.142464
Name: prior_prob, dtype: float64

In [14]:
# Save the priors as a tab-separated file, formatting floats to 5 significant digits
prior_df.to_csv(
    'data/matrix/prior.tsv',
    sep='\t',
    index=False,
    float_format='%.5g',
)

## Create a single matrix-like dataframe

In [15]:
# Build the feature matrix by successive merges (pandas defaults: inner join
# on the columns the two frames share). Order: first two columns of the
# disease and compound summaries, the prior, both degree tables, then DWPCs.
matrix_df = (
    part_df
    .merge(disease_df.iloc[:, :2])
    .merge(compound_df.iloc[:, :2])
    .merge(prior_df)
    .merge(compound_degree_df)
    .merge(disease_degree_df)
    .merge(dwpc_spread_df)
)

In [16]:
# Preview the combined feature matrix
matrix_df.head(n=2)

Unnamed: 0,hetnet,compound_id,disease_id,status,disease_name,compound_name,prior_prob,CbG,CcSE,CdG,...,CuGuDpCpD,CuGuDpCtD,CuGuDpSpD,CuGuDrD,CuGuDrDrD,CuGuDtCpD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
0,rephetio-v2.0,DB00014,DOID:0050741,0,alcohol dependence,Goserelin,3.8e-05,2,249,0,...,0.0,0.0,0.000413,0.0,0.0,0.0,0.0,0.0,0.000211,0.00116
1,rephetio-v2.0_perm-5,DB00014,DOID:0050741,0,alcohol dependence,Goserelin,3.8e-05,2,249,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001268,0.003163


In [17]:
# Pair each feature type with the feature (column) names that belong to it.
# The degree tables skip their leading id column; the DWPC table skips its
# three leading key columns.
feature_groups = [
    ('prior', ['prior_prob']),
    ('degree', compound_degree_df.columns[1:]),
    ('degree', disease_degree_df.columns[1:]),
    ('dwpc', dwpc_spread_df.columns[3:]),
]
df_creators = [
    {'feature_type': feature_type, 'feature': features}
    for feature_type, features in feature_groups
]
# One small frame per group, stacked into a single feature -> type table
feature_df = pandas.concat([pandas.DataFrame(creator) for creator in df_creators])
feature_df.head(n=2)

Unnamed: 0,feature,feature_type
0,prior_prob,prior
0,CbG,degree


In [18]:
# Save the feature -> feature type lookup table
feature_df.to_csv('data/matrix/feature-type.tsv', sep='\t', index=False)

# Save the combined feature matrix as a bz2-compressed, tab-separated file
path = 'data/matrix/features.tsv.bz2'
with bz2.open(path, mode='wt') as wf:
    matrix_df.to_csv(wf, sep='\t', index=False, float_format='%.5g')

In [19]:
# Save hetnet specific feature files
# Keep only the rows for the named hetnet and drop the now-constant column
unperm_name = 'rephetio-v2.0'
unperm_matrix_df = matrix_df.query("hetnet == @unperm_name").drop('hetnet', axis='columns')
directory = os.path.join('data', 'matrix', unperm_name)
# makedirs(exist_ok=True) replaces the check-then-create pair: it also creates
# missing parent directories and does nothing when the directory exists.
os.makedirs(directory, exist_ok=True)
path = os.path.join(directory, 'features.tsv.bz2')
with bz2.open(path, 'wt') as wf:
    unperm_matrix_df.to_csv(wf, index=False, sep='\t', float_format='%.5g')