# Convert features into a matrix

In [1]:
import bz2

import pandas
import numpy
from scipy.special import logit

In [2]:
# Load the compound-disease pairs; every row is one observation
pair_df = pandas.read_csv('features/compound-disease-pairs.tsv.bz2', sep='\t')
pair_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0
1,DB01048,Abacavir,DOID:9206,Barrett's esophagus,,0


In [3]:
# Load the prior probability table (one prior_prob per compound-disease pair)
prior_df = pandas.read_csv('../all-features/data/matrix/prior.tsv', sep='\t')
prior_df.tail(2)

Unnamed: 0,compound_id,disease_id,prior_prob
209166,DB01624,DOID:12306,0.0
209167,DB01624,DOID:1245,0.0


In [4]:
# Load the per-compound and per-disease degree tables
compound_degree_df = pandas.read_table('../all-features/data/matrix/compound_degree.tsv')
disease_degree_df = pandas.read_table('../all-features/data/matrix/disease_degree.tsv')

# Keep the first (identifier) column as is and prefix every other column
# with 'degree_' so degree features are recognizable after merging.
for degree_df in (compound_degree_df, disease_degree_df):
    degree_df.columns = [
        name if position == 0 else 'degree_' + name
        for position, name in enumerate(degree_df.columns)
    ]

disease_degree_df.head(2)

Unnamed: 0,disease_id,degree_DaG,degree_DdG,degree_DlA,degree_DpC,degree_DpS,degree_DrD,degree_DtC,degree_DuG
0,DOID:0050156,18,250,4,1,8,2,0,250
1,DOID:0050425,12,0,16,10,21,6,0,0


In [5]:
# Load the long-format DWPC table (one row per compound-disease-metapath)
dwpc_df = pandas.read_csv('features/dwpc.tsv.bz2', sep='\t')
dwpc_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,metapath,PC,w,DWPC,seconds
0,hetio-ind,DB01048,DOID:10652,CpDpCtD,0,0.4,0.0,0.06404
1,hetio-ind,DB01048,DOID:14330,CpDpCtD,0,0.4,0.0,0.01049


In [6]:
# Reshape DWPCs from long to wide: one row per compound-disease pair and one
# 'dwpc_'-prefixed column per metapath.
dwpc_mat_df = (
    dwpc_df
    .pivot_table(index=['compound_id', 'disease_id'], columns='metapath', values='DWPC')
    .add_prefix('dwpc_')
    .reset_index()
)
dwpc_mat_df.head(2)

Unnamed: 0,compound_id,disease_id,dwpc_CbG<rG<rGaD,dwpc_CbG<rGaD,dwpc_CbG<rGaDrD,dwpc_CbG<rGcGaD,dwpc_CbG<rGeAlD,dwpc_CbG<rGiGaD,dwpc_CbG<rGr>GaD,dwpc_CbGaD,...,dwpc_CuGaDuGaD,dwpc_CuGcG<rGaD,dwpc_CuGcGiGaD,dwpc_CuGdDpSpD,dwpc_CuGr>GbCtD,dwpc_CuGuCrCtD,dwpc_CuGuCtD,dwpc_CuGuCtDrD,dwpc_CuGuCuGaD,dwpc_CuGuDuGaD
0,DB00014,DOID:0050156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001531,0.000289,0.000216
1,DB00014,DOID:0050425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000277,0.0,0.0,0.0,0.0,0.0,0.0,0.000973,0.0


### Combine all observation-by-feature matrices

In [7]:
# Join every feature table onto the compound-disease pairs. The join keys are
# spelled out so that an unexpectedly shared column name cannot silently
# become part of the join condition (the default is to join on all common
# columns). All joins are inner joins, as before.
feature_mat_df = (
    pair_df
    .merge(prior_df, on=['compound_id', 'disease_id'])
    .merge(compound_degree_df, on='compound_id')
    .merge(disease_degree_df, on='disease_id')
    .merge(dwpc_mat_df, on=['compound_id', 'disease_id'])
)

In [8]:
# Preview the first two rows of the combined feature matrix
feature_mat_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,degree_CbG,degree_CcSE,degree_CdG,...,dwpc_CuGaDuGaD,dwpc_CuGcG<rGaD,dwpc_CuGcGiGaD,dwpc_CuGdDpSpD,dwpc_CuGr>GbCtD,dwpc_CuGuCrCtD,dwpc_CuGuCtD,dwpc_CuGuCtDrD,dwpc_CuGuCuGaD,dwpc_CuGuDuGaD
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0,0.004753,3,136,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DB05812,Abiraterone,DOID:10652,Alzheimer's disease,,0,0.004753,8,71,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# (rows, columns) of the combined feature matrix
feature_mat_df.shape

(209168, 165)

In [10]:
# Save the untransformed feature matrix as a bzip2-compressed TSV,
# without the index and with floats written to 4 significant digits.
with bz2.open('features/features.tsv.bz2', mode='wt') as tsv_handle:
    feature_mat_df.to_csv(tsv_handle, index=False, sep='\t', float_format='%.4g')

## Transform

In [11]:
# Transform a copy so feature_mat_df keeps the untransformed values
trans_df = feature_mat_df.copy()
degree_features = list(trans_df.columns[trans_df.columns.str.startswith('degree_')])
dwpc_features = list(trans_df.columns[trans_df.columns.str.startswith('dwpc_')])

# Transform prior: place prior_logit directly after prior_prob rather than at
# a hard-coded position, so the layout survives upstream column changes.
# NOTE: logit(0) is -inf, so pairs whose prior_prob is 0 get prior_logit = -inf.
prior_logit_loc = trans_df.columns.get_loc('prior_prob') + 1
trans_df.insert(prior_logit_loc, 'prior_logit', logit(trans_df['prior_prob']))

# Transform degree features (arcsinh is log-like for large values, 0 at 0)
for feature in degree_features:
    trans_df[feature] = numpy.arcsinh(trans_df[feature])

# Transform DWPC features: rescale by the column mean, then arcsinh
for feature in dwpc_features:
    x = trans_df[feature]
    scale = x.mean()
    # A column whose mean is 0 would become all-NaN (0 / 0); leave it unscaled
    trans_df[feature] = numpy.arcsinh(x / scale if scale != 0 else x)

# Standardize all features besides the prior (zero mean, unit sample std)
for feature in degree_features + dwpc_features:
    x = trans_df[feature]
    centered = x - x.mean()
    spread = x.std()
    # A constant column has std 0; keep it centered instead of dividing to NaN
    trans_df[feature] = centered / spread if spread != 0 else centered

trans_df.head(3)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,prior_logit,degree_CbG,degree_CcSE,...,dwpc_CuGaDuGaD,dwpc_CuGcG<rGaD,dwpc_CuGcGiGaD,dwpc_CuGdDpSpD,dwpc_CuGr>GbCtD,dwpc_CuGuCrCtD,dwpc_CuGuCtD,dwpc_CuGuCtDrD,dwpc_CuGuCuGaD,dwpc_CuGuDuGaD
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0,0.004753,-5.3443,-0.295458,0.814594,...,-0.502775,-0.421062,-0.458137,-0.53992,-0.357021,-0.384854,-0.375,-0.564218,-0.555543,-0.495207
1,DB05812,Abiraterone,DOID:10652,Alzheimer's disease,,0,0.004753,-5.3443,0.549549,0.555388,...,-0.502775,-0.421062,-0.458137,-0.53992,-0.357021,-0.384854,-0.375,-0.564218,-0.555543,-0.495207
2,DB00659,Acamprosate,DOID:10652,Alzheimer's disease,,0,0.004753,-5.3443,0.905283,1.0224,...,-0.502775,-0.421062,-0.458137,-0.53992,-0.357021,-0.384854,-0.375,-0.564218,-0.555543,-0.495207


In [12]:
# Save the transformed feature matrix as a bzip2-compressed TSV,
# without the index and with floats written to 4 significant digits.
with bz2.open('features/transformed-features.tsv.bz2', mode='wt') as tsv_handle:
    trans_df.to_csv(tsv_handle, index=False, sep='\t', float_format='%.4g')