# Convert features into a matrix

In [1]:
import bz2

import pandas

In [2]:
# Load the compound-disease pairs; each row is one observation
pair_df = pandas.read_csv('features/compound-disease-pairs.tsv.bz2', sep='\t')
pair_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,rel_type,status
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0
1,DB01048,Abacavir,DOID:9206,Barrett's esophagus,,0


In [3]:
# Load the prior probability computed for each compound-disease pair
prior_df = pandas.read_csv('../all-features/data/matrix/prior.tsv', sep='\t')
prior_df.tail(2)

Unnamed: 0,compound_id,disease_id,prior_prob
209166,DB01624,DOID:12306,0.0
209167,DB01624,DOID:1245,0.0


In [4]:
# Load the per-compound and per-disease degree feature tables
compound_degree_df = pandas.read_csv('../all-features/data/matrix/compound_degree.tsv', sep='\t')
disease_degree_df = pandas.read_csv('../all-features/data/matrix/disease_degree.tsv', sep='\t')
disease_degree_df.head(2)

Unnamed: 0,disease_id,DaG,DdG,DlA,DpC,DpS,DrD,DtC,DuG
0,DOID:0050156,18,250,4,1,8,2,0,250
1,DOID:0050425,12,0,16,10,21,6,0,0


In [5]:
# Load the DWPC values in long format: one row per (compound, disease, metapath)
dwpc_df = pandas.read_csv('features/dwpc.tsv.bz2', sep='\t')
dwpc_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,metapath,PC,w,DWPC,seconds
0,hetio-ind,DB01048,DOID:5408,CpDpCtD,0,0.4,0.0,0.6888
1,hetio-ind,DB01048,DOID:2986,CpDpCtD,0,0.4,0.0,0.707


In [6]:
# Reshape the long DWPC table to wide form: one row per compound-disease pair,
# one column per metapath. pivot_table aggregates with the mean by default, so
# repeated (compound_id, disease_id, metapath) entries would be averaged.
# NOTE(review): the `hetnet` and `w` columns are not part of the pivot; this
# presumably relies on the file holding a single hetnet/w setting -- confirm.
dwpc_mat_df = (
    dwpc_df
    .pivot_table(index=['compound_id', 'disease_id'], columns='metapath', values='DWPC')
    .reset_index()
)

### Combine all observation-by-feature matrices

In [7]:
# Join every feature table onto the observations. Each merge is an inner join on
# the columns the two frames have in common (pandas' default when `on` is omitted).
# validate='many_to_one' makes pandas raise MergeError if a feature table holds
# more than one row per join key, which would otherwise silently duplicate
# observations. Pairs absent from any feature table are still dropped by the
# inner joins, so the resulting row count is worth checking against pair_df.
feature_mat_df = (
    pair_df
    .merge(prior_df, validate='many_to_one')
    .merge(compound_degree_df, validate='many_to_one')
    .merge(disease_degree_df, validate='many_to_one')
    .merge(dwpc_mat_df, validate='many_to_one')
)

In [8]:
# Preview the first two rows of the combined feature matrix
feature_mat_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,rel_type,status,prior_prob,CbG,CcSE,CdG,...,CtDrDrDrD,CtDrDuGaD,CtDtCbGaD,CtDtCtD,CtDtCuGaD,CtDuGcGuD,CuGaDpCtD,CuGiGuCpD,CuGuD,CuGuDpCtD
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0,0.004753,3,136,0,...,0.0,0.0,0.002638,0.0,0.000779,0.00739,0.0,0.0,0.0,0.0
1,DB05812,Abiraterone,DOID:10652,Alzheimer's disease,,0,0.004753,8,71,0,...,0.0,0.0,0.005065,0.0,0.009089,0.001119,0.0,0.0,0.0,0.0


In [9]:
# Dimensions of the combined matrix as (rows, columns)
feature_mat_df.shape

(209168, 89)

In [10]:
# Save the combined matrix as a bzip2-compressed TSV without the row index;
# floats are written with four significant digits.
with bz2.open('features/features.tsv.bz2', mode='wt') as write_file:
    feature_mat_df.to_csv(
        write_file,
        sep='\t',
        index=False,
        float_format='%.4g',
    )