# Sandbox
Notebook to test code before real implementation

In [None]:
################ ONLY PYTHON 2 #########################
from __future__ import division
from __future__ import print_function
from operator import itemgetter
from itertools import combinations, chain, product
from collections import defaultdict
import argparse
import time
import os
import tensorflow as tf
import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn import metrics
import pandas as pd
import psutil
import pickle
from decagon.deep.optimizer import DecagonOptimizer
from decagon.deep.model import DecagonModel
from decagon.deep.minibatch import EdgeMinibatchIterator
from decagon.utility import rank_metrics, preprocessing

In [8]:
def sparse_to_tuple(sparse_mx):
    """Unpack a SciPy sparse matrix into its COO components.

    A matrix that is not already in COO format is converted with
    ``tocoo()`` first.

    Returns:
        A ``(coords, values, shape)`` tuple: ``coords`` has one
        ``[row, col]`` pair per stored entry, ``values`` holds the stored
        data in the same order, and ``shape`` is the matrix shape.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # Stack rows over columns, then flip so each stored entry is one row.
    index_pairs = np.vstack([coo.row, coo.col]).T
    return index_pairs, coo.data, coo.shape

In [1]:
# IPython magic; per the recorded output it populates the interactive
# namespace from numpy and matplotlib.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import time
import os
import psutil
import pickle
from pybdm import BDM
from pybdm.utils import decompose_dataset
from pybdm.partitions import PartitionIgnore
from pybdm.partitions import PartitionRecursive
from joblib import Parallel, delayed
from joblib import parallel_backend
from data.algorithms import PerturbationExperiment, NodePerturbationExperiment
import math
import datetime as dt
from itertools import product

### Load adj matrix

In [3]:
# Read the pickled toy data structure and keep only its 'ppi_adj' entry.
with open(
    'data/data_structures/DS/DS_toy_DSE_600_PF_5_genes_500_drugs_400_se_4', 'rb'
) as f:
    ppi_adj = pickle.load(f)['ppi_adj']

### Create BDM object and calculate edge complexity

In [5]:
# Run the BDM perturbation experiment on the PPI network.
bdm = BDM(ndim=2, partition=PartitionRecursive)
ppi_per = PerturbationExperiment(bdm, metric='bdm', bipartite_network=False)
# The experiment is fed a dense ndarray copy of the adjacency matrix.
ppi_per.set_data(ppi_adj.toarray())
# Algorithmic complexity
edge_complexity = ppi_per.run()
# Lay the result out with the same shape as the adjacency matrix.
complexity_mat = edge_complexity.reshape(ppi_adj.shape)

### Get the coordinates of positive edges

In [20]:
# Only the coordinate array is needed; values and shape are discarded.
coords, _, _ = sparse_to_tuple(ppi_adj)
# Number of stored entries, i.e. rows of `coords`.
l = coords.shape[0]
print(coords.shape)
print(coords[:5])

(615, 2)
[[  0  94]
 [  0 143]
 [  1  10]
 [  2 167]
 [  3 152]]


### Select complexity values for true edges and take absolute value

In [14]:
# Magnitude of the complexity value at every true-edge coordinate, laid out
# as a single column. reshape(-1, 1) sizes the column from `coords` itself
# instead of hard-coding the edge count of one particular dataset.
true_cmplx = np.abs(complexity_mat[coords[:, 0], coords[:, 1]].reshape(-1, 1))
print(np.shape(true_cmplx))
print(true_cmplx[:5])

(615, 1)
[[0.03837811]
 [0.05435175]
 [0.04054595]
 [0.02965131]
 [0.05435175]]


### Concatenate values with index

In [15]:
# Pair every complexity value with its row position so the position survives
# sorting. The length comes from `true_cmplx` rather than a hard-coded edge
# count; the values are already absolute, so np.abs is not applied again.
a = np.concatenate((true_cmplx, np.arange(len(true_cmplx)).reshape(-1, 1)), axis=1)
print(np.shape(a))
print(a[:5])

(615, 2)
[[0.03837811 0.        ]
 [0.05435175 1.        ]
 [0.04054595 2.        ]
 [0.02965131 3.        ]
 [0.05435175 4.        ]]


### Sort values

In [18]:
# Reorder the rows by the first (complexity) column, largest first; the
# position column travels with its value.
sorted_values = a[np.argsort(a[:, 0])[::-1]]
print(sorted_values.shape)
print(sorted_values)

(615, 2)
[[2.66108470e+01 5.89000000e+02]
 [2.66108470e+01 5.41000000e+02]
 [2.66025186e+01 2.41000000e+02]
 ...
 [2.96513078e-02 3.71000000e+02]
 [2.96513078e-02 1.32000000e+02]
 [2.96513078e-02 3.07000000e+02]]


### Discard the lowest values of complexity

In [24]:
# Fraction of rows to drop from the tail of `sorted_values`.
cut_frac = 0.25
# Positions of the leading rows that are kept: floor(l * (1 - cut_frac)) of them.
remain = np.arange(int(np.floor(l * (1 - cut_frac))))
new_values = sorted_values[remain]
print(new_values.shape)
print(new_values[:5])

<class 'numpy.int64'>
(461, 2)
[[ 26.610847   589.        ]
 [ 26.610847   541.        ]
 [ 26.60251864 241.        ]
 [ 26.60251864 306.        ]
 [ 26.59817317 304.        ]]


### Select new true edges

In [29]:
# The second column holds row positions into `coords`; it is float after
# the concatenation, so cast back to int before indexing.
indices = new_values[:, 1].astype(int)
new_coords = coords[indices]
print(new_coords[:5])
print(new_coords.shape)
new_l = len(new_coords)

[[476 435]
 [435 476]
 [195 251]
 [251 195]
 [250 192]]
(461, 2)


### Creation of new adjacency matrix

In [33]:
# Sparse matrix with a 1 at every retained coordinate and the same shape as
# the original PPI matrix.
new_ppi_adj = sp.csr_matrix(
    (np.ones(new_l), (new_coords[:, 0], new_coords[:, 1])),
    shape=ppi_adj.shape,
)

## Check whether the adjacency matrices contain 1 on the diagonal

In [None]:
# Load the pickled data structure, then expose each of its entries as a
# notebook-level variable named after its key.
with open('data/data_structures/DS/DS_real_DSE_9700_genes_16837_drugs_636_se_7', 'rb') as f:
    DS = pickle.load(f)
for key in DS:
    globals()[key] = DS[key]
    print(key, "Imported successfully")

In [None]:
# Dense view of the first entry of ddi_adj_list, for visual inspection.
# presumably ddi_adj_list is one of the variables injected from the pickle -- verify
ddi_adj_list[0].todense()

# TWOSIDES

In [None]:
# TWOSIDES sample, restricted to its first six columns.
DF = pd.read_csv(
    '../Thesis_datasets/TWOSIDES/small.csv',
    sep=',',
    usecols=[0, 1, 2, 3, 4, 5],
)
# The bio-decagon-combo CSV, read in full.
DEC = pd.read_csv('data/original_data/bio-decagon-combo.csv', sep=',')

In [None]:
# Preview the first rows of the bio-decagon-combo table.
DEC.head()

In [None]:
# Preview the first ten rows of the TWOSIDES sample.
DF.head(10)

### Translation dictionaries for side effects
`meddra.tsv` is a SIDER table that lists each side effect's name together with both identifiers we need (UMLS CUI and MedDRA ID).

In [None]:
# Load the header-less MedDRA mapping table and name columns 0-3.
map_se = pd.read_csv(
    '../Thesis_datasets/SIDER/meddra.tsv', sep='\t', header=None
).rename(columns={0: 'UMLS', 1: 'kind', 2: 'MedDRA', 3: 'name'})
# Distinct values per column, to compare against the total row count.
cui = pd.unique(map_se['UMLS'].values)
meddra = pd.unique(map_se['MedDRA'].values)
ses = pd.unique(map_se['name'].values)
print('Total', len(map_se))
print('CUI', len(cui))
print('MedDRA', len(meddra))
print('Side Effects', len(ses))
map_se.head(5)

In [None]:
# Translation tables between the two side-effect vocabularies.
# Keys and values are read column-wise in row order, so when an id occurs on
# several rows the last row wins -- the same result as assigning row by row,
# without a slow per-row `.loc` lookup.
# NOTE(review): both tables are defaultdict(set) yet hold single ids, so
# looking up an unknown id silently returns (and stores) an empty set.
# Kept for compatibility -- confirm whether a plain dict was intended.
# Dictionary that translates UMLS CUIs to MedDRA IDs
umls2meddra = defaultdict(set, zip(map_se['UMLS'].values, map_se['MedDRA'].values))
# Dictionary that translates MedDRA IDs to UMLS CUIs
meddra2umls = defaultdict(set, zip(map_se['MedDRA'].values, map_se['UMLS'].values))
# Dictionary that translates UMLS CUIs to name, built only from rows whose
# 'kind' starts with 'PT'.
names = map_se[map_se['kind'].str.match('PT', na=False)].reset_index(drop=True)
print(len(names.index))
UMLS2names = dict(zip(names['UMLS'].values, names['name'].values))
# Verify numbers
print(len(umls2meddra))
print(len(meddra2umls))
print(len(UMLS2names))

In [None]:
# Dictionary that translates UMLS CUIs to name, built only from rows whose
# 'kind' starts with 'PT'.
names = map_se[map_se['kind'].str.match('PT', na=False)].reset_index(drop=True)
print(len(names.index))
# Column-wise construction: for a repeated CUI the last row wins, exactly as
# with row-by-row assignment, but without a per-row `.loc` lookup.
UMLS2names = dict(zip(names['UMLS'].values, names['name'].values))
print(len(UMLS2names))

### Replacing MedDRA IDs with UMLS CUIs and side-effect names

In [None]:
# Translate every MedDRA id to its UMLS CUI and store it as 'Condition',
# dropping the original id column.
ids = [meddra2umls[i] for i in DF['condition_meddra_id']]
DF['Condition'] = ids
DF = DF.drop(columns=['condition_meddra_id'])
# Look up the name of every CUI and store it as 'Condition_name',
# dropping the original name column.
name_list = [UMLS2names[i] for i in DF['Condition']]
DF['Condition_name'] = name_list
DF = DF.drop(columns=['condition_concept_name'])
DF.head(10)

In [None]:
# Load the tab-separated, header-less SIDER drug-name table.
map_drug = pd.read_csv('../Thesis_datasets/SIDER/drug_names.tsv',sep = '\t',header=None)

In [None]:
# Preview the first rows of the drug-name table.
map_drug.head()

## Testing fixed unigram candidate sampler found in optimization

In [None]:
# Sandbox check of tf.nn.fixed_unigram_candidate_sampler.
sess = tf.Session()
a = [7, 0, 20, 8, 33, 9]
# One label per row; the row count follows len(a) instead of a literal 6.
labels = tf.reshape(tf.constant(a, dtype=tf.int64), [len(a), 1])
# NOTE(review): true_classes receives the raw labels (up to 33) while
# range_max is len(a) == 6 -- confirm whether positions 0..5 were intended.
sampled_ids, true_expected_count, sampled_expected_count = tf.nn.fixed_unigram_candidate_sampler(
    true_classes=labels,
    num_true=1,
    num_sampled=20,
    unique=False,
    range_max=len(a),
    unigrams=[10, 10, 10, 10, 50, 10],
)
# Map the sampled positions back to the labels stored at those positions.
sample = tf.gather(labels, sampled_ids)
# Fetch everything in ONE run: the sampler draws anew on every sess.run
# call, so separate calls would print ids, expected counts and gathered
# labels that belong to different draws.
for fetched in sess.run([true_expected_count, sampled_ids, sampled_expected_count, sample]):
    print(fetched)

In [None]:
# Load the pickled data structure, then expose each of its entries as a
# notebook-level variable named after its key.
in_file = 'data/data_structures/DECAGON/DECAGON_real_affinities_genes_16814_drugs_276_se_7'
with open(in_file, 'rb') as f:
    DS = pickle.load(f)
for key in DS:
    globals()[key] = DS[key]
    print(key, "Imported successfully")

In [None]:
# Sum of all entries of the first matrix stored at adj_mats_orig[1, 0].
# presumably adj_mats_orig is one of the variables injected from the pickle -- verify
adj_mats_orig[1,0][0].todense().sum()

In [None]:
# Show the first slice along axis 0 of the three-axis val_metrics array.
# presumably val_metrics is one of the variables injected from the pickle -- verify
val_metrics[0,:,:]

In [None]:
# IPython magic; an earlier run of the same magic in this notebook reports
# populating the interactive namespace from numpy and matplotlib.
%pylab inline

In [None]:
# NOTE(review): the path points at an .xml file, but read_csv parses
# delimited text -- confirm this call is what was intended.
pd.read_csv('../Thesis_datasets/DrugBank/drugbank_all_full_database.xml/full database.xml')