# Numpy Distance versus Tensorflow 

In [4]:
from MDAnalysis.core.distances import distance_array, self_distance_array
from MDAnalysis.analysis.distances import contact_matrix
import scipy.sparse
from scipy.spatial.distance import cdist
import numpy as np
import time, os, sys, gc
import datetime
import logging
import tensorflow as tf
logger = logging.getLogger("py4j")
logger.setLevel(logging.ERROR)



# Small Test Data

In [5]:
x = np.array((np.random.rand(10, 3)*10).astype(int))

# Numpy Broadcast Version

We extend both our 2-D array into 3 dimensions so that we can do an element-wise subtraction (broadcast): `x[:, None, :]`


From **Python Data Science Handbook** by Jake Vanderblas:

Broadcasting in NumPy follows a strict set of rules to determine the interaction between the two arrays:

1. If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.

1. If the shape of the two arrays does not match in any dimension, the array with shape equal to 1 in that dimension is stretched to match the other shape.

1. If in any dimension the sizes disagree and neither is equal to 1, an error is raised.



In [6]:
x[:, None, :].shape

(10, 1, 3)

In [7]:
x.shape

(10, 3)

In [8]:
(x[:, None, :]-x).shape

(10, 10, 3)

In [9]:
x.reshape(3,10)

array([[6, 6, 6, 3, 3, 1, 2, 2, 8, 4],
       [6, 6, 7, 6, 8, 6, 5, 5, 2, 9],
       [5, 8, 9, 1, 7, 5, 2, 5, 4, 5]])

In [39]:
def dist(x): 
    return np.sqrt(((x[:, None, :] - x) ** 2).sum(-1))

In [40]:
dist(x)

array([[  0.        ,   6.55743852,   6.        ,   2.        ,
          2.23606798,   1.41421356,   5.09901951,   6.164414  ,
          4.24264069,   2.44948974],
       [  6.55743852,   0.        ,   7.14142843,   5.91607978,
          8.60232527,   5.38516481,   7.28010989,   7.81024968,
          4.58257569,   4.58257569],
       [  6.        ,   7.14142843,   0.        ,   4.89897949,
          6.40312424,   5.83095189,   7.61577311,  11.5758369 ,
          8.36660027,   4.69041576],
       [  2.        ,   5.91607978,   4.89897949,   0.        ,
          3.60555128,   2.44948974,   3.74165739,   7.07106781,
          5.09901951,   2.44948974],
       [  2.23606798,   8.60232527,   6.40312424,   3.60555128,
          0.        ,   3.31662479,   6.55743852,   7.68114575,
          6.08276253,   4.12310563],
       [  1.41421356,   5.38516481,   5.83095189,   2.44948974,
          3.31662479,   0.        ,   5.65685425,   6.        ,
          3.16227766,   1.41421356],
       [  

# Scikit Learn

In [11]:
from sklearn.metrics.pairwise import euclidean_distances
euclidean_distances(x,x)

array([[  0.        ,  10.72380529,   7.34846923,   4.12310563,
          3.        ,   8.30662386,   6.40312424,   7.68114575,
          5.        ,   7.14142843],
       [ 10.72380529,   0.        ,  11.35781669,   9.48683298,
          9.69535971,   6.164414  ,   9.05538514,   9.16515139,
          6.78232998,   6.        ],
       [  7.34846923,  11.35781669,   0.        ,   3.60555128,
          8.66025404,   8.66025404,   7.68114575,   5.        ,
          9.43398113,   5.38516481],
       [  4.12310563,   9.48683298,   3.60555128,   0.        ,
          5.09901951,   7.34846923,   5.09901951,   4.24264069,
          6.        ,   4.24264069],
       [  3.        ,   9.69535971,   8.66025404,   5.09901951,
          0.        ,   9.16515139,   4.47213595,   7.07106781,
          3.16227766,   7.34846923],
       [  8.30662386,   6.164414  ,   8.66025404,   7.34846923,
          9.16515139,   0.        ,  10.19803903,   9.48683298,
          7.34846923,   4.69041576],
       [  

In [12]:
from scipy.spatial.distance import pdist
pdist(x)

array([ 10.72380529,   7.34846923,   4.12310563,   3.        ,
         8.30662386,   6.40312424,   7.68114575,   5.        ,
         7.14142843,  11.35781669,   9.48683298,   9.69535971,
         6.164414  ,   9.05538514,   9.16515139,   6.78232998,
         6.        ,   3.60555128,   8.66025404,   8.66025404,
         7.68114575,   5.        ,   9.43398113,   5.38516481,
         5.09901951,   7.34846923,   5.09901951,   4.24264069,
         6.        ,   4.24264069,   9.16515139,   4.47213595,
         7.07106781,   3.16227766,   7.34846923,  10.19803903,
         9.48683298,   7.34846923,   4.69041576,   3.74165739,
         5.09901951,   6.4807407 ,   7.34846923,   4.89897949,   6.164414  ])

# MD Analysis

In [18]:
def compute_distance_mdanalysis(coord):
    start = time.time()
    contact_matrix(coord, returntype="sparse")
    result="ComputeDistanceMDAnalysisSparse, %d, %.2f"%(len(coord), (time.time()-start))
    return result

# Tensorflow

In [38]:
import tensorflow as tf
import numpy as np

def compute_distance_tf(coord):
    start = time.time()
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    matrix1 = tf.convert_to_tensor(coord.astype("float32"))
    matrix_extend=tf.expand_dims(matrix1, 1)
    dist_matrix=tf.sub(matrix_extend, matrix1)
    dist_matrix_pow=tf.pow(dist_matrix, 2)
    dist_matrix_pow_red=tf.reduce_sum(dist_matrix_pow, 2)
    dist_matrix_euc=tf.sqrt(dist_matrix_pow_red)
    rc=sess.run(dist_matrix_euc)
    result="ComputeDistanceTensorflow, %d, %.2f"%(len(coord), (time.time()-start))
    sess.close()
    return result, rc

In [39]:
compute_distance_tf(x)

('ComputeDistanceTensorflow, 10, 0.03',
 array([[  0.        ,   9.64365005,   8.60232449,   7.8740077 ,
           6.48074055,   9.43398094,   9.21954441,   5.        ,
           8.12403774,  10.86277962],
        [  9.64365005,   0.        ,   6.08276224,   7.14142799,
           4.12310553,   1.99999988,   7.34846973,  10.48808861,
           8.66025352,   7.28010893],
        [  8.60232449,   6.08276224,   0.        ,   1.41421342,
           7.48331451,   6.70820427,   3.31662488,   6.5574379 ,
           9.48683357,   3.1622777 ],
        [  7.8740077 ,   7.14142799,   1.41421342,   0.        ,
           7.8740077 ,   7.68114567,   3.60555077,   5.38516474,
           9.38083076,   3.74165726],
        [  6.48074055,   4.12310553,   7.48331451,   7.8740077 ,
           0.        ,   4.12310553,   8.66025352,   9.11043262,
           6.78233004,   9.27361774],
        [  9.43398094,   1.99999988,   6.70820427,   7.68114567,
           4.12310553,   0.        ,   7.07106733,  10.

In [44]:
cutoff = 15.0

def get_edges_point(self, point_index, adjacency_matrix, cutoff=15.0):
    edge_list = []
    for index, i in np.ndenumerate(adjacency_matrix):
        print ("Index: %d, Value: %d"%(index[i], i))
        #if i==True and point_index<=index[1]:
        if point_index<=index[1] and i<cutoff:
            # Attention we only compute the upper half of the adjacency matrix
            # thus we need to offset the target edge vertice by point_index
            edge_list.append((point_index, point_index+index[1]))
    return edge_list
    
def compute_distance_tf_batch(coord, batch_size=5):
    start_time = time.time()
    num_batches = len(coord)/batch_size
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    matrix2 = tf.convert_to_tensor(coord.astype("float32"))
    print matrix2.get_shape()
    matrix1_splits = tf.split(0, num_batches, matrix2)
    edges=[]
    for split in matrix1_splits:
    #for i in range(num_batches):
        #start=i*batch_size
        #end=start+batch_size
        #print "Compute batch from %d to %d"%(start, end)
        #matrix1 = tf.convert_to_tensor(coord[start:end].astype("float32"))
        #matrix1 = tf.slice(matrix2, [start], [batch_size])
        matrix1 = split
        matrix_extend=tf.expand_dims(matrix1, 1)
        dist_matrix=tf.sub(matrix_extend, matrix2)
        dist_matrix_pow=tf.pow(dist_matrix, 2)
        dist_matrix_pow_red=tf.reduce_sum(dist_matrix_pow, 2)
        dist_matrix_euc=tf.sqrt(dist_matrix_pow_red) >cutoff
        dist_custoff_true=tf.where(dist_matrix_euc)
        #edges.append(sess.run(dist_matrix_euc))
        edges.append(sess.run(dist_custoff_true))
    sess.close()    
    result="ComputeDistanceTensorflow, %d, %.2f"%(len(coord), (time.time()-start_time))
    return (result, edges)

In [45]:
len(x)/3

3

In [46]:
compute_distance_tf_batch(x)

TensorShape([Dimension(10), Dimension(3)])


('ComputeDistanceTensorflow, 10, 0.10',
 [array([], shape=(0, 2), dtype=int64), array([], shape=(0, 2), dtype=int64)])

# Benchmark

In [48]:
DATA_PATH="../../data/mdanalysis/synthetic/traj/"
files=[os.path.join(DATA_PATH, i) for i in os.listdir(DATA_PATH)]

files=["../../data/mdanalysis/synthetic/traj/10000.np_txt"]

for file_name in files:
    print "Process: " + file_name
    coord = np.loadtxt(file_name, dtype='float32')
    start = time.time()
    result=compute_distance_tf_batch(coord, batch_size=4000)
    print result[0]

Process: ../../data/mdanalysis/synthetic/traj/10000.np_txt
TensorShape([Dimension(10000), Dimension(3)])
ComputeDistanceTensorflow, 10000, 10.01


In [7]:
!ls "../../data/mdanalysis/synthetic/traj/"

10.np_txt      100.np_txt     100000.np_txt  1600000.np_txt 200000.np_txt  400000.np_txt  800000.np_txt


# Theano

In [12]:
import theano
import theano.tensor as T
from theano import shared

tx=shared(x)
diff=tx-tx
diff_squared=diff**2

diff_squared.eval()

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])