In [1]:
import numpy as np
import pandas as pd
import numexpr as ne

from math import sqrt
from multiprocessing import Pool

In [2]:
def create_random_df(mean, std, n):
    '''build an n-row frame of two random 2-D points per row

    Columns are x1, y1 (first point) and x2, y2 (second point); every value is
    drawn from a normal distribution with the given mean and std.
    '''
    column_names = ['x1', 'y1', 'x2', 'y2']
    samples = np.random.normal(mean, std, (n, len(column_names)))
    return pd.DataFrame(samples, columns=column_names)

# 10,000 random point pairs with coordinates ~ N(20, 1)
df = create_random_df(20, 1, 10000)
# print() call form works on both Python 2 and Python 3
print(df.head())

          x1         y1         x2         y2
0  22.468385  17.611699  21.022606  18.604954
1  18.649893  20.080207  21.193146  20.475932
2  20.832434  21.683929  19.582718  19.806201
3  18.654435  20.354341  20.534422  20.184434
4  20.887474  21.169106  19.831849  19.520060


In [3]:
def euclidean_dist1(df):
    '''row-wise distance between (x1, y1) and (x2, y2), written out by hand'''
    dx = df['x1'] - df['x2']
    dy = df['y1'] - df['y2']
    squared_dist = np.square(dx) + np.square(dy)
    return np.sqrt(squared_dist)

def euclidean_dist2(df):
    '''row-wise distance via numpy's norm of the difference between the two points'''
    first_points = df[['x1', 'y1']].values
    second_points = df[['x2', 'y2']].values
    deltas = first_points - second_points
    return np.linalg.norm(deltas, axis=1)

def dist(row):
    '''distance between the points (row[0], row[1]) and (row[2], row[3])'''
    dx = row[0] - row[2]
    dy = row[1] - row[3]
    return sqrt(dx**2 + dy**2)

def euclidean_dist3(df):
    '''use multiprocessing to compute distances in parallel

    Maps dist over the rows of df.values with a pool of 4 worker processes
    and returns the distances as a list, one entry per row.
    '''
    p = Pool(processes=4)
    try:
        return p.map(dist, df.values)
    finally:
        # always stop the workers, even when map raises, so no processes leak
        p.terminate()
        # wait for the terminated workers to exit before returning
        p.join()

def euclidean_dist4(df):
    '''evaluate the distance formula with numexpr, which uses multi-threading to make use of all cores'''
    # hand the four columns to numexpr explicitly instead of via local variables
    operands = {name: df[name] for name in ('x1', 'y1', 'x2', 'y2')}
    return ne.evaluate('sqrt((x1 - x2)**2 + (y1 - y2)**2)', local_dict=operands)

In [4]:
# sanity check: all functions return the same result
# np.allclose only compares its first two arguments (the 3rd and 4th are rtol/atol),
# so compare every implementation against a single reference result instead
reference = euclidean_dist1(df)
all(np.allclose(reference, impl(df)) for impl in (euclidean_dist2, euclidean_dist3, euclidean_dist4))

True

In [5]:
# time each implementation on the current df (the 10,000-row frame when cells are run in order)
%timeit euclidean_dist1(df)
%timeit euclidean_dist2(df)
%timeit euclidean_dist3(df)
%timeit euclidean_dist4(df)

1000 loops, best of 3: 440 µs per loop
1000 loops, best of 3: 1.07 ms per loop
10 loops, best of 3: 117 ms per loop
1000 loops, best of 3: 216 µs per loop


In [6]:
# rebind df to a 100,000-row frame and repeat the timings
df = create_random_df(20, 1, 100000)
%timeit euclidean_dist1(df)
%timeit euclidean_dist2(df)
%timeit euclidean_dist3(df)
%timeit euclidean_dist4(df)

100 loops, best of 3: 2.22 ms per loop
100 loops, best of 3: 4.86 ms per loop
1 loop, best of 3: 522 ms per loop
1000 loops, best of 3: 781 µs per loop


In [7]:
# rebind df to a 1,000,000-row frame and repeat the timings
df = create_random_df(20, 1, 1000000)
%timeit euclidean_dist1(df)
%timeit euclidean_dist2(df)
%timeit euclidean_dist3(df)
%timeit euclidean_dist4(df)

10 loops, best of 3: 21.3 ms per loop
10 loops, best of 3: 49.8 ms per loop
1 loop, best of 3: 4.88 s per loop
100 loops, best of 3: 6.97 ms per loop


<p>All the functions above should scale linearly in the size of the input.</p>
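<p>euclidean_dist3, which uses multiprocessing (a pool of worker processes, not threads), doesn't do as well, probably because the overhead of spawning the processes, sending each row to a worker and evaluating it in pure Python is high.</p>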
<p>euclidean_dist4 is the fastest since it uses all the cores available. For example, the computation should be up to 4 times faster if we have 4 cores available. See the numexpr website for more on how it improves performance: https://github.com/pydata/numexpr</p>