# HDS 5230
## Week 4

High Performance Computing

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Read the pipe-separated clinics file into a DataFrame, then preview the first rows.
df = pd.read_csv('clinics.csv', sep='|')

df.head()

Unnamed: 0,bizID,bizCat,bizCatSub,bizName,bizAddr,bizCity,bizState,bizZip,bizPhone,bizFax,...,bizURL,locAreaCode,locFIPS,locTimeZone,locDST,locLat,locLong,locMSA,locPMSA,locCounty
0,1,Clinics,Clinics,Hino Ronald H MD,98-151 Pali Momi Street Suite 142,Aiea,HI,96701,(808)487-2477,,...,,808,15003,PST-2,N,21.398,-157.8981,3320.0,,Honolulu
1,2,Clinics,Clinics,Farmer Joesph F Md,1225 Breckenridge Drive,Little Rock,AR,72205,(501)225-2594,,...,,501,5119,CST,Y,34.7495,-92.3533,4400.0,,Pulaski
2,3,Clinics,Clinics & Medical Centers,Najjar Fadi Md,1155 West Linda Avenue Suite B,Hermiston,OR,97838,(541)289-1122,,...,,541,41059,PST,Y,45.8456,-119.2817,,,Umatilla
3,4,Clinics,Clinics & Medical Centers,Kittson Memorial Upper Level Nursing Home,1010 South Birch Avenue,Hallock,MN,56728,(218)843-2525,,...,,218,27069,CST,Y,48.7954,-97.009,,,Kittson
4,5,Clinics,Clinics & Medical Centers,Thompson Robert B Md,100 North Eagle Creek Drive,Lexington,KY,40509,(859)258-4000,,...,www.lexingtonclinic.com,859,21067,EST,Y,37.9935,-84.3712,4280.0,,Fayette


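Define the Haversine (great-circle) distance function, which returns the distance in miles between two latitude/longitude points given in degrees: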

In [5]:
def haversine(lat1, lon1, lat2, lon2, radius=3959):
    """Great-circle distance between two points using the haversine formula.

    All coordinates are in decimal degrees. Each argument may be a scalar, a
    NumPy array, or a pandas Series; NumPy broadcasting applies, so a single
    fixed point can be compared against whole columns in one call.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : latitude and longitude of the second point(s), in degrees.
    radius : radius of the sphere; the result is expressed in the same unit.
        Defaults to 3959 (Earth radius in miles).

    Returns
    -------
    Distance(s) along the surface of the sphere, in the unit of ``radius``.
    Scalar inputs give a scalar; array/Series inputs give an element-wise
    result. NaN coordinates produce NaN distances.
    """
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # near-antipodal points, which would make arcsin return NaN; cap it at 1.
    # np.minimum is a ufunc, so scalars, arrays and Series are all handled.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

Method 1: Looping over row positions with `iloc`

In [17]:
%%timeit
def haversine_looping(df):
    """Compute each row's distance from the fixed reference point, one row at a time.

    Rows are visited by integer position. For every row, the 'locLat' and
    'locLong' values are passed to ``haversine`` together with the reference
    coordinates (40.671, -73.985).

    Returns a list holding one distance per row of ``df``, in row order.
    """
    distances = []
    for pos in range(len(df)):
        # Each coordinate is fetched with its own positional row lookup.
        dist = haversine(40.671, -73.985, df.iloc[pos]['locLat'], df.iloc[pos]['locLong'])
        distances.append(dist)
    return distances

# Store the looped distances in 'distance', the column every other method writes
# (the previous 'distnace' spelling left a stray, misspelled column in df).
df['distance'] = haversine_looping(df)

5.54 ms ± 7.64 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Method 2: Haversine applied to each row by iterating with `iterrows()`

In [9]:

%%timeit
# Walk the frame with iterrows(), collecting one distance per row.
row_distances = []

for _, row in df.iterrows():
    dist = haversine(40.671, -73.985, row['locLat'], row['locLong'])
    row_distances.append(dist)

df['distance'] = row_distances

1.56 ms ± 2.61 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Method 3: Haversine applied to each row with `DataFrame.apply`

In [13]:
%%timeit
# Row-wise apply: axis=1 hands each row to the lambda, which calls haversine
# with that row's 'locLat' and 'locLong' values.
df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['locLat'], row['locLong']), axis=1)

969 µs ± 968 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Method 4: Vectorized implementation of Haversine applied on Pandas series

In [14]:
%%timeit
# Pass the whole latitude/longitude columns (pandas Series) to haversine in a
# single call instead of going row by row.
df['distance'] = haversine(40.671, -73.985, df['locLat'], df['locLong'])

1.03 ms ± 1.45 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Method 5: Vectorized implementation of Haversine applied on NumPy arrays

In [16]:
%%timeit
# Single haversine call over the columns' underlying NumPy arrays (.values)
# rather than the Series objects themselves.
df['distance'] = haversine(40.671, -73.985, df['locLat'].values, df['locLong'].values)

106 µs ± 71.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
