# REFs and Imports

BEST : https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html

Other Refs:

- https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6
- https://github.com/s-heisler/pycon2017-optimizing-pandas/blob/master/pyCon%20materials/PyCon%20un-sad%20Pandas.ipynb
- https://realpython.com/fast-flexible-pandas/


In [2]:
import pandas as pd
import numpy as np
from math import *

In [3]:
#!pip install line_profiler

In [4]:
#!pip install cython

In [5]:
%load_ext line_profiler

In [6]:
df = pd.read_csv('new_york_hotels.csv', encoding='cp1252')

In [7]:
df.head(3)

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16


In [8]:
df.shape

(1631, 11)

In [9]:
df_big = df.append(df).append(df).append(df)
df_big = df_big.append(df_big).append(df_big)
df_big = df_big.append(df_big).append(df_big)
df_big = df_big.append(df_big)
df_big = df_big.reset_index(drop=True)

In [10]:
df_big.shape

(117432, 11)

Haversine distance

In [11]:
# Define a basic Haversine distance formula
def haversine(lat1, lon1, lat2, lon2):
    MILES = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    total_miles = MILES * c
    return total_miles

# Crude Loop <font color='red'>414 ms /// 29 s</font>

In [11]:
# Define a function to manually loop over all rows and return a series of distances
def haversine_looping(df):
    distance_list = []
    for i in range(0, len(df)):
        d = haversine(40.671, -73.985, df.iloc[i]['latitude'], df.iloc[i]['longitude'])
        distance_list.append(d)
    return distance_list

In [12]:
%%timeit

# Run the haversine looping function
df['distance'] = haversine_looping(df)

389 ms ± 20 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%lprun -f haversine haversine_looping(df)

Picture of lprun results

<img src="lprune_pics/lprun_loop.png">

In the "big" case

In [67]:
%%timeit

# Run the haversine looping function
attempt = haversine_looping(df_big)

29 s ± 986 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# iterrows() <font color='salmon'>152 ms /// 11 s</font>

In [7]:
%%timeit

# Haversine applied on rows via iteration
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))
df['distance'] = haversine_series

152 ms ± 5.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [62]:
def haver_iter(df):
    haversine_series = []
    for index, row in df.iterrows():
        haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))
    df['distance'] = haversine_series

In [13]:
%lprun -f haversine haver_iter(df)

Picture of lprun results

<img src="lprune_pics/lprun_iter.png">

In the "big" case

In [68]:
%timeit haver_iter(df_big)

11 s ± 489 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# apply() <font color='orange'>61 ms /// 4.2 s</font>

In [8]:
%%timeit

# Timing apply on the Haversine function
df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

61.8 ms ± 4.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [44]:
# Haversine applied on rows with line profiler
%lprun -f haversine df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

Picture of lprun results

<img src="lprune_pics/lprun0.png">

In the "big" case

In [69]:
%%timeit
# Timing apply on the Haversine function
attempt = df_big.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

4.24 s ± 96.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Vectorization <font color='darkgreen'>1.48 ms /// 5.6 ms</font>

In [17]:
%%timeit 

# Vectorized implementation of Haversine applied on Pandas series
df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

1.48 ms ± 107 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
# Vectorized implementation profile
%lprun -f haversine haversine(40.671, -73.985, df['latitude'], df['longitude'])

Picture of lprun results, **HITS** went from 1631 to 1.

<img src="lprune_pics/lprun_vec.png">

In the "big" case

In [70]:
%%timeit 
# Vectorized implementation of Haversine applied on Pandas series
attempt = haversine(40.671, -73.985, df_big['latitude'], df_big['longitude'])

5.62 ms ± 232 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Vectorization with NumPy <font color='lime'>165 $\mu$s /// 3.5 ms</font>

In [19]:
%%timeit

# Vectorized implementation of Haversine applied on NumPy arrays
df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

165 µs ± 1.58 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [20]:
%lprun -f haversine df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

Picture of lprun results

<img src="lprune_pics/lprun_numpy.png">

In the "big" case

In [72]:
%%timeit
# Vectorized implementation of Haversine applied on NumPy arrays
attempt = haversine(40.671, -73.985, df_big['latitude'].values, df_big['longitude'].values)

3.53 ms ± 91 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# CYTHON

## No mods

https://cython.readthedocs.io/en/latest/src/quickstart/build.html

In [12]:
%load_ext cython

In [13]:
%%cython -a

# Haversine cythonized (no other edits)
import numpy as np # must be present even if already imported above
cpdef haversine_cy(lat1, lon1, lat2, lon2):
    miles_constant = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    mi = miles_constant * c
    return mi

In [14]:
%%timeit -n 20 -r 20
df['distance'] = \
        df.apply(lambda row: haversine_cy(40.671, -73.985,row['latitude'], row['longitude']), axis=1)

56.5 ms ± 3.33 ms per loop (mean ± std. dev. of 20 runs, 20 loops each)


In [15]:
%%timeit

# Vectorized implementation of Haversine applied on NumPy arrays
df['distance'] = haversine_cy(40.671, -73.985, df['latitude'].values, df['longitude'].values)

161 µs ± 464 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In the "big" case

In [24]:
%%timeit 
attempt = \
        df_big.apply(lambda row: haversine_cy(40.671, -73.985,row['latitude'], row['longitude']), axis=1)

4.03 s ± 122 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit

# Vectorized implementation of Haversine applied on NumPy arrays
attempt = haversine_cy(40.671, -73.985, df_big['latitude'].values, df_big['longitude'].values)

3.34 ms ± 97.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Mods

In [19]:
%%cython -a

# Haversine cythonized with mods

from numpy import deg2rad # must be present even if already imported above
from libc.math cimport pow, sqrt, sin, cos, asin

cpdef float haversine_cy_mod(float lat1, float lon1, float lat2, float lon2):
    cdef int miles_constant = 3959
    lat1, lon1, lat2, lon2 = map(deg2rad, [lat1, lon1, lat2, lon2])
    cdef float dlat = lat2 - lat1 
    cdef float dlon = lon2 - lon1 
    cdef float a = pow(sin(dlat/2),2) + cos(lat1) * cos(lat2) * pow(sin(dlon/2),2)
    cdef float c = 2 * asin(sqrt(a)) 
    cdef float mi = miles_constant * c
    return mi

In [20]:
%%timeit -n 20 -r 20

df['distance'] = df.apply(lambda row: haversine_cy_mod(40.671, -73.985,row['latitude'], row['longitude']), axis=1)

41.4 ms ± 1.81 ms per loop (mean ± std. dev. of 20 runs, 20 loops each)


The vectorized version doesnt work with these modifications

# NUMBA