# High performance

```Python
mask = (x > 0.5) & (y < 0.5)
```
ex:
df = df[df[mask]]

# Intermediate values in memory
´´´python
temp = (x > 0.5)
temp2 (y < 0.5)
mask = tmp1 & tmp2
´´´

Can use pd.eval("") -> performs elementwise directly using numexpr

Good for compound expressions

In [1]:
import numpy as np
import pandas as pd

nrows, ncols = 1000000, 100
df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)]
df1.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.383116,1.325772,-0.850583,0.478942,0.109862,0.202939,0.974377,1.415877,1.137297,-0.364979,...,0.806501,1.319331,-0.943088,0.76601,-0.763498,-0.54246,0.348946,-0.589862,0.366275,0.920043
1,-1.445233,1.302723,-0.451174,1.246797,2.705761,-0.080143,0.212093,1.284517,0.8814,0.560251,...,0.734202,-0.025936,2.332552,1.157261,1.451618,1.645595,-1.304122,-1.447577,0.214996,-1.190114
2,0.228578,-0.693271,-0.440112,0.098199,0.250028,1.128418,0.136831,0.776749,0.238162,1.517334,...,-0.727154,-1.570097,-0.168053,0.134417,1.09904,-0.500033,0.220809,-0.550406,0.184628,-1.847437
3,-0.83089,1.579971,1.038753,-1.796617,1.236998,0.889275,-0.411919,0.987442,-0.536708,-1.406357,...,-1.81154,0.200348,1.047035,-0.815309,-0.400975,-0.602829,-0.132343,1.037606,0.799041,0.922221
4,-0.710528,-0.101123,-1.217704,0.280437,1.881544,-0.884219,0.660293,-0.275393,-1.026402,0.051065,...,0.975879,-1.654539,-0.783887,-1.321788,-1.027736,1.621974,-3.474916,-0.303052,0.36697,2.248624


In [5]:
%timeit df1 + df2 + df3 + df4

339 ms ± 144 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
%timeit pd.eval("df1 + df2 + df3 + df4")

The slowest run took 6.50 times longer than the fastest. This could mean that an intermediate result is being cached.
236 ms ± 213 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
plain = df1 + df2 + df3 + df4
sum_eval = pd.eval("df1 + df2 + df3 + df4")
sum_eval.equals(plain)

True

In [9]:
# df.eval
rolls = pd.DataFrame(np.random.randint(1, 6, (6,3)), columns=["Die1", "Die2", "Die3"])
rolls.eval("Sum = Die1 + Die2 + Die3", inplace = True)
rolls

Unnamed: 0,Die1,Die2,Die3,Sum
0,3,5,2,10
1,5,2,5,12
2,3,3,4,10
3,2,3,4,9
4,2,2,1,5
5,4,2,1,7


In [10]:
# use variables
high = 10
rolls.eval("Winner = Sum > @high", inplace = True)
rolls

Unnamed: 0,Die1,Die2,Die3,Sum,Winner
0,3,5,2,10,False
1,5,2,5,12,True
2,3,3,4,10,False
3,2,3,4,9,False
4,2,2,1,5,False
5,4,2,1,7,False


In [11]:
# filter out "traditional" way
rolls[rolls["Sum"] <= high]

Unnamed: 0,Die1,Die2,Die3,Sum,Winner
0,3,5,2,10,False
2,3,3,4,10,False
3,2,3,4,9,False
4,2,2,1,5,False
5,4,2,1,7,False


# Query

In [12]:
rolls.query("Sum <= @high")

Unnamed: 0,Die1,Die2,Die3,Sum,Winner
0,3,5,2,10,False
2,3,3,4,10,False
3,2,3,4,9,False
4,2,2,1,5,False
5,4,2,1,7,False


In [13]:
os_data_src = "Data/athlete_events.csv"
os = pd.read_csv(os_data_src)
os.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [16]:
%timeit os[os["NOC"] == "SWE"]
%timeit os.query("NOC == 'SWE'")

10.9 ms ± 827 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.25 ms ± 629 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
%timeit os[os["Height"] > 180]
%timeit os.query("Height > 180")

5.34 ms ± 36.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.45 ms ± 61 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
%timeit os[(os["Sex"] == "F") & (os["Height"] > 180) & (os["NOC"] == "SWE")]
%timeit os.query("Sex == 'F' & Height > 180 & NOC == 'SWE'")

17.9 ms ± 161 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.3 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
os.query("Sex == 'F' & Height > 180 & NOC == 'SWE'")