# Tips and Tricks for Pandas


In [31]:
%load_ext Cython
import numba
import numpy as np
import pandas as pd

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [32]:
def make_df(n_rows: int = 10_000, seed: "int | None" = None) -> pd.DataFrame:
    """Build a frame of uniform random floats used as benchmark input.

    Parameters
    ----------
    n_rows
        Number of rows to generate. Defaults to 10,000, the size used by
        the benchmarks in this notebook.
    seed
        Optional seed. When given, the data comes from a dedicated
        ``np.random.default_rng(seed)`` generator, so repeated calls with
        the same seed return identical frames. When omitted, values are
        drawn from NumPy's global random state via ``np.random.rand``.

    Returns
    -------
    pd.DataFrame
        An ``n_rows`` x 3 frame with columns "A", "B" and "C" holding
        floats drawn uniformly from [0, 1).
    """
    if seed is None:
        # Legacy path: honours any np.random.seed(...) set by the caller.
        values = np.random.rand(n_rows, 3)
    else:
        values = np.random.default_rng(seed).random((n_rows, 3))
    return pd.DataFrame(values, columns=["A", "B", "C"])

In [33]:
df = make_df()

In [34]:
print(df.head())

          A         B         C
0  0.765605  0.050536  0.997545
1  0.067672  0.209466  0.733057
2  0.039797  0.554136  0.299882
3  0.095909  0.556513  0.967307
4  0.592292  0.325236  0.201376


## Speed-Up Apply

Use case:
Replace every value in a column with 0.0 if it is less than 0.5.


In [35]:
def slow_function(df: pd.DataFrame, col: str = "A") -> None:
    """Zero out values below 0.5 in one column, row by row (slow baseline).

    This is the deliberately slow reference implementation: it walks the
    frame with ``DataFrame.iterrows`` and handles one row per Python-level
    iteration.

    Parameters
    ----------
    df
        Frame to modify in place.
    col
        Label of the column to process. Must identify exactly one column.
        Defaults to "A".

    Returns
    -------
    None
        ``df`` is updated in place.
    """
    col_pos = df.columns.get_loc(col)
    for row_pos, (_, row) in enumerate(df.iterrows()):
        if row[col] < 0.5:
            # The Series yielded by iterrows() may be a copy, so assigning
            # to ``row`` is not guaranteed to reach the frame. Write through
            # the frame itself; positional access also stays correct when
            # the index contains duplicate labels.
            df.iat[row_pos, col_pos] = 0.0

In [36]:
%timeit slow_function(df)

299 ms ± 20.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
print(df.head())

          A         B         C
0  0.765605  0.050536  0.997545
1  0.000000  0.209466  0.733057
2  0.000000  0.554136  0.299882
3  0.000000  0.556513  0.967307
4  0.592292  0.325236  0.201376


In [38]:
def faster_function(df: pd.DataFrame) -> None:
    """Zero out values below 0.5 in column "B" using ``Series.apply``.

    The replacement rule is applied element by element and the resulting
    Series is assigned back to ``df["B"]``; nothing is returned.
    """

    def zero_if_small(value):
        # Values of 0.5 and above pass through untouched.
        if value < 0.5:
            return 0.0
        return value

    df["B"] = df["B"].apply(zero_if_small)

In [39]:
df = make_df()

In [40]:
%timeit faster_function(df)

1.21 ms ± 23.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [41]:
print(df.head())

          A         B         C
0  0.635418  0.603695  0.370619
1  0.623580  0.000000  0.983399
2  0.136205  0.835230  0.515819
3  0.549301  0.000000  0.550670
4  0.282295  0.000000  0.187677


In [42]:
def even_faster_function(df: pd.DataFrame) -> None:
    """Zero out values below 0.5 in column "C" with a vectorised ``np.where``.

    A boolean mask selects the entries below the threshold; ``np.where``
    picks 0.0 for those and the existing value otherwise. The result is
    assigned back to ``df["C"]``; nothing is returned.
    """
    column = df["C"]
    is_small = column < 0.5
    df["C"] = np.where(is_small, 0.0, column)

In [43]:
df = make_df()

In [44]:
%timeit even_faster_function(df)

119 µs ± 4.9 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [45]:
print(df.head())

          A         B         C
0  0.919487  0.879795  0.792813
1  0.174273  0.720798  0.000000
2  0.334481  0.872924  0.000000
3  0.925926  0.367924  0.000000
4  0.590363  0.697155  0.000000


In [46]:
%%cython
cimport cython
cimport numpy as np
import numpy as np

# Threshold kernel: overwrite, in place, every element of a 1-D double
# array that is below 0.5 with 0.0.
#
# boundscheck(False) / wraparound(False) turn off Cython's index bounds
# checks and negative-index handling for the buffer accesses below; the
# loop only uses indices in [0, array.shape[0]), so neither is needed.
# cpdef makes the function callable from Python as well as from C.
# NOTE(review): another %%cython cell in this notebook defines a cpdef
# function with the same name `inner` — confirm the name reuse is intended.
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef inner(np.ndarray[double, ndim=1] array):
    for i in range(array.shape[0]):
        if array[i] < 0.5:
            array[i] = 0.0

# Apply the threshold kernel `inner` to column "A" of `df`.
# The kernel receives `df["A"].values` and writes into that array, so the
# frame only changes if `.values` hands back a writable view of the column
# (the output recorded for this notebook shows column "A" being modified).
def cython_apply(df):
    inner(df["A"].values)


In [47]:
df = make_df()

In [48]:
%timeit cython_apply(df)

8.53 µs ± 569 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [49]:
print(df.head())

          A         B         C
0  0.000000  0.238601  0.702826
1  0.868471  0.038493  0.906479
2  0.538929  0.534994  0.532870
3  0.000000  0.398570  0.885484
4  0.823780  0.996472  0.399336


In [50]:
@numba.njit
def _numba_zero_below_half(array: np.ndarray) -> None:
    """Overwrite, in place, every element of ``array`` below 0.5 with 0.0.

    Compiled with ``numba.njit`` (explicit nopython mode). A bare
    ``@numba.jit`` without the ``nopython`` argument emits a deprecation
    warning on the Numba versions where its default was object-mode fallback.

    Parameters
    ----------
    array
        One-dimensional array, indexed from 0 to ``array.shape[0] - 1``.
    """
    for i in range(array.shape[0]):
        if array[i] < 0.5:
            array[i] = 0.0


# Keep the original public name available. The %%cython cells in this
# notebook also define a function called `inner`, so `numba_apply` below
# deliberately calls the uniquely named kernel instead of this alias.
inner = _numba_zero_below_half


def numba_apply(df: pd.DataFrame) -> None:
    """Zero out values below 0.5 in column "A" using the Numba kernel.

    The kernel receives ``df["A"].values`` and writes into that array, so
    the frame only changes if ``.values`` returns a writable view of the
    column (the output recorded for this notebook shows it being modified).
    """
    _numba_zero_below_half(df["A"].values)

  @numba.jit


In [51]:
df = make_df()

In [52]:
%timeit numba_apply(df)

7.36 µs ± 184 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [53]:
print(df.head())

          A         B         C
0  0.772160  0.642384  0.108635
1  0.000000  0.111692  0.437901
2  0.531210  0.309255  0.175349
3  0.848457  0.205686  0.850719
4  0.813487  0.958661  0.335560


| Function             | Time       | Speed-Up |
| -------------------- | ---------- | -------- |
| slow_function        | 377 ms     | -        |
| faster_function      | 1.4 ms     | 269x     |
| even_faster_function | 138 us     | 2,731x   |
| cython_apply         | 8.36 us    | 45,095x  |
| numba_apply          | 6.18 us    | 61,003x  |
| c++                  | 13.2301 us | 28,495x  |


## Speed-Up Transform


In [54]:
def slow_function2(df: pd.DataFrame, col: str = "A") -> None:
    """Add 1.0 to every value of one column, row by row (slow baseline).

    This is the deliberately slow reference implementation: it walks the
    frame with ``DataFrame.iterrows`` and handles one row per Python-level
    iteration.

    Parameters
    ----------
    df
        Frame to modify in place.
    col
        Label of the column to process. Must identify exactly one column.
        Defaults to "A".

    Returns
    -------
    None
        ``df`` is updated in place.
    """
    col_pos = df.columns.get_loc(col)
    for row_pos, (_idx, row) in enumerate(df.iterrows()):
        # The Series yielded by iterrows() may be a copy, so assigning to
        # ``row`` is not guaranteed to reach the frame. Write through the
        # frame itself; positional access also stays correct when the index
        # contains duplicate labels.
        df.iat[row_pos, col_pos] = row[col] + 1.0

In [55]:
df = make_df()

In [56]:
%timeit slow_function2(df)

338 ms ± 8.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
print(df.head())

          A         B         C
0  8.504680  0.368171  0.069273
1  8.227038  0.287186  0.918400
2  8.089656  0.874666  0.862913
3  8.920216  0.186675  0.251968
4  8.827670  0.988160  0.157106


In [58]:
def faster_function2(df: pd.DataFrame) -> None:
    """Add 1.0 to column "B" using ``Series.transform``.

    The increment is expressed as a Python callable handed to
    ``transform``; the resulting Series is assigned back to ``df["B"]``
    and nothing is returned.
    """

    def plus_one(value):
        return value + 1.0

    df["B"] = df["B"].transform(plus_one)

In [59]:
df = make_df()

In [60]:
%timeit faster_function2(df)

1.16 ms ± 26.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [61]:
print(df.head())

          A            B         C
0  0.331196  8111.366002  0.549351
1  0.155978  8111.093852  0.508282
2  0.715712  8111.329463  0.025127
3  0.094839  8111.834435  0.100286
4  0.559291  8111.595357  0.698990


In [62]:
def even_faster_function2(df: pd.DataFrame) -> None:
    """Add 1.0 to column "C" with a single vectorised Series operation.

    The incremented Series is assigned back to ``df["C"]``; nothing is
    returned.
    """
    incremented = df["C"].add(1.0)
    df["C"] = incremented

In [63]:
df = make_df()

In [64]:
%timeit even_faster_function2(df)

104 µs ± 3.01 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [65]:
print(df.head())

          A         B             C
0  0.530135  0.812159  81111.003906
1  0.026888  0.363115  81111.680093
2  0.802039  0.561515  81111.216162
3  0.074736  0.830822  81111.523468
4  0.317605  0.833871  81111.174935


In [66]:
%%cython
cimport cython
cimport numpy as np
import numpy as np

# Transform kernel: add 1.0, in place, to every element of a 1-D double
# array.
#
# boundscheck(False) / wraparound(False) turn off Cython's index bounds
# checks and negative-index handling for the buffer accesses below; the
# loop only uses indices in [0, array.shape[0]), so neither is needed.
# cpdef makes the function callable from Python as well as from C.
# NOTE(review): this reuses the name `inner` from the earlier %%cython
# cell and from the Numba cell — confirm that rebinding the name in the
# notebook namespace is intended.
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef inner(np.ndarray[double, ndim=1] array):
    for i in range(array.shape[0]):
        array[i] += 1.0

# Apply the transform kernel `inner` to column "A" of `df`.
# The kernel receives `df["A"].values` and writes into that array, so the
# frame only changes if `.values` hands back a writable view of the column
# (the output recorded for this notebook shows column "A" being modified).
def cython_transform(df):
    inner(df["A"].values)


In [67]:
df = make_df()

In [68]:
%timeit cython_transform(df)

7.4 µs ± 172 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [69]:
print(df.head())

               A         B         C
0  811111.363066  0.349891  0.920078
1  811111.091901  0.403213  0.949206
2  811111.785617  0.430489  0.466205
3  811111.364162  0.381953  0.078409
4  811111.430127  0.122566  0.743954


In [70]:
@numba.njit
def inner2(array: np.ndarray) -> None:
    """Add 1.0, in place, to every element of ``array``.

    Compiled with ``numba.njit`` (explicit nopython mode). A bare
    ``@numba.jit`` without the ``nopython`` argument emits a deprecation
    warning on the Numba versions where its default was object-mode fallback.

    Parameters
    ----------
    array
        One-dimensional array, indexed from 0 to ``array.shape[0] - 1``.
    """
    for i in range(array.shape[0]):
        array[i] += 1.0


def numba_transform(df: pd.DataFrame) -> None:
    """Add 1.0 to column "A" by handing its array to the ``inner2`` kernel.

    ``inner2`` writes into the array obtained from ``df["A"].values``;
    nothing is returned.
    """
    column_values = df["A"].values
    inner2(column_values)

  @numba.jit


In [71]:
df = make_df()

5.55 µs ± 119 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
               A         B         C
0  811111.583415  0.302987  0.633909
1  811111.175682  0.366690  0.312827
2  811111.746706  0.464280  0.662986
3  811111.460189  0.932064  0.561992
4  811111.013973  0.845645  0.169862


In [None]:
%timeit numba_transform(df)

In [None]:
print(df.head())

| Function              | Time     | Speed-Up |
| --------------------- | -------- | -------- |
| slow_function2        | 440 ms   | -        |
| faster_function2      | 1.4 ms   | 314x     |
| even_faster_function2 | 124 us   | 3,548x   |
| cython_transform      | 7.63 us  | 57,667x  |
| numba_transform       | 6.46 us  | 68,111x  |
| c++                   | 0.526 us | 836,501x |
