# Pandas - working with tabular data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Titanic passenger sample shipped with the pandas documentation.
# The URL targets the `main` branch: the pandas repository no longer uses
# `master`, and raw file URLs are not redirected after a branch rename.
TITANIC_URL = "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"
df = pd.read_csv(TITANIC_URL)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Survival rate in percent: one row per passenger class, one column per sex.
(
    df.groupby(["Pclass", "Sex"])["Survived"]
    .mean()
    .mul(100)
    .unstack()
)

Sex,female,male
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,96.808511,36.885246
2,92.105263,15.740741
3,50.0,13.544669


# Pandas + Numba = 🚀


- pandas 1.0 added support for Numba-jitted `.rolling().apply()` via `engine="numba"`.
- pandas 1.1 extends this to `groupby().aggregate()` and `groupby().transform()`.

In [5]:
# Benchmark input: one million consecutive integers, 0 through 999_999.
data = pd.Series(range(10**6))

In [6]:
def udf(x):
    """Return the sum of the values in ``x`` plus 5.

    Serves as the per-window function handed to ``Rolling.apply``. When the
    caller passes ``raw=True``, each window arrives as a NumPy array rather
    than a Series.

    Parameters
    ----------
    x : array-like
        Values of a single window.

    Returns
    -------
    scalar
        ``np.sum(x) + 5``.
    """
    return np.sum(x) + 5

In [7]:
# Rolling view over `data` with a fixed window of 10 observations.
roll = data.rolling(window=10)

In [8]:
%time roll.apply(udf, raw=True)

CPU times: user 3.95 s, sys: 13.8 ms, total: 3.96 s
Wall time: 3.96 s


0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
            ...    
999995    9999910.0
999996    9999920.0
999997    9999930.0
999998    9999940.0
999999    9999950.0
Length: 1000000, dtype: float64

In [9]:
%time roll.apply(udf, raw=True, engine="numba")

CPU times: user 818 ms, sys: 51.8 ms, total: 869 ms
Wall time: 1.01 s


0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
            ...    
999995    9999910.0
999996    9999920.0
999997    9999930.0
999998    9999940.0
999999    9999950.0
Length: 1000000, dtype: float64

In [10]:
%time roll.apply(udf, raw=True, engine="numba");

CPU times: user 117 ms, sys: 4.1 ms, total: 121 ms
Wall time: 120 ms


0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
            ...    
999995    9999910.0
999996    9999920.0
999997    9999930.0
999998    9999940.0
999999    9999950.0
Length: 1000000, dtype: float64

In [11]:
%timeit roll.apply(udf, raw=True, engine="numba");

148 ms ± 35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
