In [2]:
# Writing a Python function

In [5]:
def my_function():
    """Placeholder function: takes no arguments and does nothing."""
    return None

In [6]:
def my_sq(x):
    """Return ``x`` raised to the second power.

    Works for anything that supports the power operator.
    """
    return pow(x, 2)

In [7]:
my_sq(4)

16

In [10]:
assert my_sq(4) == 16

In [12]:
def avg_2(x, y):
    """Return the arithmetic mean of ``x`` and ``y``.

    Uses true division, so integer inputs produce a float result.
    """
    total = x + y
    return total / 2

In [14]:
avg_2(10, 20)

15.0

In [16]:
import pandas as pd

In [17]:
# Small two-column demo frame used by the apply/vectorize examples below.
df = pd.DataFrame(dict(a=[10, 20, 30], b=[20, 30, 40]))

In [19]:
df['a'] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [20]:
df['a'].apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [22]:
def my_exp(x, e):
    """Return ``x`` raised to the power ``e``."""
    return pow(x, e)

In [23]:
my_exp(4, 2)

16

In [24]:
my_exp(4, 3)

64

In [26]:
# Keyword arguments given to Series.apply (here e=4) are forwarded to my_exp
# on every call, so each element of column 'a' is raised to the 4th power.
df['a'].apply(my_exp, e=4)

0     10000
1    160000
2    810000
Name: a, dtype: int64

In [27]:
def print_me(x):
    """Print ``x`` to standard output and return ``None``."""
    print(x)
    return None

In [29]:
# DataFrame.apply calls print_me once per column, passing the whole column as a
# Series; print_me returns None, hence the None-valued result below the prints.
df.apply(print_me)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [30]:
def avg_3(x, y, z):
    """Return the arithmetic mean of the three values ``x``, ``y`` and ``z``.

    Uses true division, so integer inputs produce a float result.
    """
    total = x + y + z
    return total / 3

In [31]:
# Expected to fail: apply hands avg_3 one whole column as its only argument,
# so y and z are never supplied.
df.apply(avg_3)

TypeError: ("avg_3() missing 2 required positional arguments: 'y' and 'z'", 'occurred at index a')

In [32]:
import numpy as np

In [33]:
def avg_3_apply(col):
    """Return the mean of ``col`` as computed by ``np.mean``.

    Takes the whole column as a single argument, which is the calling
    convention ``DataFrame.apply`` uses.

    NOTE: a later cell in this notebook redefines ``avg_3_apply``; whichever
    definition was executed most recently is the one in effect.
    """
    column_mean = np.mean(col)
    return column_mean

In [34]:
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [35]:
def avg_3_apply(col):
    """Average the entries of ``col`` found at keys 0, 1 and 2.

    Unlike a generic mean, this looks up exactly three entries, so ``col``
    must be indexable by 0, 1 and 2; a shorter input fails on the lookup.
    """
    first, second, third = col[0], col[1], col[2]
    return (first + second + third) / 3

In [36]:
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [37]:
# Expected to fail: with axis='columns' each row is passed instead, and a row
# holds only two values (a, b), so the lookup of a third entry is out of bounds.
df.apply(avg_3_apply, axis='columns')

IndexError: ('index out of bounds', 'occurred at index 0')

In [38]:
df['a'].mean()

20.0

In [39]:
df['a'] + df['b']

0    30
1    50
2    70
dtype: int64

In [42]:
def avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    This is a scalar function: the ``if`` needs ``x == 20`` to be a single
    truth value, so passing a whole pandas Series raises ``ValueError``.
    Wrap it (e.g. with ``np.vectorize``) to apply it element-wise.

    Parameters
    ----------
    x, y : scalar numbers

    Returns
    -------
    float
        ``np.nan`` if ``x == 20``, otherwise ``(x + y) / 2``.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        return np.nan
    return (x + y) / 2

In [43]:
avg_2(df['a'], df['b'])

0    15.0
1    25.0
2    35.0
dtype: float64

In [44]:
# Expected to fail: `x == 20` on a Series is an element-wise comparison, and the
# `if` in avg_2_mod cannot reduce that to a single True/False.
avg_2_mod(df['a'], df['b'])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [45]:
import numpy as np

In [47]:
# Element-wise wrapper so the scalar-only avg_2_mod can be called on whole columns.
avg_2_mod_vec = np.vectorize(pyfunc=avg_2_mod)

In [48]:
avg_2_mod_vec(df['a'], df['b'])

array([15., nan, 35.])

In [51]:
@np.vectorize
def v_avg_2_mod(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    The body is written for scalars; the ``np.vectorize`` decorator applies it
    across array-like inputs and returns a NumPy array.

    Parameters
    ----------
    x, y : scalars or array-likes

    Returns
    -------
    numpy.ndarray
        ``np.nan`` where ``x == 20``, otherwise ``(x + y) / 2``.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        return np.nan
    return (x + y) / 2

In [52]:
v_avg_2_mod(df['a'], df['b'])

array([15., nan, 35.])

In [53]:
import numba

In [55]:
@numba.vectorize
def v_avg_2_mod_numba(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    Same logic as the scalar version, wrapped by ``numba.vectorize`` instead
    of ``np.vectorize``. The notebook calls it with the columns' ``.values``
    arrays rather than with the Series themselves.

    Returns ``np.nan`` where ``x == 20``, otherwise ``(x + y) / 2``.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        return np.nan
    else:
        return (x + y) / 2

In [57]:
v_avg_2_mod_numba(df['a'].values, df['b'].values)

array([15., nan, 35.])

In [58]:
%%timeit
# NOTE(review): this baseline times avg_2 (no NaN branch), while the other two
# timing cells time the *_mod variants; the frame also has only 3 rows, so
# fixed call overhead presumably dominates. Treat the comparison as rough.
avg_2(df['a'], df['b'])

241 µs ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [60]:
%%timeit
v_avg_2_mod(df['a'], df['b'])

108 µs ± 3.09 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [61]:
%%timeit
v_avg_2_mod_numba(df['a'].values, df['b'].values)

4.8 µs ± 118 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
