In [1]:
import numpy as np
import pandas as pd

### Apply a function along an axis of the DataFrame

> [**Reference**] https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

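> `df.apply(func, axis=0, raw=False, result_type=None, args=(), **kwargs)` — the `broadcast` and `reduce` parameters listed in older documentation were removed in pandas 1.0; use `result_type` instead.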

- **Recall**: 파이썬 함수 만들기

```
def my_func(x, y):
    pass
my_func(1, 2)
    
def my_square(x):
    return x ** 2
my_square(2)
my_square(4)

assert my_square(4) == 16
assert my_square(4) == 15 # AssertionError

avg_2 = lambda x, y: (x + y) / 2
avg_2(10, 20)
```

In [2]:
def my_func(x, y):
    """Accept two arguments and do nothing with them (minimal function skeleton)."""
    return None  # equivalent to a bare `pass`: the call evaluates to None

my_func(1, 2)

In [9]:
def my_square(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

my_square(2)

4

In [4]:
my_square(4)

16

In [5]:
assert my_square(4) == 16

In [6]:
assert my_square(4) == 15 # AssertionError

AssertionError: 

In [27]:
# Bind an anonymous two-argument function to a name; it returns the mean of x and y.
avg_2 = lambda x, y: (x + y) / 2
avg_2(10, 20)  # evaluates to 15.0 (true division yields a float)

15.0

```
df = pd.DataFrame({
    'a': [10, 20, 30],
    'b': [20, 30, 40]
})
df

df['a'] ** 2
df ** 2
```

In [28]:
# Small demo frame: two integer columns, three rows, default 0..2 index.
df = pd.DataFrame(dict(a=[10, 20, 30], b=[20, 30, 40]))
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [29]:
df['a'] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [30]:
df ** 2

Unnamed: 0,a,b
0,100,400
1,400,900
2,900,1600


```
my_square
df['a'].apply(my_square)
```

In [31]:
my_square

<function __main__.my_square(x)>

In [32]:
df['a'].apply(my_square)

0    100
1    400
2    900
Name: a, dtype: int64

```
def my_exp(x, e):
    return x ** e
my_exp(2, 10)

df['a'].apply(my_exp, e=4)
```

In [33]:
def my_exp(x, e):
    """Return ``x`` raised to the power ``e``."""
    return pow(x, e)

my_exp(2, 10)

1024

In [34]:
df['a'].apply(my_exp, e=4)

0     10000
1    160000
2    810000
Name: a, dtype: int64

- `axis=0`: apply function to each column (**default**)
- `axis=1`: apply function to each row

```
def print_me(x):
    print(x)
    return x.sum()
df

df.apply(print_me)
df.apply(print_me, axis=1)
```

In [40]:
def print_me(x):
    """Print ``x`` and return ``x.sum()``.

    Printing the argument makes it visible what ``apply`` hands to the
    function on each call (a whole column or a whole row).
    """
    print(x)
    total = x.sum()
    return total
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [41]:
df.apply(print_me)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    60
b    90
dtype: int64

In [42]:
df.apply(print_me, axis=1)

a    10
b    20
Name: 0, dtype: int64
a    20
b    30
Name: 1, dtype: int64
a    30
b    40
Name: 2, dtype: int64


0    30
1    50
2    70
dtype: int64

```
avg_3 = lambda x, y, z: (x + y + z) / 3
df.apply(avg_3)

avg_3_apply = lambda col: np.mean(col)
df.apply(avg_3_apply)

def avg_3_apply(col):
    x = col[0]
    y = col[1]
    z = col[2]
    return (x + y + z) / 3
df.apply(avg_3_apply)
```

In [43]:
# Three-argument lambda: the mean of x, y and z.
avg_3 = lambda x, y, z: (x + y + z) / 3
# apply() calls the function with a single argument per column, so y and z
# are never supplied and this call raises a TypeError (shown in the output).
df.apply(avg_3)

TypeError: ("<lambda>() missing 2 required positional arguments: 'y' and 'z'", 'occurred at index a')

In [47]:
# One-argument version: receives a whole column and returns its mean.
avg_3_apply = lambda col: np.mean(col)
df.apply(avg_3_apply)  # yields one mean per column

a    20.0
b    30.0
dtype: float64

In [40]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


```
def avg_3_apply(col):
    x = col[0]
    y = col[1]
    z = col[2]
    return (x + y + z) / 3
df.apply(avg_3_apply)

df.apply(avg_3_apply, axis=1) # IndexError
```

In [51]:
def avg_3_apply(col):
    """Return the mean of the first three elements of ``col``.

    Elements are read by *position*. For a pandas Series, plain ``col[0]``
    is a label lookup on an integer index, and its positional fallback on
    other indexes is deprecated, so ``.iloc`` is used when available.

    Parameters
    ----------
    col : pandas.Series or sequence
        The column (or row) handed over by ``DataFrame.apply``; must hold
        at least three elements.

    Raises
    ------
    IndexError
        If ``col`` has fewer than three elements (e.g. a two-column row
        when applied with ``axis=1``).
    """
    # Series expose .iloc; lists/tuples/ndarrays are already positional.
    items = getattr(col, 'iloc', col)
    x, y, z = items[0], items[1], items[2]
    return (x + y + z) / 3
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [52]:
df.apply(avg_3_apply, axis=1) # IndexError

IndexError: ('index out of bounds', 'occurred at index 0')

```
df['a'].mean()
df['a'] + df['b']
```

In [53]:
df['a'].mean()

20.0

In [54]:
df['a'] + df['b']

0    30
1    50
2    70
dtype: int64

```
def avg_2_mod(x, y):
    """Return the mean of scalars ``x`` and ``y``, or NaN when ``x == 20``."""
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN / np.NAN aliases
        # were removed in NumPy 2.0.
        return np.nan
    else:
        return (x + y) / 2
df
avg_2_mod(df['a'], df['b']) # ValueError
```

In [64]:
def avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    The branch uses a plain ``if``, so ``x`` must be a scalar: passing a
    pandas Series makes ``x == 20`` an element-wise result whose truth
    value is ambiguous, and the ``if`` raises ``ValueError``.

    Parameters
    ----------
    x, y : number
        Scalar operands.

    Returns
    -------
    float
        ``(x + y) / 2``, or ``nan`` when ``x == 20``.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN / np.NAN aliases
        # were removed in NumPy 2.0.
        return np.nan
    else:
        return (x + y) / 2
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [59]:
avg_2_mod(df['a'], df['b']) # ValueError

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

> [**Reference**] https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html

> - The vectorize function is provided primarily for convenience, not for performance. The implementation is essentially a for loop.

```
avg_2_mod_vec = np.vectorize(avg_2_mod)
df
avg_2_mod_vec(df['a'], df['b'])
```

In [62]:
avg_2_mod_vec = np.vectorize(avg_2_mod)
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [63]:
avg_2_mod_vec(df['a'], df['b'])

array([15., nan, 35.])

```
@np.vectorize
def avg_2_mod(x, y):
    """Element-wise mean of ``x`` and ``y``; NaN wherever ``x == 20``."""
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN / np.NAN aliases
        # were removed in NumPy 2.0.
        return np.nan
    else:
        return (x + y) / 2
avg_2_mod(df['a'], df['b'])
```

In [67]:
@np.vectorize
def avg_2_mod(x, y):
    """Element-wise mean of ``x`` and ``y``; NaN wherever ``x == 20``.

    The body is written for scalars; ``np.vectorize`` calls it once per
    element pair so array-like inputs (e.g. two Series) are accepted and a
    NumPy array is returned. This is a convenience wrapper around a Python
    loop, not a performance optimization.

    Note: this rebinds the name ``avg_2_mod`` that the scalar version used.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN / np.NAN aliases
        # were removed in NumPy 2.0.
        return np.nan
    else:
        return (x + y) / 2
avg_2_mod(df['a'], df['b'])

array([15., nan, 35.])

### Numba

- Numba is an open source JIT compiler that translates a subset of Python and NumPy code into fast machine code. 
- https://numba.pydata.org

```
import numba

@numba.vectorize
def avg_2_mod_numba(x, y):
    """Element-wise mean of ``x`` and ``y``; NaN wherever ``x == 20``."""
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN / np.NAN aliases
        # were removed in NumPy 2.0.
        return np.nan
    else:
        return (x + y) / 2
avg_2_mod_numba(df['a'].values, df['b'].values)
```

In [74]:
import numba

@numba.vectorize
def avg_2_mod_numba(x, y):
    """Element-wise mean of ``x`` and ``y``; NaN wherever ``x == 20``.

    Same scalar logic as ``avg_2_mod``, decorated with ``numba.vectorize``.
    Call it with NumPy arrays (``Series.to_numpy()`` / ``Series.values``),
    as the cells below do, rather than with pandas Series.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN / np.NAN aliases
        # were removed in NumPy 2.0.
        return np.nan
    else:
        return (x + y) / 2
avg_2_mod_numba(df['a'].to_numpy(), df['b'].to_numpy())

array([15., nan, 35.])

In [75]:
%%timeit
avg_2(df['a'], df['b'])

211 µs ± 7.66 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [76]:
%%timeit
avg_2_mod(df['a'], df['b'])

114 µs ± 3.81 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [78]:
%%timeit
avg_2_mod_numba(df['a'].to_numpy(), df['b'].to_numpy())

12.1 µs ± 295 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [79]:
%%timeit
avg_2_mod_numba(df['a'].values, df['b'].values)

6.05 µs ± 474 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
