In [3]:
def my_function():
    """Placeholder function: takes no arguments and does nothing."""
    return None

In [4]:
def my_sq(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

In [5]:
# call the function with a concrete value; 4 squared is 16
my_sq(4)

16

In [6]:
# lightweight test: raises AssertionError if my_sq(4) is not 16
assert my_sq(4) == 16

In [7]:
def avg_2(x, y):
    """Return the arithmetic mean of ``x`` and ``y``.

    Uses true division, so two ints yield a float.
    """
    total = x + y
    return total / 2

In [8]:
# IPython help syntax: shows avg_2's signature and docstring
?avg_2

[1;31mSignature:[0m [0mavg_2[0m[1;33m([0m[0mx[0m[1;33m,[0m [0my[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Take the average of 2 numbers
    
[1;31mFile:[0m      c:\users\danie\desktop\2021-07-13-scipy-pandas\<ipython-input-7-5441b32aca99>
[1;31mType:[0m      function


In [9]:
# avg_2 uses true division, so the result is a float even for int inputs
avg_2(10, 20)

15.0

In [10]:
import pandas as pd

In [11]:
# create an example dataframe from scratch: two integer columns, three rows
dat = pd.DataFrame(
    data={
        "a": [10, 20, 30],
        "b": [20, 30, 40],
    }
)

In [12]:
# display the dataframe (3 rows, columns a and b)
dat

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [13]:
# square the a column
# math computations are vectorized/broadcast:
# the operator is applied to every element without writing a loop
dat["a"] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [14]:
# using our function on a single (scalar) value
my_sq(4)

16

In [15]:
# apply our function for each value in a column
# (my_sq is called once per element; the result matches dat["a"] ** 2)
dat["a"].apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [16]:
def my_exp(x, e):
    """Return the base ``x`` raised to the exponent ``e``."""
    return pow(x, e)

In [17]:
# sanity check: 2 to the power 10
assert my_exp(2, 10) == 1024

In [18]:
# pass in other function parameters into apply:
# each column value becomes the first argument (x), and extra keyword
# arguments such as e=3 are forwarded to my_exp on every call
dat["a"].apply(my_exp, e=3)

0     1000
1     8000
2    27000
Name: a, dtype: int64

In [19]:
# if we want to pass in the value into something that is not the first argument
# one way is to write a wrapper function that will pass into first argument
def flip_exp(e, x):
    """Call ``my_exp`` with the argument order swapped (exponent first).

    Because ``e`` is the first parameter, a value passed positionally
    becomes the exponent, while the base ``x`` can be given by keyword.
    """
    base, exponent = x, e
    return my_exp(base, exponent)

In [20]:
# apply function that uses column values as the exponent:
# each value is passed as e (flip_exp's first argument) and x is fixed at 3,
# so every result is 3 ** value
dat["a"].apply(flip_exp, x=3)

0              59049
1         3486784401
2    205891132094649
Name: a, dtype: int64

In [21]:
# instead of re-writing a new function
# you can use lambda to write on the fly
# (the lambda's parameter name is arbitrary; it receives each column value,
# which is used here as the exponent with a base of 3)
dat["a"].apply(lambda pizza: my_exp(3, pizza))

0              59049
1         3486784401
2    205891132094649
Name: a, dtype: int64

In [22]:
def print_me(x):
    """Print ``x`` to standard output and return ``None``."""
    print(x)
    return None

In [23]:
# applying functions on entire dataframes
# will work column by column: print_me receives each column as a whole Series
# (print_me returns None, hence the None values in the returned result)
dat.apply(print_me)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [24]:
def avg_3(x, y, z):
    """Return the arithmetic mean of the three values ``x``, ``y`` and ``z``."""
    total = x + y + z
    return total / 3

In [25]:
# sanity check: the mean of 1, 3 and 5 is 3
assert avg_3(1, 3, 5) == 3

In [26]:
# the entire column of values will be passed into the FIRST argument,
# so y and z are never supplied -- this call is expected to raise a TypeError
dat.apply(avg_3)

TypeError: avg_3() missing 2 required positional arguments: 'y' and 'z'

In [27]:
import numpy as np

In [28]:
# some functions will automatically take in a vector/series of values
def avg_3_apply(col):
    """Return the mean of ``col`` as computed by ``np.mean``.

    ``np.mean`` takes the whole vector/Series at once, so the values
    do not have to be pulled out one by one.
    """
    column_mean = np.mean(col)
    return column_mean

In [29]:
# each column is handed to avg_3_apply as a whole; the result is one mean per column
dat.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [30]:
# or we have to re-write and parse out the column values
def avg_3_apply(col):
    """Return the mean of the first three items of ``col``.

    The items are looked up as ``col[0]``, ``col[1]`` and ``col[2]``, so
    ``col`` must support all three of those keys.
    """
    x, y, z = col[0], col[1], col[2]
    total = x + y + z
    return total / 3

In [31]:
# column means via manual indexing
# (works here because every column of dat has exactly three values)
dat.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [32]:
# reminder of the data: 3 rows, 2 columns (a and b)
dat

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [33]:
# axis=1 will work row by row
# you usually do not want to do this
# since there are also performance issues associated with this
# NOTE: this call is expected to fail -- each row holds only 2 values (a, b)
# while avg_3_apply indexes a third element
dat.apply(avg_3_apply, axis=1)

IndexError: index 2 is out of bounds for axis 0 with size 2

In [34]:
# look at the data again: every row has just two values (a and b)
dat

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [35]:
# again, most math operations are already broadcast/vectorized,
# so the row-wise average of a and b needs no apply at all
(dat["a"] + dat["b"]) / 2

0    15.0
1    25.0
2    35.0
dtype: float64

In [36]:
def avg_2_mod(x, y):
    """Average two scalars, except that ``x == 20`` yields NaN.

    The ``if`` test needs a single True/False, so ``x`` must be a scalar;
    passing a whole Series makes the comparison ambiguous and raises
    ``ValueError``.

    Returns ``np.nan`` when ``x == 20``, otherwise ``(x + y) / 2``.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
        return np.nan
    return (x + y) / 2

In [37]:
# what if we wanted to pass columns of values and have our function work element-wise?
# calling it directly is expected to fail: `x == 20` produces a boolean Series,
# and `if` cannot reduce that to a single True/False
avg_2_mod(dat["a"], dat["b"])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [38]:
# np.vectorize is a function that takes another function as input
# and it will output a vectorized version of that input function
# (a convenience wrapper: per the NumPy docs it is essentially a loop, not a speed-up)
avg_2_mod_vec = np.vectorize(avg_2_mod)

In [39]:
# the vectorized function accepts whole columns; note it returns a NumPy array, not a Series
avg_2_mod_vec(dat["a"], dat["b"])

array([15., nan, 35.])

In [40]:
# we can also use the @ decorator to do the vectorization
# during function definition
# this way we don't need to create a new function
@np.vectorize
def v_avg_2_mod(x, y):
    """Element-wise average of ``x`` and ``y``; NaN wherever ``x == 20``.

    The body is written for scalars; the ``np.vectorize`` decorator calls it
    once per pair of elements, so arrays/Series can be passed in directly.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
        return np.nan
    return (x + y) / 2

In [41]:
# the decorated function is already vectorized, so columns can be passed directly
v_avg_2_mod(dat["a"], dat["b"])

array([15., nan, 35.])

In [42]:
import numba

In [52]:
@numba.vectorize
def v_avg_2_mod_numba(x, y):
    """Element-wise average of ``x`` and ``y``; NaN wherever ``x == 20``.

    Same scalar logic as the ``np.vectorize`` version, but wrapped with
    ``numba.vectorize`` instead.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
        return np.nan
    return (x + y) / 2

In [None]:
# timing our functions

In [47]:
%%timeit
# baseline: pandas' built-in vectorized Series arithmetic
(dat["a"] + dat["b"]) / 2

318 µs ± 12.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [50]:
%%timeit
# the same arithmetic wrapped in a plain Python function (avg_2 just does (x + y) / 2)
avg_2(dat['a'], dat['b'])

312 µs ± 5.41 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [51]:
%%timeit
# np.vectorize version (has the extra x == 20 -> NaN branch, so not identical work to avg_2)
v_avg_2_mod(dat['a'], dat['b'])

96.5 µs ± 2.71 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [54]:
%%timeit
# numba version, fed the underlying NumPy arrays via .values instead of the Series
# NOTE(review): the first call presumably includes JIT compilation -- consider calling
# the function once before timing so compile time is excluded; confirm
v_avg_2_mod_numba(dat['a'].values, dat['b'].values)

The slowest run took 7.87 times longer than the fastest. This could mean that an intermediate result is being cached.
26.4 µs ± 26.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
