# Writing a Python function

In [1]:
def my_function():
    """Minimal function skeleton: takes no arguments and returns ``None``."""
    return None

In [2]:
def my_sq(x):
    """Return ``x`` raised to the power of 2."""
    squared = pow(x, 2)
    return squared

In [3]:
my_sq(4)

16

In [4]:
def avg_2(x, y):
    """Return the arithmetic mean of ``x`` and ``y``."""
    total = x + y
    return total / 2

In [5]:
avg_2(10, 20)

15.0

In [6]:
assert avg_2(10, 20) == 15.0

In [7]:
import pandas as pd

# Applying functions

In [8]:
# Small demo frame: two integer columns ('a' and 'b') with three rows each.
df = pd.DataFrame(
    {
        'a': [10, 20, 30],
        'b': [20, 30, 40],
    }
)
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [9]:
df['a'] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [10]:
type(df['a'])

pandas.core.series.Series

In [11]:
def my_sq(x):
    """Return ``x`` raised to the power of 2."""
    # NOTE(review): an `assert isinstance(x, int)` guard was left disabled
    # here -- presumably so non-int inputs are accepted; confirm before
    # re-enabling it.
    squared = pow(x, 2)
    return squared

In [12]:
df['a'].apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [13]:
def my_exp(x, e):
    """Return ``x`` raised to the power ``e``."""
    return pow(x, e)

In [14]:
my_exp(2, 2)

4

In [15]:
my_exp(2, 10)

1024

In [16]:
df['a'].apply(my_exp, e=10)

0        10000000000
1     10240000000000
2    590490000000000
Name: a, dtype: int64

In [17]:
def print_me(x):
    """Print ``x`` to standard output and return ``None``."""
    print(x)
    return None

In [18]:
print_me('hello')

hello


In [19]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [20]:
df.apply(print_me, axis=0)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [21]:
def avg_3(x, y, z):
    """Return the arithmetic mean of the three values ``x``, ``y`` and ``z``."""
    total = x + y + z
    return total / 3

In [22]:
avg_3(0, 5, 10)

5.0

In [23]:
df.apply(avg_3)

TypeError: ("avg_3() missing 2 required positional arguments: 'y' and 'z'", 'occurred at index a')

In [24]:
import numpy as np
def avg_3_apply(col):
    """Return the mean of all values in ``col``, computed with ``np.mean``."""
    # For a three-element column this equals: np.sum(col) / 3
    mean_value = np.mean(col)
    return mean_value

In [25]:
def avg_3_apply(col):
    """Return the mean of the first three values of ``col``, taken by position.

    Intended for ``df.apply(avg_3_apply)``, where ``col`` is one column of
    the frame. Plain sequences (list, tuple, ndarray) are accepted as well.

    Raises:
        IndexError: if ``col`` holds fewer than three values, e.g. when the
            function is applied with ``axis=1`` to a two-column frame.
    """
    # ``col[0]`` on a Series is a *label* lookup; it only falls back to
    # positions for non-integer indexes, and pandas has deprecated that
    # fallback. ``.iloc`` is always positional; objects without ``.iloc``
    # are indexed directly.
    values = getattr(col, "iloc", col)
    x = values[0]
    y = values[1]
    z = values[2]
    return (x + y + z) / 3

In [26]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [27]:
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [28]:
df.apply(avg_3_apply, axis=1)

IndexError: ('index out of bounds', 'occurred at index 0')

In [29]:
def avg_2_apply(row):
    """Return the mean of the first two values of ``row``, taken by position.

    Intended for ``df.apply(avg_2_apply, axis=1)``, where ``row`` is one row
    of the frame indexed by column label. Plain sequences (list, tuple,
    ndarray) are accepted as well.

    Raises:
        IndexError: if ``row`` holds fewer than two values.
    """
    # ``row[0]`` on a label-indexed Series relies on a positional fallback
    # that pandas has deprecated. ``.iloc`` is always positional; objects
    # without ``.iloc`` are indexed directly.
    values = getattr(row, "iloc", row)
    x = values[0]
    y = values[1]
    return (x + y) / 2

In [30]:
df.apply(avg_2_apply, axis=1)

0    15.0
1    25.0
2    35.0
dtype: float64

In [31]:
def avg_2(x, y):
    """Return the arithmetic mean of ``x`` and ``y``.

    Uses only ``+`` and ``/``, so it also works element-wise on operands
    that support those operators (such as two pandas Series).
    """
    total = x + y
    return total / 2

In [32]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [33]:
(df['a'] + df['b']) / 2

0    15.0
1    25.0
2    35.0
dtype: float64

In [34]:
def avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    Scalar-only: the ``if`` needs a single truth value, so passing a pandas
    Series raises ``ValueError`` (the truth value of a Series is ambiguous).
    """
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was removed
    # in NumPy 2.0 and raises AttributeError there.
    if x == 20:
        return np.nan
    return (x + y) / 2


In [35]:
avg_2(df['a'], df['b'])

0    15.0
1    25.0
2    35.0
dtype: float64

In [36]:
avg_2_mod(df['a'], df['b'])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [37]:
(df['a'] == 20)

0    False
1     True
2    False
Name: a, dtype: bool

In [38]:
(df['a'] == 20).any()

True

In [39]:
avg_2_mod(10, 20)

15.0

In [40]:
avg_2_mod(20, 30)

nan

# Vectorized functions

In [41]:
import numpy as np

In [42]:
# Wrap the scalar-only avg_2_mod so it is applied element by element to
# array-like arguments and the results come back as a NumPy array.
avg_2_mod_vec = np.vectorize(avg_2_mod)

In [43]:
avg_2_mod_vec(df['a'], df['b'])

array([15., nan, 35.])

In [44]:
@np.vectorize
def v_avg_2_mod(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    The body is written for scalars; the ``np.vectorize`` decorator applies
    it to each pair of elements and returns a NumPy array.
    """
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was removed
    # in NumPy 2.0 and raises AttributeError there.
    if x == 20:
        return np.nan
    return (x + y) / 2

In [45]:
v_avg_2_mod(df['a'], df['b'])

array([15., nan, 35.])

In [46]:
import numba

In [47]:
@numba.vectorize
def v_avg_2_mod_numba(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    Compiled with ``numba.vectorize``. Call it with NumPy arrays (for
    example ``series.values``): passing a pandas Series raises
    ``ValueError`` because Numba cannot determine a type for it.
    """
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was removed
    # in NumPy 2.0, so the name no longer resolves there.
    if x == 20:
        return np.nan
    return (x + y) / 2

In [48]:
v_avg_2_mod_numba(df['a'], df['b'])

ValueError: cannot determine Numba type of <class 'pandas.core.series.Series'>

In [49]:
v_avg_2_mod_numba(df['a'].values, df['b'].values)

array([15., nan, 35.])

In [50]:
%%timeit
avg_2(df['a'], df['b'])

448 µs ± 41.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [51]:
%%timeit
v_avg_2_mod(df['a'], df['b'])

225 µs ± 17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [52]:
%%timeit
v_avg_2_mod_numba(df['a'].values, df['b'].values)

8.28 µs ± 170 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


### Exercise: use the Ebola dataset from the tidy section and, instead of using the `.str` accessor, write your own function to parse out the string.