# Applying Functions

In [34]:
import sys
print(sys.version)
import numpy as np
print(np.__version__)
import pandas as pd
print(pd.__version__)
import matplotlib.pyplot as plt

%matplotlib inline

3.9.12 (main, Apr  5 2022, 01:53:17) 
[Clang 12.0.0 ]
1.21.5
1.4.2


In [35]:
# Fix the seed of NumPy's global generator so the draws below are reproducible.
np.random.seed(200)

# 5 x 5 table of geometric(p=0.2) draws: one row per mouse, one column per test.
Mice = pd.DataFrame(
    data=np.random.geometric(0.2, size=(5, 5)),
    index=[f'mouse_{i}' for i in range(5)],
    columns=[f'test_{j}' for j in range(5)],
)
Mice

Unnamed: 0,test_0,test_1,test_2,test_3,test_4
mouse_0,14,2,5,3,7
mouse_1,1,2,11,3,18
mouse_2,10,20,12,2,9
mouse_3,1,7,2,1,13
mouse_4,8,4,10,4,3


# retrieve csv file

In [88]:
# Read the gradebook from 'gradebook.csv' (path relative to the working directory).
# No index column is requested, so 'student' comes back as an ordinary column
# and the rows get a default 0..n-1 integer index.
gradebook = pd.read_csv('gradebook.csv')
gradebook

Unnamed: 0,student,midterm,final
0,Ben,88,90
1,May,78,87
2,Sue,92,64
3,Blake,56,90
4,Amy,79,96
5,Steve,92,84


In [8]:
# one difference

## note: the index (which was student) was lost when the CSV was read back in

In [9]:
#revert

In [89]:
# Restore 'student' as the row index (it comes back from the CSV as a plain column).
# The guard keeps this cell safe to re-run: once 'student' is already the index,
# calling set_index('student') again would raise KeyError because the column is gone.
# If the column is genuinely missing, set_index still raises, so real problems surface.
if gradebook.index.name != 'student':
    gradebook = gradebook.set_index('student')
gradebook

Unnamed: 0_level_0,midterm,final
student,Unnamed: 1_level_1,Unnamed: 2_level_1
Ben,88,90
May,78,87
Sue,92,64
Blake,56,90
Amy,79,96
Steve,92,84


In [11]:
# now things are back to the way they were

---

In [12]:
# add some missing values -- just to make things more interesting

In [91]:
# Blank out Steve's midterm to simulate a missing score. np.nan is a float, so the
# midterm column is shown as floats (88.0 rather than 88) from here on.
gradebook.loc['Steve', 'midterm'] = np.nan

In [92]:
# Blank out Amy's final to simulate a missing score. np.nan is a float, so the
# final column is shown as floats (90.0 rather than 90) from here on.
gradebook.loc['Amy', 'final'] = np.nan

In [19]:
gradebook

Unnamed: 0_level_0,midterm,final
student,Unnamed: 1_level_1,Unnamed: 2_level_1
Ben,88.0,90.0
May,78.0,87.0
Sue,92.0,64.0
Blake,56.0,90.0
Amy,79.0,
Steve,,84.0


![image.png](attachment:f15a056d-54d5-4db8-91f8-532f52b78444.png)

---

### mean

In [93]:
gradebook.mean()

midterm    78.6
final      83.0
dtype: float64

#### pandas ignored the missing values

#### with numpy this would have been different

# Sometimes this can be a problem -- the fact that pandas silently skips missing values

#### replicate the numpy behaviour (NaN propagates) -- with skipna = False

In [21]:
gradebook.mean(skipna = False)

midterm   NaN
final     NaN
dtype: float64

---

## by default the mean function was applied to each column

In [22]:
# as this is usually what makes sense

---

In [94]:
gradebook.mean(axis = 1)

student
Ben      89.0
May      82.5
Sue      78.0
Blake    73.0
Amy      79.0
Steve    84.0
dtype: float64

#### this does the mean by rows (axis =1)

---

In [95]:
avg = gradebook.mean()

In [43]:
avg

midterm    78.6
final      83.0
dtype: float64

In [44]:
type(avg)

pandas.core.series.Series

In [45]:
# by default, this is a pandas series

In [46]:
gradebook - avg

# subtract each column's average from that column (midterm average from midterm, final average from final)

Unnamed: 0_level_0,midterm,final
student,Unnamed: 1_level_1,Unnamed: 2_level_1
Ben,9.4,7.0
May,-0.6,4.0
Sue,13.4,-19.0
Blake,-22.6,7.0
Amy,0.4,
Steve,,1.0


In [48]:
# the average series is being BROADCAST in the same way as it would be in Numpy

---

In [49]:
gradebook.sum()

midterm    393.0
final      415.0
dtype: float64

In [50]:
gradebook.min()

midterm    56.0
final      64.0
dtype: float64

In [51]:
gradebook.std()

midterm    13.957077
final      10.908712
dtype: float64

In [52]:
gradebook.corr()

Unnamed: 0,midterm,final
midterm,1.0,-0.574706
final,-0.574706,1.0


In [53]:
# this is the correlation matrix

In [54]:
# there is a negative correlation between midterm and final

---

In [55]:
gradebook.cov()

Unnamed: 0,midterm,final
midterm,194.8,-116.5
final,-116.5,119.0


In [56]:
# covariance

In [57]:
gradebook.corrwith(gradebook.final)

midterm   -0.574706
final      1.000000
dtype: float64

In [58]:
# this computes the correlation of each column with another series (here, the final)

---

# Custom Functions

---

In [96]:
def g_range(x):
    """Return the spread (largest value minus smallest value) of ``x``.

    Written as a named ``def`` rather than a lambda bound to a name, so the
    function carries a real ``__name__`` and a docstring.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Any object exposing ``.max()`` and ``.min()``. ``DataFrame.apply``
        passes one column (or one row when ``axis=1``) at a time.

    Returns
    -------
    scalar
        ``x.max() - x.min()``. For a Series, missing values are skipped by
        the default ``max``/``min`` behaviour.
    """
    return x.max() - x.min()

# x has to be a series/array

In [97]:
gradebook.apply(g_range)

midterm    36.0
final      26.0
dtype: float64

In [98]:
gradebook.apply(g_range, axis = 1)

student
Ben       2.0
May       9.0
Sue      28.0
Blake    34.0
Amy       0.0
Steve     0.0
dtype: float64

---

## Aggregate Method

In [63]:
gradebook.agg(g_range)

midterm    36.0
final      26.0
dtype: float64

In [64]:
# we are now moving from a 2D dataframe to a 1D series

In [65]:
# so far, the functions only return single values

---

### writing functions that return a series

In [99]:
def g_min_max(x):
    """Summarise ``x`` by its extremes.

    ``x`` is a Series standing for one column of the frame (or one row when
    applied with ``axis=1``). Because a Series is returned instead of a single
    value, ``DataFrame.apply`` assembles the results into a DataFrame.

    Returns
    -------
    pandas.Series
        Two entries labelled ``'min'`` and ``'max'``.
    """
    smallest = x.min()
    largest = x.max()
    return pd.Series(data=[smallest, largest], index=['min', 'max'])

In [100]:
gradebook.apply(g_min_max)

Unnamed: 0,midterm,final
min,56.0,64.0
max,92.0,90.0


---

In [70]:
# one more wrinkle on applying a function

In [71]:
# you can also apply a function to each single value of the dataframe

In [101]:
# applymap calls the function once per cell (element-wise), unlike apply, which
# receives a whole row or column. NaN + 1 is still NaN, so missing scores stay missing.
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favour of DataFrame.map;
# the pandas version this notebook reports (1.4.2) only has applymap.
gradebook.applymap(lambda x: x+1)

Unnamed: 0_level_0,midterm,final
student,Unnamed: 1_level_1,Unnamed: 2_level_1
Ben,89.0,91.0
May,79.0,88.0
Sue,93.0,65.0
Blake,57.0,91.0
Amy,80.0,
Steve,,85.0


In [73]:
# this allows you to add a gradepoint to every single value

---

## Describe

In [102]:
gradebook.describe()

Unnamed: 0,midterm,final
count,5.0,5.0
mean,78.6,83.0
std,13.957077,10.908712
min,56.0,64.0
25%,78.0,84.0
50%,79.0,87.0
75%,88.0,90.0
max,92.0,90.0


count = number of non missing values

In [75]:
# describe works differently depending on the type of variable

In [76]:
# add a letter grade

In [103]:
gradebook.mean(axis=1)

student
Ben      89.0
May      82.5
Sue      78.0
Blake    73.0
Amy      79.0
Steve    84.0
dtype: float64

In [104]:
# Bin each student's average into a letter grade. Bin edges are right-inclusive
# by default: (0, 70] -> F, (70, 80] -> C, (80, 90] -> B, (90, 100] -> A.
# Five edges define four bins, hence four labels.
pd.cut(gradebook.mean(axis=1), (0, 70, 80, 90, 100), labels =("F", "C", "B", "A"))

student
Ben      B
May      B
Sue      C
Blake    C
Amy      C
Steve    B
dtype: category
Categories (4, object): ['F' < 'C' < 'B' < 'A']

In [81]:
# the number of labels has to be one less than the number of bin edges (5 edges -> 4 labels)

In [82]:
# F is between 0-70

---

In [83]:
# save as new variable

In [105]:
# Store the letter grade as a new column (dictionary-style assignment).
# The average is taken over the two score columns only, so the cell stays
# re-runnable: once 'letter_grade' exists, a bare gradebook.mean(axis=1) would
# also be handed that categorical column (a deprecation warning on older pandas,
# an error on pandas 2.x).
gradebook['letter_grade'] = pd.cut(
    gradebook[['midterm', 'final']].mean(axis=1),
    (0, 70, 80, 90, 100),
    labels=("F", "C", "B", "A"),
)

# take a dictionary style approach

In [106]:
gradebook

Unnamed: 0_level_0,midterm,final,letter_grade
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ben,88.0,90.0,B
May,78.0,87.0,B
Sue,92.0,64.0,C
Blake,56.0,90.0,C
Amy,79.0,,C
Steve,,84.0,B


In [109]:
gradebook.letter_grade.describe()

# this runs describe for a variable

count     6
unique    2
top       C
freq      3
Name: letter_grade, dtype: object

### see that describe works differently -- information

In [108]:
gradebook.describe()

Unnamed: 0,midterm,final
count,5.0,5.0
mean,78.6,83.0
std,13.957077,10.908712
min,56.0,64.0
25%,78.0,84.0
50%,79.0,87.0
75%,88.0,90.0
max,92.0,90.0


---

# value counts

In [110]:
# more important than describe

In [112]:
gradebook.letter_grade.describe()

count     6
unique    2
top       C
freq      3
Name: letter_grade, dtype: object

In [113]:
gradebook.letter_grade.value_counts()

C    3
B    3
F    0
A    0
Name: letter_grade, dtype: int64

## this gives the frequency of the values

In [114]:
# 3 C's, 3 B's ..

---

In [115]:
# Save the updated gradebook. to_csv writes the index by default, so 'student'
# becomes the first column of the file; reading it back with a plain pd.read_csv
# returns it as a regular column unless index_col is given.
gradebook.to_csv('gradebook_v2.csv')