## Week 3: Jupyter Notebook - Pandas GroupBy, Apply, and Create Your Own Functions
Make sure to select your Kernel before starting!

### Import Pandas Library/Package
`import pandas as pd`

In [1]:
import pandas as pd

### Optional: Displaying your Dataframe
Don't use for big datasets

In [None]:
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.max_rows", None)     # Show all rows

### GroupBy Functions
`df.groupby()`

Create own dataset

In [2]:
data = {
    'Name': ['Zophie', 'Alice', 'Bob', 'Charlie', 'Max', 'Milo', 'Luna', 'Bella'],
    'Age': [7, 5, 3, 2, 5, 3, 2, 5],
    'Color': ['Gray', 'Black', 'Brown', 'White', 'Black', 'Brown', 'White', 'Black'],
    'Owner': ['Charles', 'Charles', 'Cj', 'Cj', 'Cj', 'Charles', 'Cj', 'Charles'],
    'Type': ['Cat', 'Cat', 'Dog', 'Dog', 'Dog', 'Cat', 'Dog', 'Cat']
}

# Create DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Color,Owner,Type
0,Zophie,7,Gray,Charles,Cat
1,Alice,5,Black,Charles,Cat
2,Bob,3,Brown,Cj,Dog
3,Charlie,2,White,Cj,Dog
4,Max,5,Black,Cj,Dog
5,Milo,3,Brown,Charles,Cat
6,Luna,2,White,Cj,Dog
7,Bella,5,Black,Charles,Cat


Count how many pets of each color each owner has

`.size()`

In [3]:
# Group By Owner and Color
df.groupby(['Owner', 'Color']).size()

Owner    Color
Charles  Black    2
         Brown    1
         Gray     1
Cj       Black    1
         Brown    1
         White    2
dtype: int64

### Read In Datasets
`pd.read_csv()`

Read in the file from the data folder called `Shot_Visuals_KaylanBigun_NathanTrouve.csv`



In [5]:
# Read data and create df variable
shots = pd.read_csv('data/Shot_Visuals_KaylanBigun_NathanTrouve.csv')
# Print the head of the object
shots.head()

Unnamed: 0,pointScore,gameScore,setScore,isPointStart,pointStartTime,isPointEnd,pointEndTime,pointNumber,isBreakPoint,shotInRally,...,returnerName,shotHitBy,InsideOut,InsideIn,isDoubleFault,pointWonBy,lastShotError,serveResult,serveInPlacement,depth
0,0-0,0-0,0-0,1.0,25020.0,,,1,,1,...,Nathan Trouve,Kaylan Bigun,,,,,,2nd Serve In,T,
1,0-0,0-0,0-0,,36320.0,,,1,,2,...,Nathan Trouve,Nathan Trouve,1.0,,,,,,,Deep
2,0-0,0-0,0-0,,37349.0,,,1,,3,...,Nathan Trouve,Kaylan Bigun,1.0,,,,,,,
3,0-0,0-0,0-0,,38549.0,1.0,38549.0,1,,4,...,Nathan Trouve,Nathan Trouve,,,,Kaylan Bigun,1.0,,,Long
4,15-0,0-0,0-0,1.0,57814.0,1.0,61681.0,2,,1,...,Nathan Trouve,Kaylan Bigun,,,,Kaylan Bigun,,1st Serve In,Wide,


##### Create Sample dataframe
- save as `sample_df`
- First 3 points

In [6]:
# sample_df and first 3 points
sample_df = shots.head(11)
sample_df

Unnamed: 0,pointScore,gameScore,setScore,isPointStart,pointStartTime,isPointEnd,pointEndTime,pointNumber,isBreakPoint,shotInRally,...,returnerName,shotHitBy,InsideOut,InsideIn,isDoubleFault,pointWonBy,lastShotError,serveResult,serveInPlacement,depth
0,0-0,0-0,0-0,1.0,25020.0,,,1,,1,...,Nathan Trouve,Kaylan Bigun,,,,,,2nd Serve In,T,
1,0-0,0-0,0-0,,36320.0,,,1,,2,...,Nathan Trouve,Nathan Trouve,1.0,,,,,,,Deep
2,0-0,0-0,0-0,,37349.0,,,1,,3,...,Nathan Trouve,Kaylan Bigun,1.0,,,,,,,
3,0-0,0-0,0-0,,38549.0,1.0,38549.0,1,,4,...,Nathan Trouve,Nathan Trouve,,,,Kaylan Bigun,1.0,,,Long
4,15-0,0-0,0-0,1.0,57814.0,1.0,61681.0,2,,1,...,Nathan Trouve,Kaylan Bigun,,,,Kaylan Bigun,,1st Serve In,Wide,
5,30-0,0-0,0-0,1.0,61671.0,,,3,,1,...,Nathan Trouve,Kaylan Bigun,,,,,,1st Serve In,T,
6,30-0,0-0,0-0,,81681.0,,,3,,2,...,Nathan Trouve,Nathan Trouve,,1.0,,,,,,Short
7,30-0,0-0,0-0,,82877.0,,,3,,3,...,Nathan Trouve,Kaylan Bigun,,1.0,,,,,,Short
8,30-0,0-0,0-0,,84155.0,,,3,,4,...,Nathan Trouve,Nathan Trouve,,,,,,,,Short
9,30-0,0-0,0-0,,85964.0,,,3,,5,...,Nathan Trouve,Kaylan Bigun,,,,,,,,Short


Group By `pointNumber` and subset the columns `pointStartTime`

Use the `.first()` function to grab the first value


In [7]:
sample_df.groupby('pointNumber')['pointStartTime'].first()

pointNumber
1    25020.0
2    57814.0
3    61671.0
Name: pointStartTime, dtype: float64

Use the `.last()` function to grab the last value


In [8]:
sample_df.groupby('pointNumber')['pointStartTime'].last()

pointNumber
1    38549.0
2    57814.0
3    87340.0
Name: pointStartTime, dtype: float64

### Create your own functions

`def function()`


In [9]:
# set a variable x = 2
x = 2
# print variable x
x

2

In [10]:
# Create a function that takes a variable x and returns x + 1
def function(x):
    answer = x + 1
    return answer

In [11]:
function(4)

5

##### Personalize your function name and add multiple arguments

In [12]:
# Create a function that takes a variable x and returns x + 2
def add_two(x):
    answer = x + 2
    return answer

In [13]:
# print function and arguments
add_two(6)

8

### Apply Functions
`.apply()`

In [14]:
# Use the pets dataset for this example
df

Unnamed: 0,Name,Age,Color,Owner,Type
0,Zophie,7,Gray,Charles,Cat
1,Alice,5,Black,Charles,Cat
2,Bob,3,Brown,Cj,Dog
3,Charlie,2,White,Cj,Dog
4,Max,5,Black,Cj,Dog
5,Milo,3,Brown,Charles,Cat
6,Luna,2,White,Cj,Dog
7,Bella,5,Black,Charles,Cat


In [16]:
# Add 1 to the Age column of the pets dataset
df['Age'] = df['Age'].apply(lambda x: x + 1)
df['Age']

0    9
1    7
2    5
3    4
4    7
5    5
6    4
7    7
Name: Age, dtype: int64

In [17]:
# same as creating your own function
def function(x):
    answer = x * 10 
    return answer

df['Age'] = df['Age'].apply(function)
df['Age']

0    90
1    70
2    50
3    40
4    70
5    50
6    40
7    70
Name: Age, dtype: int64

In [18]:
# Also use the summarizing functions (mean, max, mean)
df['Age'].apply('mean')

60.0

In [19]:
df['Age'].apply('max')

90

In [21]:
# But the same as this
df['Age'] + 1

0    91
1    71
2    51
3    41
4    71
5    51
6    41
7    71
Name: Age, dtype: int64

But if you notice this code below gives us the same as our `plus_one()` function

Use case is in the conditional statements!!

In [24]:
# create a function that classifies 5 years and younger as 'young' and everything above 'old'
def classify_age(age):
    if age <= 5:
        return 'young'
    else:
        return 'old'
    
df['life_stage'] = df['Age'].apply(classify_age)
df

Unnamed: 0,Name,Age,Color,Owner,Type,life_stage
0,Zophie,90,Gray,Charles,Cat,old
1,Alice,70,Black,Charles,Cat,old
2,Bob,50,Brown,Cj,Dog,old
3,Charlie,40,White,Cj,Dog,old
4,Max,70,Black,Cj,Dog,old
5,Milo,50,Brown,Charles,Cat,old
6,Luna,40,White,Cj,Dog,old
7,Bella,70,Black,Charles,Cat,old


Create a new Column called duration in `new_df`
- using the apply function **row-rise**

In [25]:
# new_df
new_df = sample_df.groupby('pointNumber')[['pointStartTime']].first()
new_df


Unnamed: 0_level_0,pointStartTime
pointNumber,Unnamed: 1_level_1
1,25020.0
2,57814.0
3,61671.0


In [26]:
new_df['pointEndTime'] = sample_df.groupby('pointNumber')[['pointEndTime']].last()
new_df

Unnamed: 0_level_0,pointStartTime,pointEndTime
pointNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
1,25020.0,38549.0
2,57814.0,61681.0
3,61671.0,90130.0


In [27]:
def duration(end, start):
    duration = end - start
    return duration

new_df['duration'] = new_df.apply(lambda x: duration(x['pointEndTime'], x['pointStartTime']), axis = 1)
new_df


Unnamed: 0_level_0,pointStartTime,pointEndTime,duration
pointNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,25020.0,38549.0,13529.0
2,57814.0,61681.0,3867.0
3,61671.0,90130.0,28459.0


But as you notice this function could be simplified to...

In [28]:
new_df['duration2'] = new_df['pointEndTime'] - new_df['pointStartTime']
new_df

Unnamed: 0_level_0,pointStartTime,pointEndTime,duration,duration2
pointNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,25020.0,38549.0,13529.0,13529.0
2,57814.0,61681.0,3867.0,3867.0
3,61671.0,90130.0,28459.0,28459.0


Use case is in the conditional statements!!

### Problems to Practice Yourself!!

#### Problem 1: From the `(playsight)kaylan_usc.xlsx` dataset, classify the speed of the ball
- More than 110: **Fast**
- Less than 110 and greater than 70: **Regular**
- Less than 70: **Slow**


In [29]:
import pandas as pd

shots_excel = pd.read_excel('data/(playsight)kaylan_usc.xlsx', sheet_name='Shots')
shots_excel.head()

def classify_speed(speed):
    if speed > 110:
        return "fast"
    elif speed > 70:
        return "regular"
    else:
        return "slow"
    
shots_excel['ClassifiedSpeed'] = shots_excel['Speed (MPH)'].apply(classify_speed)
shots_excel


Unnamed: 0,Player,Shot,Type,Stroke,Spin,Speed (MPH),Point,Game,Set,Bounce Depth,...,Hit Side,Hit (x),Hit (y),Hit (z),Direction,Result,Favorited,Start Time,Video Time,ClassifiedSpeed
0,Kaylan Bigun,1,first_serve,Serve,Slice,86.335464,1,1,1,deep,...,near,0.978027,0.181941,2.421875,down the T,Out,False,22:45:24,25.020000,regular
1,Nathan Trouve,1,none,Backhand,Flat,35.898598,1,1,1,deep,...,far,-0.909668,24.365038,1.114258,inside in,In,False,22:45:25,26.020000,slow
2,Kaylan Bigun,1,second_serve,Serve,Slice,81.066826,1,1,1,short,...,near,0.976562,0.518099,2.332031,down the T,In,False,22:45:34,35.220001,regular
3,Nathan Trouve,2,second_return,Backhand,Flat,61.041412,1,1,1,deep,...,far,-1.298828,23.865038,0.971680,inside out,In,False,22:45:35,36.320000,slow
4,Kaylan Bigun,3,serve_plus_one,Forehand,Topspin,62.817978,1,1,1,out,...,near,0.910156,-0.470559,0.833008,inside out,Out,False,22:45:36,37.349998,slow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539,Nathan Trouve,1,first_serve,Serve,Flat,108.220818,115,20,2,short,...,far,1.132812,22.286913,2.345703,down the T,In,False,23:58:52,4433.919922,regular
540,Kaylan Bigun,2,first_return,Forehand,Topspin,36.660290,115,20,2,short,...,near,-3.189453,0.545066,1.249023,cross court,In,False,23:58:53,4434.580078,slow
541,Nathan Trouve,3,serve_plus_one,Backhand,Slice,42.250107,115,20,2,short,...,far,2.984375,23.365038,0.604004,down the line,In,False,23:58:55,4436.419922,slow
542,Nathan Trouve,4,return_plus_one,Overhead,Flat,60.578091,115,20,2,short,...,far,2.121094,17.849413,2.363281,---,In,False,23:58:58,4439.640137,slow


#### Problem 2: From the `Shot_Visuals_KaylanBigun_NathanTrouve.csv` dataset, create a new column called "shot_depth"
- Use the image below to classify shots as deep, short, and long
- Create a function that takes the arguments `shotLocationX` and `shotLocationY` to create the outputs **short**, **deep**, and **long**

In [None]:
from IPython.display import Image

Image(filename='images/court_dimensions.jpg', width=250)

# IN RELATION TO THIS DATASET'S COORDINATES

# Origin is at the center of the court
# 455 is the top of the court, -455 is the bottom of the court
# -210 is the left side of the court, 210 is the right side of the court

In [30]:
# read in the dataset
shots = pd.read_csv('data/Shot_Visuals_KaylanBigun_NathanTrouve.csv')
shots.head()

Unnamed: 0,pointScore,gameScore,setScore,isPointStart,pointStartTime,isPointEnd,pointEndTime,pointNumber,isBreakPoint,shotInRally,...,returnerName,shotHitBy,InsideOut,InsideIn,isDoubleFault,pointWonBy,lastShotError,serveResult,serveInPlacement,depth
0,0-0,0-0,0-0,1.0,25020.0,,,1,,1,...,Nathan Trouve,Kaylan Bigun,,,,,,2nd Serve In,T,
1,0-0,0-0,0-0,,36320.0,,,1,,2,...,Nathan Trouve,Nathan Trouve,1.0,,,,,,,Deep
2,0-0,0-0,0-0,,37349.0,,,1,,3,...,Nathan Trouve,Kaylan Bigun,1.0,,,,,,,
3,0-0,0-0,0-0,,38549.0,1.0,38549.0,1,,4,...,Nathan Trouve,Nathan Trouve,,,,Kaylan Bigun,1.0,,,Long
4,15-0,0-0,0-0,1.0,57814.0,1.0,61681.0,2,,1,...,Nathan Trouve,Kaylan Bigun,,,,Kaylan Bigun,,1st Serve In,Wide,


In [35]:
# create function
def classify_shot_depth(shotLocationX, shotLocationY):
    
    if abs(shotLocationX) > 157.5:
        return 'long'

    y_abs = abs(shotLocationY)

    if y_abs <= 157.5:
        return 'short'
    elif y_abs <= 242:
        return 'deep'
    else:
        return 'long'

In [38]:
# apply function
shots['ShotDepth'] = shots.apply(lambda x: classify_shot_depth(x['shotLocationX'], x['shotLocationY']), axis = 1)
shots.head()

Unnamed: 0,pointScore,gameScore,setScore,isPointStart,pointStartTime,isPointEnd,pointEndTime,pointNumber,isBreakPoint,shotInRally,...,shotHitBy,InsideOut,InsideIn,isDoubleFault,pointWonBy,lastShotError,serveResult,serveInPlacement,depth,ShotDepth
0,0-0,0-0,0-0,1.0,25020.0,,,1,,1,...,Kaylan Bigun,,,,,,2nd Serve In,T,,long
1,0-0,0-0,0-0,,36320.0,,,1,,2,...,Nathan Trouve,1.0,,,,,,,Deep,long
2,0-0,0-0,0-0,,37349.0,,,1,,3,...,Kaylan Bigun,1.0,,,,,,,,long
3,0-0,0-0,0-0,,38549.0,1.0,38549.0,1,,4,...,Nathan Trouve,,,,Kaylan Bigun,1.0,,,Long,long
4,15-0,0-0,0-0,1.0,57814.0,1.0,61681.0,2,,1,...,Kaylan Bigun,,,,Kaylan Bigun,,1st Serve In,Wide,,long
