# PGA TOUR Summary

This notebook shows summary statistics from PGA TOUR Shotlink data for a chosen year.





# Section 1: Import packages, and load the selected year

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from os import getcwd
import os
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import plotly.io as pio; pio.renderers.default='notebook'
import pga_tour_sum_funs# DB: Custom functions to reduce size of notebook 
from IPython.display import Markdown as md
from sklearn.linear_model import LinearRegression

# Parameter choices
year = 2021# DB: 2015 - 2021
#min_drives = 25
min_events = 5

df, df_event = pga_tour_sum_funs.load_data(year)
df_measured, df_measured_player = pga_tour_sum_funs.get_measured_data(df, 0)
df_hole_average, df_hole_sum = pga_tour_sum_funs.get_hole_average_data(df, 0)
df_event_player = pga_tour_sum_funs.player_level_data(df, df_event, min_events)
#df_event_player = pga_tour_sum_funs.add_putting(df_event_player, df)
print('Data Loaded for ' + str(year))

Data Loaded for 2021


# Section 2: Driving Distance

## The figure below shows the distribution of measured drives on the PGA TOUR.  

There are generally two measured drives per rounds, on holes chosen to face in roughly opposite directions to negate the impact of wind, and with flat landing ares to avoid bounce and roll having too large an influence on the distance.

In [6]:
fig = px.histogram(df_measured, x="DrivingDistance_rounded_",
                   title='Driving Distance - Measured Drives (yards) - ' + str(year), histnorm='probability density', nbins = 80)

fig.update_layout(bargap=0.2)
fig.show()

## We can also look at the distribution of measured drives on an event by event basis, shown below.

We can see that there is considerable variabillity both between different events, and also within each event itself.

In [9]:
fig = px.box(df_measured, x="EventName", y='DrivingDistance_rounded_', \
             color="EventName", labels = '',\
             title='Distribution of Driving Distances by Event - ' + str(year))
fig.update_layout(
    xaxis_visible = False)
fig.show()

## The Top 20 longest players, along with a variety of other stats, are shown below.

In [10]:
df_temp = df_event_player.sort_values(by='Driving Distance', ascending=False).reset_index()
df_temp.head(20)

Unnamed: 0,index,Name,Player Number,Money,Driving Distance,Driving Accuracy,GIR,Putts Per Round
0,152,"DeChambeau, Bryson",47959,5409588.11,320.766667,58.018868,67.407407,28.5
1,23,"Clark, Wyndham",51766,986801.08,319.380952,46.075085,66.402116,28.547619
2,183,"McIlroy, Rory",28237,1782341.87,319.125,57.2,66.049383,28.722222
3,149,"Champ, Cameron",52372,645429.34,317.618421,56.766917,68.128655,30.315789
4,44,"Gordon, William",56762,307583.5,314.658333,54.186603,67.314815,29.383333
5,216,"Wolff, Matthew",56278,2248853.83,314.640625,50.225225,65.798611,28.9375
6,169,"Johnson, Dustin",30925,3686600.72,313.25,58.928571,70.238095,29.285714
7,73,"List, Luke",27129,538202.5,312.990196,53.792135,66.993464,29.372549
8,187,"Niemann, Joaquin",45486,2590638.3,312.71875,60.863095,71.527778,28.791667
9,25,"Daffue, MJ",39067,144276.0,312.464286,53.571429,69.84127,29.5


# Section 3: Driving and Accuracy 

## The figure below looks at the releationship between driving distance, accuracy and success.

The size of the bubbles represent the amount of money earned by that player over the course of the season.

In [11]:
# Bubble plot
fig = px.scatter(df_event_player, x="Driving Distance", y="Driving Accuracy", trendline = 'ols',
         color="Name", size="Money", 
                 hover_name="Name", title='Driving Distance, Accuracy and Money Earned - ' + str(year))

# Do regression
x = df_event_player["Driving Distance"].to_numpy().reshape(-1, 1)
y = df_event_player["Driving Accuracy"].to_numpy().reshape(-1, 1)
model = LinearRegression().fit(x,y)
rsq = model.score(x, y)

# Construct line to plot
lx = [np.min(x), np.max(x)]
ly = lx * model.coef_[0] + model.intercept_[0]

# Add line and text
fig.add_scatter(x=lx, y=ly, mode='lines')
fig.add_annotation(x=280, y=45,
            text='Accuracy = ' + str(np.round(model.coef_[0], 3)) +\
                   ' * Driving Distance + ' + str(np.round(model.intercept_[0],)), \
                  showarrow = False)
fig.add_annotation(x=280, y=42,
            text='r-sq = ' + str(np.round(rsq, 3)), showarrow = False)
fig.show()

# Section 4: Accuracy

## The distribution of driving accuracy and GIR percentage by player is shown below.

In [12]:
fig = px.histogram(df_event_player, x="Driving Accuracy",
                   title='Driving Accuracy Percentage by Player - ' + str(year),  nbins = 50)

fig.update_layout(bargap=0.2)
mean_acc = np.mean(df_event_player["Driving Accuracy"])

fig.add_annotation(x=50, y=20,
            text='Average Driving Accuracy % = ' + str(np.round(mean_acc, 1)), showarrow = False)
fig.update_layout(bargap=0.2)
fig.show()

fig = px.histogram(df_event_player, x="GIR",
                   title='GIR Percentage by Player - ' + str(year),  nbins = 50)

mean_gir = np.mean(df_event_player["GIR"])

fig.add_annotation(x=50, y=20,
            text='Average GIR % = ' + str(np.round(mean_gir, 1)), showarrow = False)
fig.update_layout(bargap=0.2)
fig.show()

## In a similar way as we did above with driving accuracy and driving distance, we can look at the relationship between driving accuracy, GIR and money earned.

In [13]:
fig = px.scatter(df_event_player, x="Driving Accuracy", y="GIR", 
         color="Name", size="Money", 
                 hover_name="Name", title='Driving Accuracy, GIR and Money Earned - ' + str(year))
# Do regression
x = df_event_player["Driving Accuracy"].to_numpy().reshape(-1, 1)
y = df_event_player["GIR"].to_numpy().reshape(-1, 1)
model = LinearRegression().fit(x,y)
rsq = model.score(x, y)

# Construct line to plot
lx = [np.min(x), np.max(x)]
ly = lx * model.coef_[0] + model.intercept_[0]

# Add line and text
fig.add_scatter(x=lx, y=ly, mode='lines')
fig.add_annotation(x=55, y=55,
            text='GIR = ' + str(np.round(model.coef_[0], 3)) +\
                   ' * Driving Accuracy + ' + str(np.round(model.intercept_[0],)), \
                  showarrow = False)
fig.add_annotation(x=55, y=52,
            text='r-sq = ' + str(np.round(rsq, 3)), showarrow = False)
fig.show()

## The Top 20 players in driving accuracy percentage, along with a variety of other stats, are shown below.

In [14]:
df_temp = df_event_player.sort_values(by='Driving Accuracy', ascending=False).reset_index()
df_temp.head(20)

Unnamed: 0,index,Name,Player Number,Money,Driving Distance,Driving Accuracy,GIR,Putts Per Round
0,208,"Todd, Brendon",30927,960939.16,275.8,72.714286,64.888889,28.28
1,238,"Moore, Ryan",26596,147266.0,287.975,72.142857,65.555556,29.4
2,143,"Ancer, Abraham",45526,1694962.28,289.96875,72.02381,71.064815,28.916667
3,121,"Stuard, Brian",31560,586186.66,281.241379,71.905941,65.900383,28.931034
4,173,"Kisner, Kevin",29478,1196576.44,287.029412,71.848739,66.830065,28.470588
5,202,"Simpson, Webb",29221,1628989.85,292.881579,71.428571,70.614035,28.710526
6,61,"Kim, Joohyung",55182,98258.0,294.305556,70.238095,66.358025,28.555556
7,35,"Garnett, Brice",29535,426153.17,288.351852,70.212766,67.489712,28.907407
8,168,"Im, Sungjae",39971,2604052.6,297.431818,69.891304,69.69697,28.69697
9,150,"Conners, Corey",39997,2440045.0,294.537037,69.652406,71.399177,29.240741


## And the table below shows a similar table for the Top 20 players in GIR percentage.

In [15]:
df_temp = df_event_player.sort_values(by='GIR', ascending=False).reset_index()
df_temp.head(20)

Unnamed: 0,index,Name,Player Number,Money,Driving Distance,Driving Accuracy,GIR,Putts Per Round
0,22,"Cink, Stewart",20229,1720715.0,305.452381,58.983051,74.074074,29.47619
1,87,"NeSmith, Matthew",36871,943614.49,297.634615,65.426997,73.290598,30.057692
2,46,"Grillo, Emiliano",31646,850127.59,299.35,64.200477,73.055556,29.383333
3,93,"Percy, Cameron",22056,505731.0,283.833333,63.265306,72.751323,29.642857
4,184,"Morikawa, Collin",50525,2588517.33,293.888889,69.642857,72.530864,28.861111
5,195,"Rahm, Jon",46970,2969382.33,306.0625,61.964286,72.5,29.4
6,140,"Wise, Aaron",49964,1254218.26,302.238095,54.280822,71.825397,30.02381
7,232,"Henley, Russell",34098,1650170.36,290.22619,67.241379,71.693122,28.952381
8,50,"Hahn, James",32448,1048184.5,299.845238,65.753425,71.560847,29.119048
9,187,"Niemann, Joaquin",45486,2590638.3,312.71875,60.863095,71.527778,28.791667


# Section 5: Putting

In [23]:
fig = px.histogram(df_event_player, x="Putts Per Round",
                   title='Putts per Round - ' + str(year), nbins = 80)

fig.update_layout(bargap=0.2)
mean_ppr = np.mean(df_event_player["Putts Per Round"])

fig.add_annotation(x=28, y=10,
            text='Average PPR = ' + str(np.round(mean_ppr, 2)), showarrow = False)
fig.show()
df_event_player

Unnamed: 0,Name,Player Number,Money,Driving Distance,Driving Accuracy,GIR,Putts Per Round
0,"Anderson, Mark",33120,82411.25,283.867647,59.873950,68.790850,29.676471
1,"Aphibarnrat, Kiradech",30978,115820.00,288.620000,57.714286,58.000000,28.280000
2,"Armour, Ryan",19803,344455.96,279.456522,69.626168,64.613527,28.608696
3,"Baddeley, Aaron",22371,65940.00,283.343750,56.919643,64.930556,28.500000
4,"Bae, Sangmoon",28259,33700.00,290.656250,56.250000,59.375000,28.562500
...,...,...,...,...,...,...,...
235,"Molinari, Francesco",25198,751581.00,292.178571,60.567010,63.690476,28.500000
236,"Koepka, Brooks",36689,2823715.33,308.166667,54.326923,67.962963,28.333333
237,"Harrington, Padraig",20766,151888.34,300.383333,55.048077,60.185185,28.900000
238,"Moore, Ryan",26596,147266.00,287.975000,72.142857,65.555556,29.400000


## The relationship between PPR, GIR and money earned is shown below.

In [24]:
# Bubble plot
fig = px.scatter(df_event_player, x="GIR", y="Putts Per Round",
         color="Name", size="Money", 
                 hover_name="Name", title='GIR, Putts Per Round and Money Earned - ' + str(year))

# Do regression
x = df_event_player["GIR"].to_numpy().reshape(-1, 1)
y = df_event_player["Putts Per Round"].to_numpy().reshape(-1, 1)
model = LinearRegression().fit(x,y)
rsq = model.score(x, y)

# Construct line to plot
lx = [np.min(x), np.max(x)]
ly = lx * model.coef_[0] + model.intercept_[0]

# Add line and text
fig.add_scatter(x=lx, y=ly, mode='lines')
fig.add_annotation(x=50, y=30.5,
            text='PPR = ' + str(np.round(model.coef_[0], 3)) +\
                   ' * GIR + ' + str(np.round(model.intercept_[0],)), \
                  showarrow = False)
fig.add_annotation(x=50, y=30,
            text='r-sq = ' + str(np.round(rsq, 3)), showarrow = False)
fig.show()

## The table below shows a similar table for the Top 20 players in PPR.

In [18]:
df_temp = df_event_player.sort_values(by='Putts Per Round', ascending=True).reset_index()
df_temp.head(20)

Unnamed: 0,index,Name,Player Number,Money,Driving Distance,Driving Accuracy,GIR,Putts Per Round
0,130,"Uihlein, Peter",29484,75356.0,286.55,44.285714,58.611111,27.75
1,196,"Reed, Patrick",34360,2514654.62,287.40625,64.508929,65.625,27.8125
2,67,"Lahiri, Anirban",31420,632237.25,296.828947,49.43609,63.74269,27.815789
3,113,"Spieth, Jordan",34046,3024056.04,297.522727,50.163934,63.636364,27.909091
4,117,"Stefani, Shawn",33418,0.0,274.090909,53.246753,51.010101,27.909091
5,77,"Malnati, Peter",34466,1333148.82,290.695652,57.763975,64.371981,27.913043
6,203,"Smith, Cameron",35891,2799044.67,296.821429,56.632653,66.534392,27.97619
7,57,"Hoge, Tom",35532,1145746.64,289.169643,61.053985,64.781746,28.053571
8,206,"Thomas, Justin",33448,5419760.76,300.434211,58.834586,69.736842,28.078947
9,163,"Homa, Max",39977,2664442.44,303.260417,56.586826,65.046296,28.083333
