## **Imports**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import numpy as np

## **Data Import**

In [2]:
# Read strokes.csv (path resolved against the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="strokes.csv")

## **SG Section**

In [3]:
# Number of distinct player IDs present in the data.
n_players = df["player_id"].unique().size
print(f'There are {n_players} players')

There are 30 players


In [4]:
# Keep only the rows recorded for the first shot of each hole.
shot_1 = df[df['shot'] == 1]

# Sum hole_score per (player, round, hole). Despite its name, avg_round is a
# per-hole score table, not a per-round average.
avg_round = (
    shot_1
    .groupby(['player_id', 'round', 'hole'])['hole_score']
    .sum()
    .reset_index()
)
avg_round

Unnamed: 0,player_id,round,hole,hole_score
0,1810,1,1,4
1,1810,1,2,2
2,1810,1,3,4
3,1810,1,4,4
4,1810,1,5,5
...,...,...,...,...
2155,33141,4,14,3
2156,33141,4,15,4
2157,33141,4,16,4
2158,33141,4,17,5


In [5]:
# Mean hole_score across all players for every (hole, round) combination,
# stored under the column name 'sg_per_hole'.
sg_calc = (
    avg_round
    .groupby(['hole', 'round'])['hole_score']
    .mean()
    .rename('sg_per_hole')
    .reset_index()
)
sg_calc

Unnamed: 0,hole,round,sg_per_hole
0,1,1,3.833333
1,1,2,4.033333
2,1,3,4.166667
3,1,4,4.000000
4,2,1,3.066667
...,...,...,...
67,17,4,4.300000
68,18,1,2.900000
69,18,2,3.066667
70,18,3,3.166667


In [6]:
# Columns carried forward into the per-hole analysis table.
selected_columns = [
    'player_id', 'last_name', 'round', 'hole',
    'hole_score', 'par_value', 'yardage',
]

# Work on a copy so that columns added later do not touch shot_1.
condensed_df = shot_1.loc[:, selected_columns].copy()
condensed_df

Unnamed: 0,player_id,last_name,round,hole,hole_score,par_value,yardage
0,1810,Mickelson,1,1,4,4,424
4,1810,Mickelson,1,2,2,3,214
6,1810,Mickelson,1,3,4,4,387
10,1810,Mickelson,1,4,4,4,440
14,1810,Mickelson,1,5,5,4,520
...,...,...,...,...,...,...,...
8403,33141,Bradley,4,14,3,4,442
8406,33141,Bradley,4,15,4,5,525
8410,33141,Bradley,4,16,4,4,481
8414,33141,Bradley,4,17,5,4,470


In [7]:
# Attach the field average for each (round, hole) to every player-hole row and
# derive the strokes-gained difference from it.
#
# A single keyed merge replaces the two row-wise `apply` lookups, which
# re-filtered sg_calc once per row (and did the same lookup twice).
# Only the key columns are merged, so re-running this cell never collides with
# an 'sg_per_hole' column added by a previous run.
field_avg = condensed_df[['round', 'hole']].merge(
    sg_calc[['round', 'hole', 'sg_per_hole']],
    on=['round', 'hole'],
    how='left',               # keeps condensed_df's row order
    validate='many_to_one',   # sg_calc must hold one row per (round, hole)
)['sg_per_hole'].to_numpy()   # positional values: the merge resets the index

# The row-wise lookup failed loudly on a missing (round, hole); keep that guarantee.
assert not pd.isna(field_avg).any(), "some (round, hole) pairs are missing from sg_calc"

# Positive sg_difference: the player took fewer strokes than the field average.
condensed_df['sg_difference'] = field_avg - condensed_df['hole_score'].to_numpy()
condensed_df['sg_per_hole'] = field_avg

# Display the resulting DataFrame
condensed_df


Unnamed: 0,player_id,last_name,round,hole,hole_score,par_value,yardage,sg_difference,sg_per_hole
0,1810,Mickelson,1,1,4,4,424,-0.166667,3.833333
4,1810,Mickelson,1,2,2,3,214,1.066667,3.066667
6,1810,Mickelson,1,3,4,4,387,-0.033333,3.966667
10,1810,Mickelson,1,4,4,4,440,-0.233333,3.766667
14,1810,Mickelson,1,5,5,4,520,-0.766667,4.233333
...,...,...,...,...,...,...,...,...,...
8403,33141,Bradley,4,14,3,4,442,0.766667,3.766667
8406,33141,Bradley,4,15,4,5,525,0.366667,4.366667
8410,33141,Bradley,4,16,4,4,481,0.166667,4.166667
8414,33141,Bradley,4,17,5,4,470,-0.700000,4.300000


# Strokes gained sanity check

In [8]:
# First 20 rows of the per-(hole, round) field averages.
sg_calc.head(20)

Unnamed: 0,hole,round,sg_per_hole
0,1,1,3.833333
1,1,2,4.033333
2,1,3,4.166667
3,1,4,4.0
4,2,1,3.066667
5,2,2,3.266667
6,2,3,2.866667
7,2,4,3.1
8,3,1,3.966667
9,3,2,3.866667


In [17]:
# First 20 rows of the player-level table, limited to the columns needed to
# check by eye that sg_difference equals sg_per_hole - hole_score.
condensed_df.head(20)[
    ['round', 'hole', 'hole_score', 'sg_per_hole', 'sg_difference', 'par_value']
]


Unnamed: 0,round,hole,hole_score,sg_per_hole,sg_difference,par_value
0,1,1,4,3.833333,-0.166667,4
4,1,2,2,3.066667,1.066667,3
6,1,3,4,3.966667,-0.033333,4
10,1,4,4,3.766667,-0.233333,4
14,1,5,5,4.233333,-0.766667,4
19,1,6,3,3.333333,0.333333,3
22,1,7,4,4.1,0.1,4
26,1,8,4,3.8,-0.2,4
30,1,9,5,4.766667,-0.233333,5
35,1,10,4,4.066667,0.066667,4


# Append SciPy normal fit!

In [10]:
# Display the current per-hole table for reference before fitting.
condensed_df

Unnamed: 0,player_id,last_name,round,hole,hole_score,par_value,yardage,sg_difference,sg_per_hole
0,1810,Mickelson,1,1,4,4,424,-0.166667,3.833333
4,1810,Mickelson,1,2,2,3,214,1.066667,3.066667
6,1810,Mickelson,1,3,4,4,387,-0.033333,3.966667
10,1810,Mickelson,1,4,4,4,440,-0.233333,3.766667
14,1810,Mickelson,1,5,5,4,520,-0.766667,4.233333
...,...,...,...,...,...,...,...,...,...
8403,33141,Bradley,4,14,3,4,442,0.766667,3.766667
8406,33141,Bradley,4,15,4,5,525,0.366667,4.366667
8410,33141,Bradley,4,16,4,4,481,0.166667,4.166667
8414,33141,Bradley,4,17,5,4,470,-0.700000,4.300000


In [18]:
# sg_difference values of every player for round 1, hole 2.
#
# The selection does not depend on any individual row, so it is computed once;
# the previous iterrows() loop repeated the identical filter for every row of
# condensed_df. The result is always a NumPy array (empty if nothing matches),
# so downstream calls such as .mean() work.
is_target_hole = (condensed_df['round'] == 1) & (condensed_df['hole'] == 2)
list_of_sg_differences = condensed_df.loc[is_target_hole, 'sg_difference'].to_numpy()

In [38]:
# Inspect the selected sg_difference values.
list_of_sg_differences

array([ 1.06666667,  0.06666667,  0.06666667, -0.93333333,  0.06666667,
       -0.93333333, -0.93333333,  0.06666667,  0.06666667,  0.06666667,
        0.06666667,  0.06666667,  0.06666667, -0.93333333,  0.06666667,
        0.06666667,  0.06666667,  0.06666667,  0.06666667,  0.06666667,
        1.06666667,  0.06666667,  0.06666667,  0.06666667, -0.93333333,
       -0.93333333,  0.06666667,  0.06666667,  1.06666667,  1.06666667])

In [41]:
# Sanity check: the values are deviations from the field average of the same
# hole, so their mean should be zero up to floating-point error.
np.mean(list_of_sg_differences)

2.0724163126336256e-16

In [42]:
# Fit a normal distribution to the selected sg_difference values.
# `norm` is already imported in the notebook's first cell, so it is not
# re-imported here. For the normal distribution, `norm.fit` returns the
# estimated (loc, scale), i.e. the mean and the standard deviation.
params = norm.fit(list_of_sg_differences)
fitted_mean, fitted_std = params

print("Fitted Mean:", fitted_mean)
print("Fitted Standard Deviation:", fitted_std)

Fitted Mean: 2.0724163126336256e-16
Fitted Standard Deviation: 0.5734883511361751


In [None]:
^ OK, now do this for every hole in every round, then look at which holes had the largest fluctuation in variance by subtracting the minimum from the maximum variance across the rounds of each hole. My guess is that the most variance will be on a tough pin placement on a par three, which could depend on wind.

Then split into back-nine and front-nine performance for each day.