[Twitter](https://x.com/dec1costello) | [GitHub](https://github.com/dec1costello) | [Kaggle](https://www.kaggle.com/dec1costello) | [LinkedIn](https://www.linkedin.com/in/declan-costello-7423aa137/)

<div class="alert alert-block alert-info">
    <b>Key Principles</b><br>
    - This notebook models Expected Strokes Gained across various lies.<br>
    - It then evaluates performance by comparing predicted values with actual outcomes to derive Strokes Gained.<br>
    - Finally, enriched Strokes Gained statistics are generated to provide deeper insights into player performance.<br>
</div>


## **Table of Contents**
1. [Installation](#Installation)
2. [Data Import](#Data-Import)
3. [Model Import](#Model-Import)
4. [xS Predictions](#xS-Predictions)
5. [xS Aggregation](#xS-Aggregation)
6. [SG Calculation](#SG-Calculation)
7. [SG Enrichment](#SG-Enrichment)

# 📚 **Installation**

In [1]:
import bz2
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Silence noisy warnings for the rest of the notebook session.
# These calls mutate the process-wide warning filter list.

# Ignore every UserWarning, regardless of which module raises it.
warnings.simplefilter("ignore", category=UserWarning)
# Ignore pandas' chained-assignment warning.
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
# Ignore FutureWarning only when it originates from the category_encoders
# contrast-encoder module (the `module` argument is matched against the
# name of the module that triggers the warning).
warnings.filterwarnings("ignore", category=FutureWarning, module='category_encoders.base_contrast_encoder')

# 📂 **Data Import**

In [2]:
# Load the input table that every later cell enriches column by column.
# Presumably one row per shot (later cells sort and group on
# player_id / round / hole / shot) - confirm against the upstream notebook.
INPUT_CSV = "FE_golf.csv"
result_df = pd.read_csv(INPUT_CSV)

# 🤖 **Model Import**

In [3]:
def _load_compressed_model(path):
    """Return the object stored in a bz2-compressed joblib file at *path*.

    NOTE: joblib.load unpickles the file contents, so only point this at
    model files that come from a trusted source.
    """
    with bz2.BZ2File(path, 'rb') as handle:
        return joblib.load(handle)


# Model used for every shot that is not played from the green.
approachModel = _load_compressed_model('approachModel.joblib.bz2')

# Model used for shots played from the green.
puttingModel = _load_compressed_model('puttingModel.joblib.bz2')

# **⛳ Expected Strokes Predictions**

### **OTT xS**

In [4]:
# Off-the-tee expected strokes (xS_OTT): the field-average hole_score for
# each (hole, round), measured on first-shot rows only.

# Keep one contribution per player/round/hole by looking at shot 1 only.
first_shots = result_df[result_df['shot'] == 1]

# hole_score summed per player, round and hole ...
per_player_totals = first_shots.groupby(['player_id', 'round', 'hole'])['hole_score'].sum()

# ... then averaged across players for every (hole, round) pair.
player_hole_score = per_player_totals.groupby(['hole', 'round']).mean().rename('xS_OTT')

# Broadcast the average onto every row of the same hole and round.
# A left join keeps all existing rows.
ott_lookup = player_hole_score.reset_index()
result_df = result_df.merge(ott_lookup, on=['hole', 'round'], how='left')


### **Approach xS**

In [5]:
# Approach expected strokes (xS_A): predicted by approachModel for every
# shot that is NOT played from the green. Shots from the green keep the
# 0.0 placeholder.
result_df['xS_A'] = 0.0

# Pass the model every column except the output column being filled in
# ('xS_A'). Presumably the fitted model picks out the features it was
# trained on - confirm against the training notebook.
feature_cols = [name for name in result_df.columns if name != 'xS_A']

# Build the row filter once and reuse it for both the read and the write,
# so predictions land on exactly the rows they were computed from.
off_green = result_df['from_location_scorer'] != 'Green'
X_input = result_df.loc[off_green, feature_cols].copy()
result_df.loc[off_green, 'xS_A'] = approachModel.predict(X_input)


### **Putting xS**

In [6]:
# Putting expected strokes (xS_G): predicted by puttingModel for shots
# played from the green. Every other shot keeps the 0.0 placeholder.
result_df['xS_G'] = 0.0

# Pass the model every column except the output column being filled in
# ('xS_G'). Presumably the fitted model picks out the features it was
# trained on - confirm against the training notebook.
feature_cols = [name for name in result_df.columns if name != 'xS_G']

# Build the row filter once and reuse it for both the read and the write,
# so predictions land on exactly the rows they were computed from.
on_green = result_df['from_location_scorer'] == 'Green'
X_input = result_df.loc[on_green, feature_cols].copy()
result_df.loc[on_green, 'xS_G'] = puttingModel.predict(X_input)


# **Expected Strokes Aggregation**

### **True xS**

In [7]:
# "True" expected strokes: pick the estimate that matches where the shot
# was played from, then floor it at one stroke.
#   Tee Box -> xS_OTT
#   Green   -> xS_G
#   other   -> xS_A
shot_origin = result_df['from_location_scorer']
xs_by_origin = np.select(
    [shot_origin == 'Tee Box', shot_origin == 'Green'],
    [result_df['xS_OTT'], result_df['xS_G']],
    default=result_df['xS_A'],
)

# Any value below 1 is raised to 1.
result_df['xS_True'] = np.maximum(xs_by_origin, 1)


### **Raw xS**

In [8]:
# "Raw" expected strokes: the unmodified model output. Shots from the green
# use the putting prediction; everything else, tee shots included, uses the
# approach prediction. No floor is applied here.
played_from_green = result_df['from_location_scorer'] == 'Green'
result_df['xS_Raw'] = np.where(played_from_green, result_df['xS_G'], result_df['xS_A'])

# **Strokes Gained Calculation**

### **True SG**

In [9]:
# True Strokes Gained per shot:
#   SG = xS(before the shot) - xS(before the next shot) - 1
# For the final shot of a hole there is no next shot, so SG = xS - 1.
hole_keys = ['player_id', 'round', 'hole']

# shift(-1) only returns the next shot if rows are in shot order.
result_df = result_df.sort_values(by=hole_keys + ['shot'])

# Expected strokes of the following shot on the same player/round/hole.
# This is NaN on the last recorded shot of each hole.
result_df['xS_true_next'] = result_df.groupby(hole_keys)['xS_True'].shift(-1)

# A missing "next" value marks the last shot of the hole.
last_shot_mask = result_df['xS_true_next'].isna()

result_df['SG_True'] = np.where(
    last_shot_mask,
    result_df['xS_True'] - 1,
    result_df['xS_True'] - result_df['xS_true_next'] - 1,
)

### **Raw SG**

In [10]:
# Raw Strokes Gained per shot, computed from the un-floored xS_Raw values:
#   SG = xS(before the shot) - xS(before the next shot) - 1
# For the final shot of a hole there is no next shot, so SG = xS - 1.
hole_keys = ['player_id', 'round', 'hole']

# shift(-1) only returns the next shot if rows are in shot order.
result_df = result_df.sort_values(by=hole_keys + ['shot'])

# Expected strokes of the following shot on the same player/round/hole.
# This is NaN on the last recorded shot of each hole.
result_df['xS_raw_next'] = result_df.groupby(hole_keys)['xS_Raw'].shift(-1)

# A missing "next" value marks the last shot of the hole.
last_shot_mask = result_df['xS_raw_next'].isna()

result_df['SG_Raw'] = np.where(
    last_shot_mask,
    result_df['xS_Raw'] - 1,
    result_df['xS_Raw'] - result_df['xS_raw_next'] - 1,
)

# **Strokes Gained Enrichment**

### **Chunk SG**

In [11]:
# "Chunk" Strokes Gained: expected strokes at this shot minus the value in
# strokes_to_hole_out (presumably the strokes the player actually needed
# from here - confirm). Computed once per xS flavour.
actual_remaining = result_df['strokes_to_hole_out']
chunk_sources = {'SG_Raw_Chunk': 'xS_Raw', 'SG_True_Chunk': 'xS_True'}
for chunk_col, xs_col in chunk_sources.items():
    result_df[chunk_col] = result_df[xs_col] - actual_remaining

### **True SG per Hole**

In [12]:
# Hole-level Strokes Gained: the off-the-tee expectation for the hole
# (xS_OTT) minus the value in hole_score. The same figure is written on
# every row of the table.
expected_from_tee = result_df['xS_OTT']
recorded_hole_score = result_df['hole_score']
result_df['SG_True_per_Hole'] = expected_from_tee - recorded_hole_score

### **Rolling True SG Totals per Shot**

In [13]:
# Running True Strokes Gained totals.
#
# Both totals are grouped by player_id, the same player key used for the
# xS_OTT and SG calculations, rather than last_name. Grouping on last_name
# would merge the running totals of any two players who share a surname.
#
# cumsum accumulates in the current row order, so this relies on result_df
# still being sorted by player_id, round, hole, shot (done in the SG cells).

# How each shot moves the player's running total within one hole of a round.
player_hole_groups = result_df.groupby(['player_id', 'round', 'hole'])
result_df['SG_True_Rolling_per_Shot_per_Hole_per_Round'] = player_hole_groups['SG_True'].cumsum()

# How each shot moves the player's running total across all of their rows.
result_df['SG_True_Rolling_per_Shot'] = result_df.groupby(['player_id'])['SG_True'].cumsum()

### **SG Percentiles**

In [14]:
# Bucket every shot into a Strokes Gained category, then rank SG_True
# within comparable shots.

# Distance buckets are closed on the left (right=False):
# [0, 50) -> '50-0', [50, 100) -> '100-50', ... , [200, inf) -> '200+'.
bins = [0, 50, 100, 150, 200, float('inf')]
labels = ['50-0', '100-50', '150-100', '200-150', '200+']
distance_bins = pd.cut(result_df['distance_to_pin'], bins=bins, labels=labels, right=False)

# Register the extra labels up front so they are valid categorical values:
# '' for rows whose distance fell outside every bucket (or was missing),
# 'Putting' / 'OTT' for the location-based overrides below.
distance_bins = distance_bins.cat.add_categories(['', 'Putting', 'OTT'])
result_df['SG_bins'] = distance_bins.fillna('')

# Where the shot was played from overrides the distance bucket.
shot_origin = result_df['from_location_scorer']
result_df.loc[shot_origin == 'Green', 'SG_bins'] = 'Putting'
result_df.loc[shot_origin == 'Tee Box', 'SG_bins'] = 'OTT'

# Percentile rank (0-1] of SG_True among shots that share the same par,
# bucket and round.
comparable_shots = result_df.groupby(['par_value', 'SG_bins', 'round'], observed=False)
result_df['SG_True_binned_percentile'] = comparable_shots['SG_True'].rank(pct=True)


# 📩 **Save**

In [15]:
# Persist the enriched table. The DataFrame index is written as the first
# CSV column because to_csv is called with its default index setting.
OUTPUT_CSV = 'SG.csv'
result_df.to_csv(OUTPUT_CSV)