In [1]:
import pandas as pd
import requests
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import GradientBoostingRegressor


pd.set_option('display.max_columns', None)

# Querying Data

## Get Schedule Data first

In [2]:
##Get Schedule for year
def get_sched(year,orgID =1):
    url = 'https://live-golf-data.p.rapidapi.com/schedule'
    
    querystring = {"orgId":str(orgID),"year":str(year)}
    
    headers = {
        "X-RapidAPI-Key": "25ddd59395msh5e48096c9dca05bp1515c2jsn9630e24f9d7d",
        "X-RapidAPI-Host": "live-golf-data.p.rapidapi.com"
    }
    response = requests.get(url, headers=headers, params=querystring)
    response = pd.json_normalize(response.json()['schedule'])
    response['year'] = year
    return response


In [3]:
#Get Schedule for all years where round level data is available

df_fullSched = pd.DataFrame()
for i in range(2021,2026):
    df_fullSched = pd.concat([df_fullSched,get_sched(i)])

In [7]:
df_fullSched['date.start.$date.$numberLong'] = pd.to_numeric(df_fullSched['date.start.$date.$numberLong'])
df_fullSched['StartDate'] = pd.to_datetime(df_fullSched['date.start.$date.$numberLong'], unit = 'ms')
df_fullSched.reset_index(inplace = True)
df_fullSched.describe()

Unnamed: 0,level_0,index,date.start.$date.$numberLong,year,StartDate
count,264.0,264.0,264.0,264.0,264
mean,131.5,26.07197,1682281000000.0,2022.984848,2023-04-23 20:21:49.090909184
min,0.0,0.0,1599696000000.0,2021.0,2020-09-10 00:00:00
25%,65.75,13.0,1642486000000.0,2022.0,2022-01-18 06:00:00
50%,131.5,26.0,1682251000000.0,2023.0,2023-04-23 12:00:00
75%,197.25,39.0,1722017000000.0,2024.0,2024-07-26 18:00:00
max,263.0,60.0,1765498000000.0,2025.0,2025-12-12 00:00:00
std,76.354437,15.582191,47657050000.0,1.386984,


## Get Round Level Data

In [26]:
def structure_tourney(tournID, year, name = None):
    url = "https://live-golf-data.p.rapidapi.com/leaderboard"
    
    querystring = {"orgId":"1","tournId":tournID,"year":year}

    headers = {
        "X-RapidAPI-Key": "25ddd59395msh5e48096c9dca05bp1515c2jsn9630e24f9d7d",
        "X-RapidAPI-Host": "live-golf-data.p.rapidapi.com"
    }

    response = requests.get(url, headers=headers, params=querystring)
    if response.status_code != 200:
        print("{0} error from {1} - {2} - {3}".format(response,tournID,name,year))
        return pd.DataFrame()
        
    df_res = pd.json_normalize(response.json()['leaderboardRows'], record_path = "rounds", meta = ["lastName", "firstName", "playerId", "total"])
    df_res['tournId'] = tournID
    df_res['year'] = year
    df_res['tournName'] = name
 
    return df_res


In [27]:

all_rounds = pd.DataFrame()
for i in df_fullSched.index:
    tourney = df_fullSched.iloc[i,:].loc['tournId']
    year = df_fullSched.iloc[i,:].loc['year']
    name = df_fullSched.iloc[i,:].loc['name']
    
    #Schema validationSome tournaments are missing data for individual rounds. TODO: move this to sturcture_tourney()
    try:
        temp = structure_tourney(tourney,year,name)
    
        temp['StartDate'] = df_fullSched.iloc[i,:].loc['StartDate']
        
        if i == 0:
            all_rounds = temp
        else:
            all_rounds = pd.concat([all_rounds,temp],ignore_index = True)
    except Exception as e:
        print(e)
        continue


<Response [400]> error from 470 - World Golf Championships-Dell Technologies Match Play - 2021
"Key 'rounds' not found. If specifying a record_path, all elements of data should have the path."
"Key 'rounds' not found. If specifying a record_path, all elements of data should have the path."
<Response [400]> error from 472 - Barracuda Championship - 2021
"Key 'rounds' not found. If specifying a record_path, all elements of data should have the path."
<Response [400]> error from 470 - World Golf Championships-Dell Technologies Match Play - 2022
"Key 'rounds' not found. If specifying a record_path, all elements of data should have the path."
<Response [400]> error from 472 - Barracuda Championship - 2022
"Key 'rounds' not found. If specifying a record_path, all elements of data should have the path."
'leaderboardRows'
"Key 'rounds' not found. If specifying a record_path, all elements of data should have the path."
"Key 'rounds' not found. If specifying a record_path, all elements of data s

In [28]:
all_rounds.to_csv('all_rounds_raw.csv')

In [15]:
all_rounds_typed =  all_rounds.convert_dtypes(infer_objects = False)
all_rounds_typed['strokes'] =  pd.to_numeric(all_rounds_typed['strokes.$numberInt'])

In [16]:
#all_rounds_typed
new_index = all_rounds_typed["year"].astype(str) + '_' + all_rounds_typed["tournId"].astype(str) + '_' + all_rounds_typed["roundId.$numberInt"].astype(str)
all_rounds_typed = all_rounds_typed.set_index(new_index)
all_rounds_typed['round_avg'] = all_rounds_typed.groupby(all_rounds_typed.index)["strokes"].transform('mean')

In [17]:
all_rounds_typed.head()

Unnamed: 0,scoreToPar,courseId,courseName,roundId.$numberInt,strokes.$numberInt,lastName,firstName,playerId,total,status,tournId,year,StartDate,strokes,round_avg
2023_464_1,-7,665,Innisbrook Resort (Copperhead),1,64,Burns,Sam,47504,-17,active,464,2023,2022-09-15 00:00:02.048,64,69.909722
2023_464_2,-4,665,Innisbrook Resort (Copperhead),2,67,Burns,Sam,47504,-17,active,464,2023,2022-09-15 00:00:02.048,67,70.5
2023_464_3,-4,665,Innisbrook Resort (Copperhead),3,67,Burns,Sam,47504,-17,active,464,2023,2022-09-15 00:00:02.048,67,69.666667
2023_464_4,-2,665,Innisbrook Resort (Copperhead),4,69,Burns,Sam,47504,-17,active,464,2023,2022-09-15 00:00:02.048,69,70.902778
2023_464_1,-6,665,Innisbrook Resort (Copperhead),1,65,Riley,Davis,47995,-17,active,464,2023,2022-09-15 00:00:02.048,65,69.909722


In [78]:
all_rounds_typed['adjusted_score'] = all_rounds_typed['strokes'] - all_rounds_typed['round_avg']
all_rounds_typed['Date'] = all_rounds_typed['StartDate'] + pd.to_timedelta(all_rounds_typed["roundId.$numberInt"].astype(int) - 1, unit = 'D')
all_rounds_typed = all_rounds_typed.reset_index()

In [77]:
all_rounds_typed.reset_index()

Unnamed: 0,index,scoreToPar,courseId,courseName,roundId.$numberInt,strokes.$numberInt,lastName,firstName,playerId,total,status,tournId,year,StartDate,strokes,round_avg,adjusted_score,Date,lag_1_adjusted_score,lag_2_adjusted_score,lag_3_adjusted_score,lag_4_adjusted_score,lag_5_adjusted_score,lag_6_adjusted_score,lag_7_adjusted_score,lag_8_adjusted_score,lag_9_adjusted_score,lag_10_adjusted_score
0,2023_464_1,+2,665,Innisbrook Resort (Copperhead),1,73,Love III,Davis,01706,+3,cut,464,2023,2022-09-15 00:00:02.048,73,69.909722,3.090278,2022-09-15 00:00:02.048,,,,,,,,,,
1,2023_464_2,+1,665,Innisbrook Resort (Copperhead),2,72,Love III,Davis,01706,+3,cut,464,2023,2022-09-15 00:00:02.048,72,70.5,1.5,2022-09-16 00:00:02.048,3.090278,,,,,,,,,
2,2023_500_1,+2,665,Innisbrook Resort (Copperhead),1,73,Love III,Davis,01706,+3,cut,500,2023,2022-09-21 23:59:28.256,73,69.909722,3.090278,2022-09-21 23:59:28.256,1.5,3.090278,,,,,,,,
3,2023_500_2,+1,665,Innisbrook Resort (Copperhead),2,72,Love III,Davis,01706,+3,cut,500,2023,2022-09-21 23:59:28.256,72,70.5,1.5,2022-09-22 23:59:28.256,3.090278,1.5,3.090278,,,,,,,
4,2023_054_1,+2,665,Innisbrook Resort (Copperhead),1,73,Love III,Davis,01706,+3,cut,054,2023,2022-09-29 00:01:05.536,73,69.909722,3.090278,2022-09-29 00:01:05.536,1.5,3.090278,1.5,3.090278,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26347,2023_550_2,+3,665,Innisbrook Resort (Copperhead),2,74,Suber,Jackson,60019,+2,cut,550,2023,2023-11-29 23:59:30.304,74,70.5,3.5,2023-11-30 23:59:30.304,0.090278,3.5,0.090278,3.5,0.090278,3.5,0.090278,3.5,0.090278,3.5
26348,2023_551_1,-1,665,Innisbrook Resort (Copperhead),1,70,Suber,Jackson,60019,+2,cut,551,2023,2023-12-08 00:00:44.032,70,69.909722,0.090278,2023-12-08 00:00:44.032,3.5,0.090278,3.5,0.090278,3.5,0.090278,3.5,0.090278,3.5,0.090278
26349,2023_551_2,+3,665,Innisbrook Resort (Copperhead),2,74,Suber,Jackson,60019,+2,cut,551,2023,2023-12-08 00:00:44.032,74,70.5,3.5,2023-12-09 00:00:44.032,0.090278,3.5,0.090278,3.5,0.090278,3.5,0.090278,3.5,0.090278,3.5
26350,2023_088_1,-1,665,Innisbrook Resort (Copperhead),1,70,Suber,Jackson,60019,+2,cut,088,2023,2023-12-14 00:00:33.792,70,69.909722,0.090278,2023-12-14 00:00:33.792,3.5,0.090278,3.5,0.090278,3.5,0.090278,3.5,0.090278,3.5,0.090278


In [20]:
def lag_obs_v2 (df, num_lag,lag_col ):
    df.sort_values(['playerId','Date'], inplace = True)
    for lag in range(1,num_lag + 1):
        df['lag_'+ str(lag) +'_' + str(lag_col)] = df.groupby('playerId',sort = False)[lag_col].shift(lag)


In [82]:
lag_obs_v2(all_rounds_typed,10,'adjusted_score')
X = all_rounds_typed[['playerId','tournId','Date','lag_1_adjusted_score','lag_2_adjusted_score','lag_3_adjusted_score','lag_4_adjusted_score','lag_5_adjusted_score','lag_6_adjusted_score','lag_7_adjusted_score','lag_8_adjusted_score','lag_9_adjusted_score','lag_10_adjusted_score']]
y = all_rounds_typed[['adjusted_score']]
##Train Test split

X_train = X[X['Date'] < '2023-01-01']
y_train = y[X['Date'] < '2023-01-01']

X_test = X[X['Date'] >= '2023-01-01']
y_test = y[X['Date'] >= '2023-01-01']

X_train = X_train.drop('Date', axis = 1)
X_train = X_train.dropna()
y_train = y_train[y_train.index.isin(X_train.index)] 


X_test = X_test.drop('Date', axis = 1)
X_test = X_test.dropna()
y_test = y_test[y_test.index.isin(X_test.index)] 

In [83]:
all_rounds_typed.index

RangeIndex(start=0, stop=26352, step=1)

In [84]:
params = {
    "n_estimators": 600,
    "max_depth": 7,
    "min_samples_split": 5,
    "learning_rate": 0.001
    #"loss": "squared_error",
}
reg_v2 = GradientBoostingRegressor(**params)
reg_v2.fit(X_train, y_train.values.ravel())

In [85]:
reg_v2.score(X_test, y_test)

0.5766699417657957