# Imports

In [20]:
import os

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Getting the Golf Data from an API

Here we pull the data from the ***DataGolf.com API***:

Only PGA events are used. The following code works through the API's JSON format to get the data into the structure required for data cleaning, factor creation and modeling.

NOTE: The API key is not given here because DataGolf is a paid subscription.

In [21]:
# get list of available events
# The API key is read from the DATAGOLF_API_KEY environment variable so the secret is
# never stored in the notebook; it falls back to '' (the previous placeholder) if unset.
key = os.environ.get('DATAGOLF_API_KEY', '')

# file_format needs a concrete value: the URL previously still contained the
# documentation placeholder "[ file_format ]".
sched_response = requests.get(
    'https://feeds.datagolf.com/historical-raw-data/event-list',
    params={'file_format': 'json', 'key': key},
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Fail here on a rejected key or server error instead of parsing an error payload
sched_response.raise_for_status()
sched = sched_response.json()

# Flatten the JSON event list into a DataFrame
schedtable = pd.json_normalize(sched)

In [24]:
# Preview the first five rows of the event schedule
schedtable.head(5)

Unnamed: 0,calendar_year,date,event_id,event_name,sg_categories,tour,traditional_stats
0,2022,2022-07-03,9600,Raiffeisen Pro Golf Tour St. Pölten 2022,no,pgt,no
1,2022,2022-07-03,30,John Deere Classic,yes,pga,yes
2,2022,2022-07-03,9599,PGA Championship Landeryd Masters,no,ngl,no
3,2022,2022-07-03,2,Portland,no,liv,no
4,2022,2022-07-03,9595,Asiad CC Busan Open,no,kor,no


In [25]:
# Only PGA events
pgaevents = schedtable[schedtable['tour']=='pga']

# Pull the tournament metadata needed to build the API URLs:
# one (event_id, calendar_year, tour) row per PGA tournament
events = pgaevents[['event_id','calendar_year','tour']].values

# Create the list of request URLs for the available tournament round data.
# The API key is interpolated from the `key` variable defined in the schedule cell
# instead of being hardcoded, so the secret is not stored in the notebook source.
requestlist = [
    f"https://feeds.datagolf.com/historical-raw-data/rounds?tour={tour}&event_id={event_id}&year={year}&file_format=json&key={key}"
    for event_id, year, tour in events
]

In [26]:
# Preview the first five request URLs.
# NOTE: each URL carries the API key as a query parameter, so clear this cell's
# output before sharing the notebook.
requestlist[:5]

['https://feeds.datagolf.com/historical-raw-data/rounds?tour=pga&event_id=30&year=2022&file_format=json&key=<YOUR_API_KEY>',
 'https://feeds.datagolf.com/historical-raw-data/rounds?tour=pga&event_id=34&year=2022&file_format=json&key=<YOUR_API_KEY>',
 'https://feeds.datagolf.com/historical-raw-data/rounds?tour=pga&event_id=26&year=2022&file_format=json&key=<YOUR_API_KEY>',
 'https://feeds.datagolf.com/historical-raw-data/rounds?tour=pga&event_id=32&year=2022&file_format=json&key=<YOUR_API_KEY>',
 'https://feeds.datagolf.com/historical-raw-data/rounds?tour=pga&event_id=23&year=2022&file_format=json&key=<YOUR_API_KEY>']

In [27]:
# Download the raw JSON payload for every tournament in the request list.
# Each payload has tournament metadata plus a 'scores' entry holding the JSON data
# for all the rounds in the tournament.
tournamentlist = []

# A single session reuses the connection across the many requests to the same host
with requests.Session() as session:
    for url in requestlist:
        # requests waits indefinitely unless a timeout is given
        response = session.get(url, timeout=30)
        # Stop on an HTTP error rather than appending an error payload that would
        # only fail later, when the 'scores' records are extracted
        response.raise_for_status()
        tournamentlist.append(response.json())

**Here is the structure of the JSON data:**

{'event_completed': '2022-07-03',

 'event_id': '30',
 
 'event_name': 'John Deere Classic',
 
 'scores': [{'dg_id': 21554,
   'fin_text': '1',
   'player_name': 'Poston, J.T.',
   
   'round_1': {'course_name': 'TPC Deere Run',
    'course_num': 669,
    'course_par': 71,
    'driving_acc': 0.929,
    'driving_dist': 292.6,
    'gir': 0.889,
    'prox_fw': 30.963,
    'prox_rgh': 90.86,
    'score': 62,
    'scrambling': 1.0, 
    'sg_app': 2.578,    
    'sg_arg': 2.668,    
    'sg_ott': 1.224,    
    'sg_putt': 2.825,    
    'sg_t2g': 6.47,    
    'sg_total': 9.295}, ...

In [28]:
# Flatten each tournament payload into a DataFrame: one row per entry in 'scores',
# with the event-level fields carried along as metadata columns.
# max_level=0 leaves the nested round_N dictionaries unexpanded, so each round
# stays a single object column for now.
event_meta = ['event_id', 'event_name', 'event_completed']
tourn_round = [
    pd.json_normalize(tournament, record_path=['scores'], meta=event_meta, max_level=0)
    for tournament in tournamentlist
]

In [29]:
# Stack the per-tournament frames row-wise into one frame.
# Each frame keeps its own row labels, so index values repeat across tournaments.
rounddata = pd.concat(tourn_round, axis=0)

# Column names, non-null counts and dtypes of the combined frame
rounddata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31805 entries, 0 to 31
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   dg_id            31805 non-null  int64 
 1   fin_text         31805 non-null  object
 2   player_name      31805 non-null  object
 3   round_1          31805 non-null  object
 4   round_2          31352 non-null  object
 5   round_3          18522 non-null  object
 6   round_4          17380 non-null  object
 7   event_id         31805 non-null  object
 8   event_name       31805 non-null  object
 9   event_completed  31805 non-null  object
dtypes: int64(1), object(9)
memory usage: 2.7+ MB


In [30]:
# Preview the first five rows; each round_N cell still holds a nested dictionary
rounddata.head()

Unnamed: 0,dg_id,fin_text,player_name,round_1,round_2,round_3,round_4,event_id,event_name,event_completed
0,21554,1,"Poston, J.T.","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...",30,John Deere Classic,2022-07-03
1,12808,T2,"Grillo, Emiliano","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...",30,John Deere Classic,2022-07-03
2,18103,T2,"Bezuidenhout, Christiaan","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...",30,John Deere Classic,2022-07-03
3,12359,T4,"Stallings, Scott","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...",30,John Deere Classic,2022-07-03
4,27774,T4,"Gotterup, Christopher","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...","{'course_name': 'TPC Deere Run', 'course_num':...",30,John Deere Classic,2022-07-03


Now each row is one player in one tournament, with a separate column for each round (1, 2, 3, 4). Each round value is a dictionary holding that player's statistics for that tournament round. We need to extract this information and move it into a pandas-compatible structure.

In [31]:
# Keep the player/event metadata as a list of per-row dictionaries (same row order
# as rounddata) so it can be joined back onto the round-level stats later
meta_columns = ['dg_id','player_name','event_id','event_name','event_completed','fin_text']
metadata = rounddata[meta_columns].to_dict(orient='records')

In [32]:
# Reshape from one row per player-tournament (round_1..round_4 columns) to one row
# per player-round.
#
# Plain dictionaries are collected and a single DataFrame is built at the end.
# Building a one-row DataFrame for every player-round and concatenating them all
# is far slower for the same result.
round_records = []

# iterate over each round column
for i in range(4):

    print("round"+ str(i+1))

    round_label = f"round_{i+1}"

    # round_x values for every player-tournament row, in the same order as `metadata`
    rd = rounddata[round_label].values

    for player_meta, round_stats in zip(metadata, rd):

        # make sure round data for player is not NaN (happens when player misses a cut)
        if isinstance(round_stats, dict):

            # 'index' stores the round label and comes first, then the metadata,
            # then the round stats; on a duplicate key the round stats win.
            # {**a, **b} is used instead of a | b so this also runs on Python < 3.9.
            round_records.append({'index': round_label, **player_meta, **round_stats})

# build the final frame once from all the records
round_level_data = pd.DataFrame(round_records)

round1
round2
round3
round4


In [33]:
# Name the column that stores the round number (it arrives as 'index'), then order
# the rows by event completion date and, within that, by round
round_level_data = (
    round_level_data
    .rename(columns={'index': 'round'})
    .sort_values(['event_completed', 'round'])
)
round_level_data.head(5)

Unnamed: 0,round,dg_id,player_name,event_id,event_name,event_completed,fin_text,course_name,course_num,course_par,...,prox_fw,prox_rgh,score,scrambling,sg_app,sg_arg,sg_ott,sg_putt,sg_t2g,sg_total
31773,round_1,14139,"Thomas, Justin",16,SBS Tournament of Champions,2017-01-08,1,Plantation Course at Kapalua,656,73,...,35.32,60.002,67,0.833,0.188,-0.908,1.479,2.584,0.759,3.344
31774,round_1,13562,"Matsuyama, Hideki",16,SBS Tournament of Champions,2017-01-08,2,Plantation Course at Kapalua,656,73,...,44.075,19.185,69,0.6,-0.267,0.34,1.143,0.127,1.216,1.344
31775,round_1,14636,"Spieth, Jordan",16,SBS Tournament of Champions,2017-01-08,T3,Plantation Course at Kapalua,656,73,...,24.038,36.653,72,0.2,1.758,-1.691,-0.471,-1.251,-0.405,-1.656
31776,round_1,6762,"Perez, Pat",16,SBS Tournament of Champions,2017-01-08,T3,Plantation Course at Kapalua,656,73,...,30.122,,69,0.667,1.007,0.057,0.175,0.103,1.24,1.344
31777,round_1,7489,"Moore, Ryan",16,SBS Tournament of Champions,2017-01-08,T3,Plantation Course at Kapalua,656,73,...,29.709,43.138,67,0.75,3.491,0.015,-0.349,0.186,3.157,3.344


**Great! We have our data, let's save it:**

In [34]:
# Write the round-level table to round_data.csv in the working directory
# (the row index is written as the first, unnamed column)
round_level_data.to_csv(path_or_buf='round_data.csv')