In [1]:
import numpy as np
import pandas as pd
import json
import requests

Define function to pull round data from every player at a tournament from a given year.
* Inputs:
 * **course_ID** = PGA's designated tournament number (this is the name of the function's first argument)
 * **year** = year to grab data from
* Output: Pandas DataFrame with round data for every player at the tournament

In [16]:
def tourney(course_ID, year):
    '''
    Pull round-by-round scores for every player listed in one tournament.

    course_ID = ID placed in the request URL, e.g. '014'.
                NOTE(review): this has been described both as a course number
                and as a tournament number; the URL slot appears to be the
                tournament number -- confirm against the PGA feed.
    year = year of tournament to grab data from; it is put in the URL and
           copied into the Year column exactly as given

    Returns a pandas DataFrame with one row per player per round and the
    columns RoundNum, CourseID, RoundPos, RoundScore, PlayerID, Year and
    PlayerName.

    Raises whatever requests raises on a network failure (including a timeout
    after 30 seconds), ValueError if the response body is not JSON, and
    KeyError/IndexError if the JSON lacks the expected structure.
    '''
    url = 'http://www.pgatour.com/data/R/{}/{}/tournsum.json'.format(course_ID, year)
    # A timeout keeps a stalled connection from hanging the notebook kernel forever.
    tourn = requests.get(url, timeout=30).json() # load JSON for tournament and year specified
    data = tourn['years'][0]['tours'][0]['trns'][0] # skip the surrounding metadata, go straight to the tournament record
    out = {'RoundNum':[], 'CourseID':[], 'RoundPos':[], 'RoundScore':[], 'PlayerID':[], 'Year':[], 'PlayerName':[]} # one list per output column

    for plr in data['plrs']:
        # these two values are repeated on every round row for this player
        playername = plr['name']['first'] + ' ' + plr['name']['last']
        playerID = plr['plrNum']

        for rnd in plr['rnds']:
            out['Year'].append(year)
            out['PlayerName'].append(playername)
            out['PlayerID'].append(playerID)
            out['RoundNum'].append(rnd['rndNum'])
            out['CourseID'].append(rnd['courseNum'])
            out['RoundPos'].append(rnd['rndPos'])
            out['RoundScore'].append(rnd['rndScr'])

    return pd.DataFrame(out)

Define function to pull round data from a designated player at all tournaments from a given year.
* Inputs:
 * **PlayerName** = first and last name of player as a string
 * **year** = year to grab data from
* Output: Pandas DataFrame with round data for chosen player at all tournaments from given year

In [16]:
def player(PlayerName, year):
    '''
    Pull scores from all tournaments for a single player.

    PlayerName = first and last name of player, joined by a single space;
                 matched exactly against "<first> <last>" from the feed
    year = year to grab data from; it is put in each URL and copied into the
           Year column exactly as given

    Returns a pandas DataFrame with one row per round the player has in any
    of the fetched tournaments, with the columns RoundNum, CourseID,
    RoundPos, RoundScore, PlayerID, Year and PlayerName. The frame is empty
    when the player is not found anywhere.

    Raises whatever requests raises on a network failure (including a timeout
    after 30 seconds), ValueError if a response body is not JSON, and
    KeyError/IndexError if the JSON lacks the expected structure.
    '''
    out = {'RoundNum':[], 'CourseID':[], 'RoundPos':[], 'RoundScore':[], 'PlayerID':[], 'Year':[], 'PlayerName':[]} # one list per output column

    tournaments = course_info(year)
    # The 'CourseID' column holds the number of each event's first course, which
    # is not necessarily the ID the URL expects. Use a 'TournID' column when the
    # frame carries one; otherwise fall back to 'CourseID'.
    # NOTE(review): with only 'CourseID' available some requests may fail -- confirm.
    id_column = 'TournID' if 'TournID' in tournaments else 'CourseID'

    for tourn_ID in tournaments[id_column]:
        url = 'http://www.pgatour.com/data/R/{}/{}/tournsum.json'.format(tourn_ID, year)
        # A timeout keeps a stalled connection from hanging the notebook kernel forever.
        tourn = requests.get(url, timeout=30).json() # load JSON for this tournament and year
        entrants = tourn['years'][0]['tours'][0]['trns'][0]['plrs'] # list of player records for the tournament

        for entrant in entrants:
            # 'plrs' is a list, so pick out the requested player before reading rounds
            if entrant['name']['first'] + ' ' + entrant['name']['last'] != PlayerName:
                continue

            for rnd in entrant['rnds']:
                out['Year'].append(year)
                out['PlayerName'].append(PlayerName)
                out['PlayerID'].append(entrant['plrNum'])
                out['RoundNum'].append(rnd['rndNum'])
                out['CourseID'].append(rnd['courseNum'])
                out['RoundPos'].append(rnd['rndPos'])
                out['RoundScore'].append(rnd['rndScr'])

    return pd.DataFrame(out)

See if I can get tournament numbers from PGA website

In [46]:
def course_info(year):
    '''
    Pull course information into DataFrame.

    year = year to grab data from; it is put in each request URL as given

    Probes the IDs '000' through '100' one request at a time and keeps the
    ones whose response has the expected structure. A status line is printed
    for every ID probed.

    Returns a pandas DataFrame with one row per ID that loaded, with columns:
      CourseName = name of the first course listed for the event
      CourseID   = number of the first course listed for the event
      TournName  = the event's full name
      TournID    = the ID that was placed in the URL to load the event

    Any ordinary error while fetching or reading an ID (network failure,
    non-JSON body, missing key) marks that ID as invalid and moves on;
    KeyboardInterrupt and SystemExit are not swallowed, so the loop can be
    stopped by hand.
    '''
    out = {'CourseName':[], 'CourseID':[], 'TournName':[], 'TournID':[]} # one list per output column

    # probe the IDs '000' through '100' (zero-padded to three digits)
    for course in ['%03d' % (x,) for x in range(101)]:
        try:
            # A timeout keeps one stalled request from blocking the whole scan.
            tourn = requests.get('http://www.pgatour.com/data/R/{}/{}/tournsum.json'.format(course, year), timeout=30).json() # load JSON for this ID and year
            data = tourn['years'][0]['tours'][0]['trns'][0] # skip the surrounding metadata, go straight to the tournament record
            first_course = data['courses'][0]
            # Read every field before touching `out`, so a missing key cannot
            # leave the column lists with different lengths.
            course_name = first_course['courseName']
            course_num = first_course['courseNum']
            tourn_name = data['fullName']
        except Exception:
            print(course + ' is not a valid course ID')
            continue

        out['CourseName'].append(course_name)
        out['CourseID'].append(course_num)
        out['TournName'].append(tourn_name)
        out['TournID'].append(course)
        print('loading data from course #' + course)

    return pd.DataFrame(out)

In [47]:
# Scan the 2017 season. This issues one HTTP request per candidate ID
# (101 requests) and prints a status line for each, so it is slow to re-run.
courses_2017 = course_info('2017')

000 is not a valid course ID
001 is not a valid course ID
loading data from course #002
loading data from course #003
loading data from course #004
loading data from course #005
loading data from course #006
loading data from course #007
008 is not a valid course ID
loading data from course #009
loading data from course #010
loading data from course #011
loading data from course #012
loading data from course #013
loading data from course #014
015 is not a valid course ID
loading data from course #016
017 is not a valid course ID
loading data from course #018
loading data from course #019
loading data from course #020
loading data from course #021
022 is not a valid course ID
loading data from course #023
024 is not a valid course ID
loading data from course #025
loading data from course #026
loading data from course #027
028 is not a valid course ID
029 is not a valid course ID
loading data from course #030
031 is not a valid course ID
loading data from course #032
loading data from co

In [57]:
# Show the course numbers recorded for 2017, one per line.
# Note: several of them (e.g. 704, 510) lie outside the 000-100 range that was
# probed, so these values are not the same IDs that were used in the request URLs.
for course in courses_2017['CourseID']:
    print(course)

704
510
004
005
006
500
009
734
011
012
752
014
656
714
019
729
021
023
513
236
238
669
032
241
503
770
538
746
678
541


The following will be the big function that loops through years for a given tournament

In [55]:
years = list(range(1997, 2018)) # seasons to grab data from: 1997 through 2017 inclusive

In [None]:
for year in years:
    # get link to JSON data page from PGA website
    json = 'http://www.pgatour.com/data/R/014/{}/tournsum.json'.format(year)
    temp = pd.DataFrame(columns = ['RoundNum','CourseID','RoundPos','RoundScore','RelParScore','CumParScore','PlayerID','Year','PlayerName'])
    data = 