In [1]:
import pandas as pd
import numpy as np
import math
import scipy.stats as stats

import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')

In [2]:
df=pd.read_csv('PGA Tour 2010-2018.csv')

In [3]:
golf=pd.read_csv('PGA_Data_Historical.csv')

In [4]:
golf.columns

Index(['Player Name', 'Season', 'Statistic', 'Variable', 'Value'], dtype='object')

In [5]:
set(golf.Variable)

{'Distance Analysis 180-190 yards - 7 Iron - (ROUNDS)',
 "Total 1 Putts - 10-15' - (TOTAL)",
 'Consecutive Par 5 Birdies - (EVENTS)',
 'Distance Analysis 200-210 yards - 3 Iron - (ROUNDS)',
 "GIR Putting Avg - > 35' - (PUTTS MADE)",
 'Longest Putts - (HOLE)',
 'Distance Analysis 240-260 yards - 2 Iron - (TOTAL ATTEMPTS DIST RANGE)',
 'Carry Distance - (LONGEST CARRY DISTANCE)',
 'Driving Pct. 300+ (All Drives) - (RELATIVE TO PAR)',
 'Proximity to Hole from Other Locations - (# OF SHOTS)',
 'Approaches from > 275 yards (Rgh) - (RELATIVE TO PAR)',
 'Distance Analysis 190-200 yards - 6 Iron - (TOTAL ATTEMPTS DIST RANGE)',
 'Approach 75-100 yards (RTP Score) - (ROUNDS)',
 'Distance Analysis 240-260 yards - 3 Iron - (%)',
 'Tenth Tee Late Lowest Round - (LOW RND)',
 'Average Putting Distance - GIR 3+ Putts - (ROUNDS)',
 'Distance Analysis 200-220 yards - 3 Iron - (ROUNDS)',
 'Distance Analysis 190-200 yards - 3 Iron - (TOTAL ATTEMPTS WITH CLUB)',
 'Percentage of Available Purse Won - (TOTAL

In [6]:
golf.Statistic.unique()

array(['Driving Distance', 'Driving Accuracy Percentage',
       'Greens in Regulation Percentage', 'Putting Average',
       'Par Breakers', 'Total Eagles', 'Total Birdies',
       'Scoring Average (Actual)', 'Money Leaders',
       'Sand Save Percentage', 'Par 3 Birdie or Better Leaders',
       'Par 4 Birdie or Better Leaders', 'Par 5 Birdie or Better Leaders',
       'Birdie or Better Conversion Percentage', 'Putts Per Round',
       'Scoring Average', 'All-Around Ranking', 'Total Driving',
       'Scrambling', 'Ryder Cup Points', 'PGA Championship Points',
       'Putts made Distance', 'Top 10 Finishes', 'Non-member Earnings',
       'Par 3 Scoring Average', 'Par 4 Scoring Average',
       'Par 5 Scoring Average', "3-Putt Avoidance - 15-20'",
       "3-Putt Avoidance - 20-25'", "3-Putt Avoidance > 25'",
       'Current Par or Better Streak', "Rounds in the 60's",
       'Money per Event Leaders', 'Eagles (Holes per)', 'Birdie Average',
       'World Money List', 'Ball Striking', '

In [7]:
drive=golf[golf['Statistic']=='Driving Distance']

In [8]:
driver=drive[drive['Variable']=='Driving Distance - (AVG.)']

In [9]:
driver=driver.rename(columns={'Value':'Drive_average'})

In [10]:
fairways=golf[golf['Variable']=='Hit Fairway Percentage - (%)']

In [11]:
fairways[fairways['Player Name']=='Joe Durant']

Unnamed: 0,Player Name,Season,Statistic,Variable,Value
48014,Joe Durant,2010,Hit Fairway Percentage,Hit Fairway Percentage - (%),76.05
369726,Joe Durant,2011,Hit Fairway Percentage,Hit Fairway Percentage - (%),76.44


In [12]:
scramble=golf[golf['Statistic']=='Scrambling']

In [13]:
scrambling=scramble[scramble['Variable']=='Scrambling - (%)']

In [14]:
scrambling[scrambling['Player Name']=='Tiger Woods']

Unnamed: 0,Player Name,Season,Statistic,Variable,Value
642151,Tiger Woods,2012,Scrambling,Scrambling - (%),63.17
953077,Tiger Woods,2013,Scrambling,Scrambling - (%),60.0
2447319,Tiger Woods,2018,Scrambling,Scrambling - (%),64.16


In [15]:
putt=golf[golf['Statistic']=='Overall Putting Average']

In [16]:
putting=putt[putt['Variable']=='Overall Putting Average - (AVG)']

In [17]:
putting=putting.rename(columns={'Variable':'Putting Average','Value':'Putts Average per hole'})

In [18]:
score=golf[golf['Statistic']=='Scoring Average']

In [19]:
scoring=score[score['Variable']=='Scoring Average - (AVG)']

In [20]:
scoring=scoring.rename(columns={'Variable':'Score AVG','Value':'Average Score per Round'})

In [21]:
gir=golf[golf['Statistic']=='Greens in Regulation Percentage']

In [22]:
greens=gir[gir['Variable']=='Greens in Regulation Percentage - (%)']

In [23]:
greens=greens.rename(columns={'Variable':'Greens in Regulation Percentage','Value':'Percentage of Greens hit in Regulation'})





In [24]:
scramfairway=pd.merge(fairways,scrambling,on=['Player Name','Season'],suffixes=('_fairways_%','_scrambling_%'),how='left')

In [25]:
fairscramputt=pd.merge(scramfairway,putting,on=['Player Name','Season'],how='left')

In [26]:
fairscramputtscore=pd.merge(fairscramputt,scoring,on=['Player Name','Season'],how='left')

In [27]:
total=pd.merge(fairscramputtscore,greens,on=['Player Name','Season'],how='left')

In [28]:
totals=pd.merge(total,driver,on=['Player Name','Season'],how='left')

In [35]:
col3=[  'Statistic_fairways_%',
       'Variable_fairways_%',
       'Statistic_scrambling_%', 'Variable_scrambling_%',
        'Statistic_x', 'Putting Average','Statistic_y', 
        'Statistic_x','Greens in Regulation Percentage', 'Statistic_y',
       'Variable','Score AVG']

In [36]:
totals.head()

Unnamed: 0,Player Name,Season,Statistic_fairways_%,Variable_fairways_%,Value_fairways_%,Statistic_scrambling_%,Variable_scrambling_%,Value_scrambling_%,Statistic_x,Putting Average,Putts Average per hole,Statistic_y,Score AVG,Average Score per Round,Statistic_x.1,Greens in Regulation Percentage,Percentage of Greens hit in Regulation,Statistic_y.1,Variable,Drive_average
0,Omar Uresti,2010,Hit Fairway Percentage,Hit Fairway Percentage - (%),76.36,Scrambling,Scrambling - (%),61.44,Overall Putting Average,Overall Putting Average - (AVG),1.52,Scoring Average,Scoring Average - (AVG),71.579,Greens in Regulation Percentage,Greens in Regulation Percentage - (%),68.22,Driving Distance,Driving Distance - (AVG.),272.0
1,Joe Durant,2010,Hit Fairway Percentage,Hit Fairway Percentage - (%),76.05,Scrambling,Scrambling - (%),59.09,Overall Putting Average,Overall Putting Average - (AVG),1.512,Scoring Average,Scoring Average - (AVG),70.365,Greens in Regulation Percentage,Greens in Regulation Percentage - (%),71.95,Driving Distance,Driving Distance - (AVG.),285.6
2,Craig Bowden,2010,Hit Fairway Percentage,Hit Fairway Percentage - (%),73.98,Scrambling,Scrambling - (%),58.31,Overall Putting Average,Overall Putting Average - (AVG),1.494,Scoring Average,Scoring Average - (AVG),72.229,Greens in Regulation Percentage,Greens in Regulation Percentage - (%),62.64,Driving Distance,Driving Distance - (AVG.),270.0
3,Brian Gay,2010,Hit Fairway Percentage,Hit Fairway Percentage - (%),73.86,Scrambling,Scrambling - (%),66.51,Overall Putting Average,Overall Putting Average - (AVG),1.557,Scoring Average,Scoring Average - (AVG),70.756,Greens in Regulation Percentage,Greens in Regulation Percentage - (%),63.44,Driving Distance,Driving Distance - (AVG.),266.4
4,Tim Clark,2010,Hit Fairway Percentage,Hit Fairway Percentage - (%),73.12,Scrambling,Scrambling - (%),62.67,Overall Putting Average,Overall Putting Average - (AVG),1.604,Scoring Average,Scoring Average - (AVG),70.246,Greens in Regulation Percentage,Greens in Regulation Percentage - (%),66.73,Driving Distance,Driving Distance - (AVG.),272.2


In [37]:
totals=totals.drop(col3,axis=1)

In [38]:
totals.head()

Unnamed: 0,Player Name,Season,Value_fairways_%,Value_scrambling_%,Putts Average per hole,Average Score per Round,Percentage of Greens hit in Regulation,Drive_average
0,Omar Uresti,2010,76.36,61.44,1.52,71.579,68.22,272.0
1,Joe Durant,2010,76.05,59.09,1.512,70.365,71.95,285.6
2,Craig Bowden,2010,73.98,58.31,1.494,72.229,62.64,270.0
3,Brian Gay,2010,73.86,66.51,1.557,70.756,63.44,266.4
4,Tim Clark,2010,73.12,62.67,1.604,70.246,66.73,272.2


In [39]:
total=totals.rename(columns={'Season_fairways_%':'Seasons','Value_fairways_%':'fairways_hit_%','Value_scrambling_%':'Scrambling_%','Percentage of Greens hit in Regulation':'Percent_of_greens_in_reg'})






In [40]:
total['fairways_hit_%'] = total['fairways_hit_%'].astype(float)

In [41]:
total['Scrambling_%'] = total['Scrambling_%'].astype(float)

In [42]:
total['Putts Average per hole'] = total['Putts Average per hole'].astype(float)

In [43]:
total['Average Score per Round'] = total['Average Score per Round'].astype(float)

In [44]:
total['Percent_of_greens_in_reg'] = total['Percent_of_greens_in_reg'].astype(float)

In [45]:
total['Drive_average'] = total['Drive_average'].astype(float)

In [46]:
total.head(30)

Unnamed: 0,Player Name,Season,fairways_hit_%,Scrambling_%,Putts Average per hole,Average Score per Round,Percent_of_greens_in_reg,Drive_average
0,Omar Uresti,2010,76.36,61.44,1.52,71.579,68.22,272.0
1,Joe Durant,2010,76.05,59.09,1.512,70.365,71.95,285.6
2,Craig Bowden,2010,73.98,58.31,1.494,72.229,62.64,270.0
3,Brian Gay,2010,73.86,66.51,1.557,70.756,63.44,266.4
4,Tim Clark,2010,73.12,62.67,1.604,70.246,66.73,272.2
5,Jim Furyk,2010,72.03,63.47,1.597,69.828,67.12,276.0
6,Heath Slocum,2010,71.64,62.87,1.578,70.673,69.59,278.9
7,David Toms,2010,71.52,61.29,1.605,70.337,67.58,281.3
8,Alex Cejka,2010,71.32,56.42,1.609,71.219,66.6,277.4
9,Richard Johnson,2010,70.55,59.35,1.498,71.156,68.5,277.3
