# Specialist Certificate in Data Analytics Essentials Project

## Chapter 1: Introduction

I have chosen to undertake a project examining strokes gained data for professional golfers on the PGA Tour.

## Chapter 2: Importing Data

### Web Scraping

In [18]:
from bs4 import BeautifulSoup
import requests
# Page holding the official FedExCup standings table.
url = "https://www.pgatour.com/fedexcup/official-standings.html"
# A timeout stops the cell from hanging indefinitely if the server never responds.
result = requests.get(url, timeout=30)
# Fail here on an HTTP error status rather than parsing an error page further down.
result.raise_for_status()

In [19]:
# Parse the downloaded markup with Python's built-in HTML parser, then print
# an indented rendering of the whole document for inspection.
soup = BeautifulSoup(result.text, features="html.parser")
print(soup.prettify())

<!DOCTYPE HTML>
<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="width=device-width, initial-scale=1.0, shrink-to-fit=no" name="viewport">
   <link href="https://www.pgatour.com/fedexcup/official-standings.html" rel="canonical"/>
   <meta content="FedExCup - Official Standings | PGA TOUR" name="title"/>
   <meta content="The current cumulative points for the year that the player has earned in the FedExCup points race." name="description"/>
   <meta content="pga-tour/9wzdncrfhz2c" name="msApplication-ID"/>
   <meta content="xp_dFTcmlaq2Fgq3gyZfIjWo0KjdEC4cfaO4nWx5yPA" name="google-site-verification"/>
   <!-- Google Authorship and Publisher Markup -->
   <!-- Schema.org markup for Google+ http://schema.org/Article -->
   <meta content="FedExCup - Official Standings | PGA TOUR" itemprop="name"/>
   <meta content="The current cumulative points for the year that the player has earned in the FedExCup points race." itemprop="des

In [20]:
# Locate the standings table by its CSS classes.
table_element = soup.find("table", class_="table-styled table-fedexcup-standings")
# find() returns None when nothing matches; stop here with a clear message
# instead of failing later on table_element.find_all(...).
if table_element is None:
    raise ValueError("FedExCup standings table not found - the page layout may have changed")
print(table_element)

<table cellpadding="0" cellspacing="0" class="table-styled table-fedexcup-standings">
<tr>
<th>RANK THIS<br/>WEEK</th>
<th class="hidden-small">RANK LAST<br/>WEEK</th>
<th>PLAYER<br/>NAME</th>
<th class="hidden-medium hidden-small">Events</th>
<th class="hidden-small">Points</th>
<th class="hidden-medium hidden-small"># of Wins</th>
<th class="hidden-medium hidden-small"># of Top-10s</th>
<th>Points behind Lead</th>
<th class="hidden-medium hidden-small">Reset Points</th>
</tr>
<tr class="odd">
<td><b>1</b></td>
<td class="hidden-small">1</td>
<td><a href="/players/player.46046.scottie-scheffler.html">Scottie Scheffler</a></td>
<td class="hidden-medium hidden-small">16</td>
<td class="hidden-small">
                    2,842</td>
<td class="hidden-medium hidden-small">
                    4</td>
<td class="hidden-medium hidden-small">
                    7</td>
<td>
                    0</td>
<td class="hidden-medium hidden-small">
                    --</td>
</tr>
<tr>
<td><b>2</b></t

In [21]:
# Gather the <td> cells of every table row, keeping only rows with exactly
# nine cells. The header row holds <th> cells, so its <td> list is empty and
# it is dropped by the same length check.
table_rows = [
    cells
    for cells in (tr.find_all("td") for tr in table_element.find_all("tr"))
    if len(cells) == 9
]

In [22]:
# Split the row-wise cells into one list per column of interest.
# Cell positions: 0 = rank this week, 2 = player name, 3 = events,
# 4 = points, 5 = wins, 6 = top-10 finishes.
fedex_ranking = [cells[0] for cells in table_rows]
player_name = [cells[2] for cells in table_rows]
events_played = [cells[3] for cells in table_rows]
fedex_points = [cells[4] for cells in table_rows]
num_wins = [cells[5] for cells in table_rows]
num_top_10s = [cells[6] for cells in table_rows]
                

In [23]:
import re

# Strip the HTML tags from each player cell and swap non-breaking spaces for
# ordinary spaces, in a single pass over the list.
player_name = [
    re.sub(r'<.*?>', '', str(cell)).replace('\xa0', ' ')
    for cell in player_name
]

In [24]:
def remove_tags(lists):
    """Strip HTML tags from scraped table cells and convert each one to an integer.

    Parameters
    ----------
    lists : iterable
        Cell values (for example BeautifulSoup ``<td>`` tags or plain strings).
        Each item is converted with ``str()`` before cleaning.

    Returns
    -------
    list of int
        One integer per input item, in the same order.

    Raises
    ------
    ValueError
        If a cleaned cell is not a valid integer literal (for example an
        empty cell).
    """
    # DOTALL lets '.' match newlines, so a tag whose attributes wrap onto
    # several lines is still removed in full.
    tag_pattern = re.compile(r'<.*?>', re.DOTALL)
    cleaned = []
    for element in lists:
        text = tag_pattern.sub('', str(element))
        # Drop thousands separators ("2,842" -> "2842") and turn the "--"
        # placeholder into zero.
        text = text.replace(',', '').replace('--', '0')
        # int() ignores the surrounding whitespace/newlines left by the markup.
        cleaned.append(int(text))
    return cleaned


# Convert every numeric column from raw table cells to plain integers.
fedex_ranking, events_played, fedex_points, num_wins, num_top_10s = map(
    remove_tags,
    (fedex_ranking, events_played, fedex_points, num_wins, num_top_10s),
)

In [25]:
import pandas as pd 
# Assemble the cleaned columns into a DataFrame and save it as CSV.
# to_csv is called with its defaults, so the row index is written too and
# shows up as an "Unnamed: 0" column when such a file is read back.
pd.DataFrame(
    {
        'Player Name': player_name,
        'Fedex Ranking': fedex_ranking,
        'Events Played': events_played,
        'Fedex Points': fedex_points,
        'Number of Wins': num_wins,
        'Number of Top 10s': num_top_10s,
    }
).to_csv('fedex_cup_rankings.csv')

### Importing Data from a CSV File into a Pandas DataFrame

In [26]:
from pathlib import Path

# Build the file paths with pathlib so the separators are right on every
# operating system; backslash-separated path strings only resolve on Windows.
DATA_DIR = Path("Documents") / "UCD Project"

# FedExCup standings snapshot.
fedex_cup = pd.read_csv(DATA_DIR / "fedex_cup_rankings_230522.csv")
print(fedex_cup.head())
# Strokes-gained statistics per player.
strokes_gained = pd.read_csv(DATA_DIR / "strokes_gained_data_230522.csv")
print(strokes_gained.head())

   Unnamed: 0        Player Name  Fedex Ranking  Events Played  Fedex Points  \
0           0  Scottie Scheffler              1             16          2842   
1           1      Cameron Smith              2             11          1603   
2           2          Sam Burns              3             15          1601   
3           3      Justin Thomas              4             13          1568   
4           4   Hideki Matsuyama              5             14          1544   

   Number of Wins  Number of Top 10s  
0               4                  7  
1               2                  5  
2               2                  6  
3               1                  8  
4               2                  5  
          player_name  events_played  wins  x_wins  x_wins_majors  \
0      Thomas, Justin             14     1   0.345          0.090   
1      Smith, Cameron             12     2   1.757          0.069   
2       McIlroy, Rory             11     1   0.247          0.269   
3  Scheff

## Chapter 3: Analysing Data

In [27]:
def _surname_first_to_full_name(names):
    """Turn "Surname, First" strings into "First Surname".

    Values without a comma are returned unchanged, so running this again on
    names that were already converted does not alter them.

    Parameters
    ----------
    names : pandas.Series
        Series of player-name strings.

    Returns
    -------
    pandas.Series
        Series with the same index holding the reordered names.
    """
    has_comma = names.str.contains(',', regex=False, na=False)
    # Text after the last comma, with surrounding whitespace removed.
    first_name = names.str.rsplit(',').str[-1].str.strip()
    # Text before the first comma (lazy match up to the first ',').
    surname = names.str.extract('(.+?),', expand=False)
    # Keep the reordered value only where a comma was present; otherwise the
    # concatenation would be NaN and the original name would be lost.
    return (first_name + " " + surname).where(has_comma, names)


strokes_gained['player_name'] = _surname_first_to_full_name(strokes_gained['player_name'])

In [28]:
# Display the frame to inspect the rewritten player_name column.
strokes_gained

Unnamed: 0,player_name,events_played,wins,x_wins,x_wins_majors,rounds_played,shotlink_played,putt_raw,arg_raw,app_raw,ott_raw,t2g_raw,total_raw
0,Justin Thomas,14,1,0.345,0.090,55,43,0.322515,0.395559,0.915848,0.373701,1.685109,1.871881
1,Cameron Smith,12,2,1.757,0.069,42,35,0.715735,0.268823,1.217806,-0.120312,1.366317,2.012048
2,Rory McIlroy,11,1,0.247,0.269,42,38,0.329505,0.618240,0.408667,0.932341,1.959247,1.973157
3,Scottie Scheffler,17,4,0.866,0.793,65,44,0.458728,0.396876,0.695335,0.179576,1.271787,1.057540
4,Jon Rahm,12,1,1.023,0.000,46,39,0.043139,-0.193457,0.493248,1.280379,1.580171,1.349366
...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,Cormac Sharvin,10,0,0.000,0.000,21,13,-0.034146,-0.105258,-1.027368,-1.360492,-2.493119,-2.153761
444,Ted Potter Jr,8,0,0.000,0.000,19,3,-1.041467,-2.123030,-0.645564,-0.933956,-3.702550,-2.611746
445,Richard S. Johnson,5,0,0.000,0.000,13,3,-1.138339,-0.939325,-1.025755,-1.475916,-3.440996,-3.125817
446,Kevin Stadler,7,0,0.000,0.000,17,5,-1.973688,-1.006540,-1.632037,-0.812414,-3.450991,-3.813691


In [32]:
# Inner join on the player name: only players present in both data sets are kept.
df = pd.merge(
    strokes_gained,
    fedex_cup,
    how='inner',
    left_on='player_name',
    right_on='Player Name',
)

In [33]:
# Display the merged strokes-gained / FedExCup frame.
df

Unnamed: 0.1,player_name,events_played,wins,x_wins,x_wins_majors,rounds_played,shotlink_played,putt_raw,arg_raw,app_raw,ott_raw,t2g_raw,total_raw,Unnamed: 0,Player Name,Fedex Ranking,Events Played,Fedex Points,Number of Wins,Number of Top 10s
0,Justin Thomas,14,1,0.345,0.090,55,43,0.322515,0.395559,0.915848,0.373701,1.685109,1.871881,3,Justin Thomas,4,13,1568,1,8
1,Cameron Smith,12,2,1.757,0.069,42,35,0.715735,0.268823,1.217806,-0.120312,1.366317,2.012048,1,Cameron Smith,2,11,1603,2,5
2,Rory McIlroy,11,1,0.247,0.269,42,38,0.329505,0.618240,0.408667,0.932341,1.959247,1.973157,13,Rory McIlroy,14,8,1190,1,5
3,Scottie Scheffler,17,4,0.866,0.793,65,44,0.458728,0.396876,0.695335,0.179576,1.271787,1.057540,0,Scottie Scheffler,1,16,2842,4,7
4,Jon Rahm,12,1,1.023,0.000,46,39,0.043139,-0.193457,0.493248,1.280379,1.580171,1.349366,8,Jon Rahm,9,12,1279,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,Bo Van Pelt,12,0,0.000,0.000,27,12,-0.525409,-0.007384,-1.192706,-0.098910,-1.299000,-1.348408,220,Bo Van Pelt,221,9,35,0,0
223,Davis Love III,7,0,0.000,0.000,19,9,-0.388319,-0.304571,-1.071721,0.022630,-1.353662,-1.241088,242,Davis Love III,243,5,3,0,0
224,Jonas Blixt,13,0,0.000,0.000,30,22,0.022044,-0.141398,-1.504132,-0.249205,-1.894736,-2.149036,231,Jonas Blixt,232,13,15,0,0
225,Ben Crane,6,0,0.000,0.000,15,4,0.265817,-0.104388,-0.891400,0.086690,-0.909097,-1.848540,243,Ben Crane,244,5,3,0,0


In [36]:
# Order the merged rows by ascending Fedex Ranking (rank 1 first).
df = df.sort_values('Fedex Ranking', ascending=True)