In [1]:
import urllib.parse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, Markdown, display
from sklearn.cluster import KMeans

#### Browser-like request headers to help our scraper navigate the web

In [2]:
# Browser-like request headers sent with every HTTP request in this notebook.
# Header names and media types must be spelled exactly: a misspelled name is
# just an unknown header to the server and is silently ignored.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) "
        "Gecko/20100101 Firefox/98.0"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}

## Basketball Reference Example

### Check the robots.txt rules for the Basketball Reference website

In [3]:
# Page we want to scrape.
url = "https://www.basketball-reference.com/teams/MIN/2024.html"

# Build the robots.txt URL from the scheme and host of the target page.
parsed = urllib.parse.urlparse(url)
robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"

# requests waits indefinitely by default; a timeout keeps this cell from
# hanging forever if the server never answers.
response = requests.get(robots_url, headers=headers, timeout=10)
response.raise_for_status()

print(response.text)

User-agent: AhrefsBot
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: Twitterbot
Disallow:

User-agent: *
Disallow: /basketball/
Disallow: /blazers/
Disallow: /dump/
Disallow: /fc/
Disallow: /my/
Disallow: /7103
Disallow: /play-index/*.cgi?*
Disallow: /play-index/plus/*.cgi?*
Disallow: */gamelog/
Disallow: */splits/
Disallow: */on-off/
Disallow: */lineups/
Disallow: */shooting/

Disallow: /req/
Disallow: /short/
Disallow: /nocdn/

Crawl-delay: 3

# Disallow the plagiarism.org robot, www.slysearch.com
User-agent: SlySearch
User-agent: GroundControl
User-agent: Ground-Control
User-agent: Carmine
User-agent: Skynet
User-agent: The-Matrix
User-agent: Matrix
User-agent: HAL9000
Disallow: /            #Will disallow or robot from all urls on your site




### We are allowed to grab this page, so let's set up the HTTP request

In [4]:
# Download the team page. requests waits indefinitely by default, so pass a
# timeout to keep the cell from hanging if the server never answers.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

# Parse the HTML with Python's built-in parser.
soup = BeautifulSoup(response.text, "html.parser")

# Show the page title as a quick check that we fetched the right page.
soup.title

<title>2023-24 Minnesota Timberwolves Roster and Stats | Basketball-Reference.com</title>

### The Minnesota Timberwolves are doing well this season, so let's look at some stats to see why. First, let's see which tables of statistics are available on the page.

In [5]:
# List the id of every <table> the parser can see on the page.
tables = soup.find_all("table")

for table in tables:
    # Named `table_id` rather than `id` so the builtin id() is not shadowed
    # for the rest of the kernel session.
    table_id = table.get("id")
    print(table_id)

roster
per_game
totals
advanced


### Let's investigate the advanced stats table, since its numbers are already derived metrics that summarize each player's value

In [6]:
# Pull out the advanced-stats table by its id.
table = soup.find("table", id="advanced")

# find() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError in a later cell.
if table is None:
    raise ValueError('No <table id="advanced"> found in the parsed page')

In [7]:
# Collect every <td> with class "right" in the table; later cells read the
# stat values out of these cells. Print how many were found.
stats = table.find_all(name="td", attrs={"class": "right"})
stat_count = len(stats)
print(stat_count)

360


In [8]:
# Read the "csk" attribute of every <td class="left"> cell; .get() yields
# None for a cell that has no such attribute.
name_cells = table.find_all("td", class_="left")
names = [cell.get("csk") for cell in name_cells]

print(len(names))

15


In [9]:
# One list per advanced stat we keep, filled in the order cells appear.
PER = []
TS = []
TRB = []
AST = []
USG = []
WS = []
VORP = []

# Maps a cell's "data-stat" attribute to the list that collects that stat.
columns_by_stat = {
    "per": PER,
    "ts_pct": TS,
    "trb_pct": TRB,
    "ast_pct": AST,
    "usg_pct": USG,
    "ws": WS,
    "vorp": VORP,
}

for stat in stats:
    # Named `stat_name` rather than `type` so the builtin type() is not
    # shadowed for the rest of the kernel session.
    stat_name = stat.get("data-stat")

    # Cells for stats we do not track are skipped.
    if stat_name in columns_by_stat:
        columns_by_stat[stat_name].append(stat.text)

In [10]:
# Convert the scraped text values to floats, one stat column at a time.
PER = list(map(float, PER))
TS = list(map(float, TS))
TRB = list(map(float, TRB))
AST = list(map(float, AST))
USG = list(map(float, USG))
WS = list(map(float, WS))
VORP = list(map(float, VORP))

# Show one converted column as a sanity check.
PER

[19.7,
 19.1,
 17.0,
 15.4,
 13.4,
 11.2,
 18.9,
 9.7,
 7.5,
 10.6,
 14.8,
 12.6,
 29.5,
 17.7,
 -12.6]

In [11]:
# Assemble one row per player: the name plus the seven advanced stats.
# pandas is already imported in the first cell, so it is not re-imported here.
# The constructor raises ValueError if the lists differ in length.
df = pd.DataFrame(
    {
        "Name": names,
        "PER": PER,
        "TS": TS,
        "TRB": TRB,
        "AST": AST,
        "USG": USG,
        "WS": WS,
        "VORP": VORP,
    }
)
df.head()

Unnamed: 0,Name,PER,TS,TRB,AST,USG,WS,VORP
0,"Edwards,Anthony",19.7,0.573,8.9,24.6,32.5,1.1,0.5
1,"Towns,Karl-Anthony",19.1,0.596,15.1,13.8,26.8,1.3,0.5
2,"Gobert,Rudy",17.0,0.565,21.0,5.6,16.0,1.4,0.2
3,"Conley,Mike",15.4,0.648,6.0,24.2,13.5,1.4,0.4
4,"Anderson,Kyle",13.4,0.579,8.6,22.2,14.5,0.7,0.3


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,PER,TS,TRB,AST,USG,WS,VORP
count,15.0,15.0,15.0,15.0,15.0,15.0,15.0
mean,13.633333,0.549933,10.58,11.933333,19.826667,0.566667,0.14
std,9.008144,0.200338,8.663981,10.076328,9.881257,0.548591,0.21314
min,-12.6,0.0,0.0,0.0,6.2,0.0,-0.2
25%,10.9,0.5085,5.0,5.2,14.1,0.1,0.0
50%,14.8,0.565,8.6,8.7,16.0,0.4,0.0
75%,18.3,0.622,15.25,19.2,23.85,1.05,0.3
max,29.5,1.0,27.9,32.5,46.0,1.4,0.5


### The Timberwolves are averaging a PER of 13.6 per player, which is high, indicating their players are highly skilled on average. The average usage rate is also high at 20%, indicating they are sharing the ball and getting everyone involved offensively, a hallmark of team success.

In [13]:
# Number of clusters to look for.
k = 3

# Cluster on the stat columns only. Selecting them by name keeps `Name` (and
# the `Cluster` column a later cell adds) out of the features, so this cell
# gives the same input to KMeans no matter how often it is re-run.
feature_cols = ["PER", "TS", "TRB", "AST", "USG", "WS", "VORP"]
features = df[feature_cols].to_numpy()

# random_state pins the centroid initialisation so the labels are reproducible;
# n_init is explicit because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)

labels = kmeans.labels_

print("Cluster labels:", labels)

Cluster labels: [0 1 1 0 0 1 1 1 1 1 0 1 2 1 1]




In [14]:
# Attach each player's cluster label as a new `Cluster` column and show the result.
df = df.assign(Cluster=labels)
df

Unnamed: 0,Name,PER,TS,TRB,AST,USG,WS,VORP,Cluster
0,"Edwards,Anthony",19.7,0.573,8.9,24.6,32.5,1.1,0.5,0
1,"Towns,Karl-Anthony",19.1,0.596,15.1,13.8,26.8,1.3,0.5,1
2,"Gobert,Rudy",17.0,0.565,21.0,5.6,16.0,1.4,0.2,1
3,"Conley,Mike",15.4,0.648,6.0,24.2,13.5,1.4,0.4,0
4,"Anderson,Kyle",13.4,0.579,8.6,22.2,14.5,0.7,0.3,0
5,"McDaniels,Jaden",11.2,0.648,5.0,7.9,14.9,0.7,0.0,1
6,"Reid,Naz",18.9,0.664,10.3,4.8,21.2,1.0,0.3,1
7,"Alexander-Walker,Nickeil",9.7,0.519,5.0,16.2,13.7,0.4,0.1,1
8,"Milton,Shake",7.5,0.505,7.1,10.0,17.9,0.2,-0.2,1
9,"Brown,Troy",10.6,0.46,15.4,8.5,15.3,0.1,0.0,1


### Luka Garza appears to be far and away the best player because of his inflated and inaccurate PER. Let's remove the outliers and try clustering one more time.

In [15]:
# Rows 12 and 14 are the PER outliers seen above (29.5 and -12.6).
# errors="ignore" lets this cell be re-run after the rows are already gone
# instead of raising KeyError.
df = df.drop(index=[12, 14], errors="ignore")

# Cluster on the stat columns only: the `Cluster` column left over from the
# first run must not be fed back into KMeans as a feature.
feature_cols = ["PER", "TS", "TRB", "AST", "USG", "WS", "VORP"]
features = df[feature_cols].to_numpy()

# random_state pins the centroid initialisation so the labels are reproducible;
# n_init is explicit because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)

labels = kmeans.labels_

# Overwrite the old labels with the ones from this outlier-free run.
df = df.assign(Cluster=labels)

df



Unnamed: 0,Name,PER,TS,TRB,AST,USG,WS,VORP,Cluster
0,"Edwards,Anthony",19.7,0.573,8.9,24.6,32.5,1.1,0.5,0
1,"Towns,Karl-Anthony",19.1,0.596,15.1,13.8,26.8,1.3,0.5,0
2,"Gobert,Rudy",17.0,0.565,21.0,5.6,16.0,1.4,0.2,2
3,"Conley,Mike",15.4,0.648,6.0,24.2,13.5,1.4,0.4,1
4,"Anderson,Kyle",13.4,0.579,8.6,22.2,14.5,0.7,0.3,1
5,"McDaniels,Jaden",11.2,0.648,5.0,7.9,14.9,0.7,0.0,2
6,"Reid,Naz",18.9,0.664,10.3,4.8,21.2,1.0,0.3,2
7,"Alexander-Walker,Nickeil",9.7,0.519,5.0,16.2,13.7,0.4,0.1,1
8,"Milton,Shake",7.5,0.505,7.1,10.0,17.9,0.2,-0.2,2
9,"Brown,Troy",10.6,0.46,15.4,8.5,15.3,0.1,0.0,2


## BOOM! And now we have some interesting results which I will explore in the summary.