In [39]:
import pandas as pd
import requests
import lxml.html as lh
from bs4 import BeautifulSoup
from tabulate import tabulate
from urllib.request import urlopen
# NBA season we will be analyzing
year = 2019

# URL page we will scraping (see image above)
#url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
url = "https://www.basketball-reference.com/players/a/afflaar01.html"

# this is the HTML from the given URL
html = urlopen(url)

soup = BeautifulSoup(html)

# use findALL() to get the column headers
soup.findAll('tr', limit=2)

# use getText()to extract the text we need into a list
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

# exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
headers = headers[1:]

#avoid first header row
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]
stats = pd.DataFrame(player_stats, columns = headers)
stats

fin_stat = stats[0:12]
fin_stat

Unnamed: 0,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,22,DET,NBA,SG,75,9,12.9,1.3,3.2,0.411,...,0.782,0.5,1.3,1.8,0.7,0.4,0.1,0.5,1.1,3.7
1,23,DET,NBA,SG,74,8,16.7,1.8,4.1,0.437,...,0.817,0.4,1.4,1.8,0.6,0.4,0.2,0.6,1.9,4.9
2,24,DEN,NBA,SG,82,75,27.1,3.3,7.1,0.465,...,0.735,0.7,2.4,3.1,1.7,0.6,0.4,0.9,2.7,8.8
3,25,DEN,NBA,SG,69,69,33.7,4.5,9.1,0.498,...,0.847,0.7,3.0,3.6,2.4,0.5,0.4,1.0,2.2,12.6
4,26,DEN,NBA,SG,62,62,33.6,5.3,11.3,0.471,...,0.798,0.6,2.5,3.2,2.4,0.6,0.2,1.4,2.2,15.2
5,27,ORL,NBA,SF,64,64,36.0,6.2,14.1,0.439,...,0.857,0.5,3.3,3.7,3.2,0.6,0.2,2.2,2.1,16.5
6,28,ORL,NBA,SG,73,73,35.0,6.4,13.8,0.459,...,0.815,0.4,3.2,3.6,3.4,0.5,0.0,2.0,1.9,18.2
7,29,TOT,NBA,SG,78,72,32.1,4.8,11.3,0.424,...,0.843,0.3,2.8,3.2,1.7,0.5,0.1,1.5,2.1,13.3
8,29,DEN,NBA,SG,53,53,33.0,5.3,12.4,0.428,...,0.841,0.4,3.0,3.4,1.9,0.6,0.1,1.6,2.0,14.5
9,29,POR,NBA,SG,25,19,30.1,3.8,9.1,0.414,...,0.851,0.2,2.4,2.7,1.1,0.4,0.1,1.3,2.4,10.6


AssertionError: 30 columns passed, passed data had 29 columns

In [None]:
#1 python
# nba_player_stats_scraper.py - scrapes basketball-reference and turns into pandas dataframe
import requests
import bs4
import copy
import pandas as pd
import os
from dotenv import load_dotenv

# loading our environment variables
load_dotenv()

# url we're scraping
url_data = os.getenv("player_stats_url")

# getting our response
res = requests.get(url_data)
res.raise_for_status()  # raises exception if an issue with getting the url_data

# turning our response into soup
soup = bs4.BeautifulSoup(res.text, "html.parser")

# getting column headers for our data
column_headers = [th.getText() for th in
                  soup.findAll('tr', limit=1)[0].findAll('th')]

# getting data_rows (neccesary for getting player data)
data_rows = soup.findAll('tr')[1:]

# for some reason 'rank row' is in a 'tr' tag and all other data is in a 'td' tag,
# so delete the 'rank row' to get rid of the assertion error
column_headers.remove(column_headers[0])

# getting player data
player_data = [[td.getText() for td in data_rows[i].findAll('td')]
               for i in range(len(data_rows))]

# building our data frame
df_raw = pd.DataFrame(player_data, columns=column_headers)

# there are some blank columns with 'none' in them, so we'll get rid of them with notnull
df_raw = df_raw[df_raw.Player.notnull()]

# renaming columns for clarity
df_raw.rename(columns={'WS/48': 'WS-per-48',
                       'Player': 'Name', 'Tm': 'Team'}, inplace=True)

# replacing all column headers that have '%' with '-Perc' instead
df_raw.columns = df_raw.columns.str.replace('%', '-Perc')

# players who change teams show up more than once, we'll get the first entry from
# the table(which is there combined TOT stat) and drop the rest with drop_duplicates
df_raw = df_raw.drop_duplicates(['Name'], keep='first')

# argument convert_numeric changes types that have numbers to the most suitable type
df_raw = df_raw.apply(pd.to_numeric, errors='ignore')

# this gets rid of any NaN columns still left
df_raw = df_raw.dropna(axis=1, how='all')

# getting rid of any symbols that might be in the player names
df_raw['Name'] = df_raw['Name'].str.replace("\'|\\.", "")
df_raw['Name'] = df_raw['Name'].str.replace("\\-", " ")

# filtering by MP using quantile(65th)
mins_quan = df_raw.MP.quantile(q=.65)

# updating df by mins played
df = df_raw.loc[df_raw['MP'] >= mins_quan]

# PER sorted from highest to lowest
PER = df.sort_values('PER', axis=0, ascending=False)

# to avoid a 'SettingWithCopyWarning', make a deepcopy
nba_df = copy.deepcopy(df)

# making new column and weighting the equation
usage_weight = nba_df['USG-Perc'] * .5
rebound_weight = nba_df['TRB-Perc'] * .625  # rebounds = 1.25
assist_weight = nba_df['AST-Perc'] * .75  # assists = 1.5
steal_weight = nba_df['STL-Perc'] * 1  # steals = 2
block_weight = nba_df['BLK-Perc'] * 1  # blocks = 2
turn_over_weight = nba_df['TOV-Perc'] * .25  # turnovers = -.5

# adding new column using equation
nba_df['Z-Stat'] = ((usage_weight + rebound_weight + assist_weight +
                     steal_weight + block_weight) // turn_over_weight)

# filtering our player dataframe
nba_df = nba_df.loc[(nba_df['Z-Stat'] >= 10) & (
    nba_df['VORP'] > 0)].sort_values('Z-Stat', axis=0, ascending=False)

# renaming for export readability
player_df = nba_df

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

#res = requests.get("http://www.nationmaster.com/country-info/stats/Media/Internet-users")
res = requests.get("https://www.basketball-reference.com/players/a/afflaar01.html")
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))
print( tabulate(df[0], headers='keys', tablefmt='psql') )

+----+-----------+-------+------+------+-------+-----+------+-------+-------+-------+---------+-------+-------+---------+-------+-------+---------+---------+-------+-------+---------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|    | Season    |   Age | Tm   | Lg   | Pos   |   G |   GS |    MP |    FG |   FGA |     FG% |    3P |   3PA |     3P% |    2P |   2PA |     2P% |    eFG% |    FT |   FTA |     FT% |   ORB |   DRB |   TRB |   AST |   STL |   BLK |   TOV |    PF |   PTS |
|----+-----------+-------+------+------+-------+-----+------+-------+-------+-------+---------+-------+-------+---------+-------+-------+---------+---------+-------+-------+---------+-------+-------+-------+-------+-------+-------+-------+-------+-------|
|  0 | 2007-08   |    22 | DET  | NBA  | SG    |  75 |    9 |  12.9 |   1.3 |   3.2 |   0.411 |   0.1 |   0.6 |   0.208 |   1.2 |   2.6 |   0.461 |   0.432 |   0.9 |   1.2 |   0.782 |   0.5 |   1.3 |   1.8 |   0.7 |   0.4 |   0.1 | 

In [10]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.basketball-reference.com/players/b/barneha02/gamelog/2019"
ht = requests.get(url)

soup = BeautifulSoup(ht.content,'html.parser')
print("HTML",soup.content)
print(soup.prettify())
# use findALL() to get the column headers
col_hed = soup.findAll('tr',limit=2)
print("Column head", col_hed)
# use getText()to extract the text we need into a list
headers = [th.getText() for th in col_hed[0].findAll('th')]

# exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
headers 


HTML None
<!DOCTYPE html>
<html class="no-js" data-root="/home/bbr/www" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
   <link href="https://d2p3bygnnzw9w3.cloudfront.net/req/201904231" rel="dns-prefetch"/>
   <!-- no:cookie fast load the css.           -->
   <link crossorigin="" href="https://d2p3bygnnzw9w3.cloudfront.net" rel="preconnect"/>
   <link crossorigin="" href="https://d2cwpp38twqe55.cloudfront.net" rel="preconnect"/>
   <style>
   </style>
   <link as="style" crossorigin="" href="https://d2p3bygnnzw9w3.cloudfront.net/req/201906112/css/bbr/sr-min.css" onload="this.rel='stylesheet'" rel="preload"/>
   <noscript>
    <link href="https://d2p3bygnnzw9w3.cloudfront.net/req/201906112/css/bbr/sr-min.css" rel="stylesheet" type="text/css"/>
   </noscript>
   <link 

[]