In [1]:
# Scrape odds from odds portal
import time
import requests
import pandas as pd
from datetime import date
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
# chrome_options.add_argument('--proxy-server=140.227.211.47:8080')
chrome_options.headless = True # also works
PATH = '/home/dev/Desktop/Projects/AI/chromedriver'
driver = webdriver.Chrome(PATH, options=chrome_options)

In [2]:
# Scraping Functions

east = 'divs_standings_E'
west = 'divs_standings_W'
expanded_stats = 'expanded_standings'

def scrape_table(table_id, driver=driver):
    print(driver)
    table = driver.find_element_by_tag_name(f'table#{table_id}')
    cols = []
    header = table.find_element_by_tag_name('thead')
    header_row = header.find_elements_by_tag_name('tr')[-1]
    header_cols = header_row.find_elements_by_tag_name('th')
    cols = [i.text.replace('Eastern Conference', 'Team').replace('Western Conference', 'Team') for i in header_cols]

    # Create DF
    df = pd.DataFrame(columns = cols)

    # Get row values
    body = table.find_element_by_tag_name('tbody')
    rows = body.find_elements_by_tag_name('tr')
    for row in rows:
        if 'Division' in row.text:
            continue
        
        # Find each team name
        team = row.find_element_by_tag_name('th').text
        try:
            team_name = row.find_element_by_tag_name('a').text.replace('*', '')
        except:
            team_name = 'LgAvg'
        
        # Find each teams stats
        stats = row.find_elements_by_tag_name('td')
        
        # If scraping expanded_standings or ratings we must reverse the team and rank names
        if (table_id == 'expanded_standings' or table_id == 'ratings') and len(stats) > 2:
            team_name = stats[0].text
            stats.pop(0)
            rank = team
            
        # Team name changes
        if team_name == 'New Orleans Hornets':
            team_name = 'New Orleans Pelicans'
        if team_name == 'Charlotte Bobcats':
            team_name = 'Charlotte Hornets'
        if team_name == 'New Jersey Nets':
            team_name = 'Brooklyn Nets'
        
        # Initialize the table row array
        stat_list = [team_name]
        
        # Clean up formatting in these two tables
        if (table_id == 'expanded_standings' or table_id == 'ratings') and len(stats) > 2:
            stat_list.append(rank)
            stat_list.append(team_name)
            stat_list.pop(0)
            
        for stat in stats:
            stat_list.append(stat.text)

        # If nothing is in the row, skip to next row
        if len(stat_list) <= 1:
            continue
        df.loc[len(df)] = stat_list
    
    if 'Overall' in cols:
        df.drop('Overall', axis=1, inplace=True)
    
    df.set_index('Team', inplace=True)
    
    return df

# Clean merged table
# go through every col with '-' and split it
def clean_df(x):
    x = x.astype(str)
    x = x.replace([''], '0')
    cols_to_delim = []
    for col in x.columns:
        result = x[col].str.contains(pat='\d-\d')
        if result.any():
            cols_to_delim.append(col)

    for col in cols_to_delim:
            x[[col + '1', col + '2']] = x[col].str.split('-', expand=True)
            del x[col]

    x = x.astype(float)
    return x

In [3]:
years = [i for i in range(2013, 2022)]
for year in years:
    print('Scraping ', year)
    print(driver)
    standings_url = f'https://www.basketball-reference.com/leagues/NBA_{year}_standings.html'
    
    # Driver
#     chrome_options.headless = True # also works
#     PATH = '/home/dev/Desktop/Projects/AI/chromedriver'
#     driver = webdriver.Chrome(PATH, options=chrome_options)
    driver.get(standings_url)
    
    expanded_stats_df = scrape_table(expanded_stats)
    expanded_stats_df = clean_df(expanded_stats_df)
    westen_df = scrape_table(west)
    eastern_df = scrape_table(east)

    # Concat tables
    merged = pd.concat([eastern_df, westen_df]).merge(expanded_stats_df, on='Team')

    # Scrape ratings
    # Open ratings url
    ratings_url = f'https://www.basketball-reference.com/leagues/NBA_{year}_ratings.html'
    driver.get(ratings_url)

    #Scrape ratings table
    ratings = scrape_table('ratings')
    ratings.drop(['Rk', 'Conf', 'Div', 'W', 'L', 'W/L%'], axis=1, inplace=True)

    # Merge ratings and stats
    merged = merged.merge(ratings, on='Team')
    merged.to_csv(f'./Stats/NBA {year} Stats.csv')
    driver.delete_all_cookies()

Scraping  2013
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
Scraping  2014
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
<selenium.webdriver.chrome.webdriver.WebDriver (session="46f1002a7a8434523e0bd045f1ef4787")>
Scraping  2015
<selenium.webdriver.chrom

In [16]:
def scrape_year(year):
    
    print('Scraping: ', year)
    chrome_options = Options()
    chrome_options.headless = True # also works
    driver = webdriver.Chrome(PATH, options=chrome_options)
    # Open stats webpage
    stats_url = f'https://www.basketball-reference.com/leagues/NBA_{year}_standings.html'
    driver.get(stats_url)
    time.sleep(5)

    # Scrape stats tables
    eastern_conf = scrape_table(east)
    western_conf = scrape_table(west)
    expanded_stats_df = scrape_table(expanded_stats)
    expanded_stats_df = clean_df(expanded_stats_df)
    

    # Merge stats tables together
    merged = pd.concat([eastern_conf, western_conf]).merge(expanded_stats_df, on='Team')

    # Open ratings url
    ratings_url = f'https://www.basketball-reference.com/leagues/NBA_{year}_ratings.html'
    driver.get(ratings_url)

    #Scrape ratings table
    ratings = scrape_table('ratings')
    ratings.drop(['Rk', 'Conf', 'Div', 'W', 'L', 'W/L%'], axis=1, inplace=True)

    # Merge ratings and stats
    merged = merged.merge(ratings, on='Team')
    merged.to_csv(f'Stats/NBA {year} stats.csv', index=False)
    
    print('Stats tables')
    print(eastern_conf)
    print(western_conf)
    print(expanded_stats_df)
    
    print('Ratings')
    print(ratings)
    
    print('Merged')
    print(merged)
    
    driver.close()
    return merged
    

In [17]:
scrape_year(2007)

Scraping:  2007


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"div#divs_standings_E"}
  (Session info: headless chrome=92.0.4515.131)


In [4]:
years = [i for i in range(2007, 2022)]

In [10]:
# Scrape years
for year in years:
    scrape_year(year)


Scraping:  2007
Scraping:  2008
Scraping:  2009
Scraping:  2010
Scraping:  2011
Scraping:  2012
Scraping:  2013
Scraping:  2014
Scraping:  2015
Scraping:  2016
Scraping:  2017
Scraping:  2018
Scraping:  2019
Scraping:  2020
Scraping:  2021
