In [1]:
# All the packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from operator import itemgetter
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
def _fetch_stat_table(url):
    """Download one stats page and return ``(headers, rows)`` as text.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    headers : list of str
        Text of the ``<th>`` cells of the first ``<tr>`` on the page, with the
        first (ranking) column removed.
    rows : list of list of str
        Text of the ``<td>`` cells of every later ``<tr>``. A ``<tr>`` that
        holds no ``<td>`` cells yields an empty list.

    Raises
    ------
    IndexError
        If the page contains no ``<tr>`` element at all.
    """
    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(url) as response:
        # Name the parser explicitly so the parse does not depend on which
        # optional parsers happen to be installed (and bs4 does not warn).
        soup = BeautifulSoup(response, "html.parser")

    table_rows = soup.find_all('tr')
    # Exclude the first column to remove the ranking of the players.
    headers = [th.get_text() for th in table_rows[0].find_all('th')][1:]
    # Skip the first header row; keep only the <td> text of the others.
    rows = [[td.get_text() for td in tr.find_all('td')] for tr in table_rows[1:]]
    return headers, rows


def _build_stats_frame(rows, headers, numeric_cols):
    """Turn scraped rows into a cleaned DataFrame.

    Rows whose ``Player`` value is missing are dropped, ``numeric_cols`` are
    converted to float (unparseable text becomes NaN) and every remaining NaN /
    None in the frame is replaced by 0.
    """
    frame = pd.DataFrame(rows, columns=headers)
    # Rows that produced no <td> cells come out with a null Player; drop them.
    # .copy() makes the result an independent frame so the column assignment
    # below never writes into a slice of the unfiltered frame.
    frame = frame[frame['Player'].notnull()].copy()
    # Convert column-wise (one vectorised call per column, not one per row);
    # astype(float) keeps every converted column float, as the comment in the
    # original code ("Convert string/object to float") intended.
    frame[numeric_cols] = (
        frame[numeric_cols].apply(pd.to_numeric, errors='coerce').astype(float)
    )
    # Replacing na values with 0
    return frame.fillna(0)


def season_stats(year):
    """Scrape one NBA season from basketball-reference.com and save it as CSV.

    Downloads the "advanced" and "per game" league pages for ``year``, cleans
    both tables, outer-merges them on ``['Player', 'Tm']`` and writes the result
    to ``season_<year>_stats.csv`` in the current working directory.

    Columns present in both tables other than the merge keys (for example
    ``Age`` and ``G``) receive pandas' default ``_x`` (per game) and ``_y``
    (advanced) suffixes in the merged output.

    Parameters
    ----------
    year : int or str
        Season identifier inserted into the page URL and the CSV file name.

    Returns
    -------
    pandas.DataFrame
        The merged per-game + advanced stats that were written to disk.
    """
    page_url = "https://www.basketball-reference.com/leagues/NBA_{}_{}.html"

    headers_adv, player_stats_adv = _fetch_stat_table(page_url.format(year, "advanced"))
    headers_pg, player_stats_pg = _fetch_stat_table(page_url.format(year, "per_game"))

    # Year's player per game stats
    pg_numeric = ['Age', 'G', 'GS', 'MP', 'FG', '3P', 'FT%', 'FG%', '3P%',
                  'AST', 'STL', 'BLK', 'TRB', 'PTS', 'eFG%']
    stats_year_pg = _build_stats_frame(player_stats_pg, headers_pg, pg_numeric)

    # Year's player advanced stats
    adv_numeric = ['Age', 'G', 'TRB%', 'AST%', 'STL%', 'BLK%', 'PER', 'TS%',
                   'WS', 'USG%', 'BPM', 'VORP']
    stats_year_adv = _build_stats_frame(player_stats_adv, headers_adv, adv_numeric)

    # Merge both per game and advanced stats
    merged_year_stats = pd.merge(stats_year_pg, stats_year_adv,
                                 on=['Player', 'Tm'], how='outer')

    # Save the dataset
    merged_year_stats.to_csv("season_{}_stats.csv".format(year), index=False)

    return merged_year_stats
    


In [3]:
# Seasons to scrape: 2015 through 2019 (the upper bound of range is exclusive)
years = [*range(2015, 2020)]
years

[2015, 2016, 2017, 2018, 2019]

In [4]:
# Scrape every season listed in `years`. Each call downloads that season's
# advanced and per-game pages and writes season_<year>_stats.csv.
for year in years:
    season_stats(year)