In [1]:
import time
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Get MVP voting results and player stats for each year

In [2]:
# Seasons to scrape: 1990 through 2021 inclusive (range() excludes its stop value).
first_year, last_year = 1990, 2021
years = range(first_year, last_year + 1)

In [3]:
# Download the awards page for each season and cache the raw HTML on disk so
# the parsing step can be re-run without contacting the site again.

# open() does not create missing directories, so make sure the cache dir exists.
Path('mvp_data').mkdir(parents=True, exist_ok=True)

for year in years:
    url = f'https://www.basketball-reference.com/awards/awards_{year}.html'
    # A timeout keeps one stalled connection from hanging the whole cell.
    data = requests.get(url, timeout=30)
    # Fail on 4xx/5xx responses instead of silently caching an error page
    # that would only blow up later, during parsing.
    data.raise_for_status()

    # NOTE(review): the file is written with the platform's default text
    # encoding; whatever reads it back must use the same encoding.
    with open(f'mvp_data/{year}.html', 'w+') as f:
        f.write(data.text)

    # Space the requests out rather than sending them back to back.
    # NOTE(review): pick the delay to match the site's published rate limit.
    time.sleep(4)

In [4]:
# Parse the MVP voting table out of each cached awards page and collect one
# DataFrame per season in `tables`.
tables = []
for year in years:
    with open(f'mvp_data/{year}.html') as f:
        page = f.read()
    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first <tr class="over_header"> row before parsing
    # (presumably a grouping row above the real column names - confirm).
    # find() returns None when nothing matches, so guard the call.
    over_header = soup.find('tr', class_='over_header')
    if over_header is not None:
        over_header.decompose()

    # find() yields the element itself, so str() below is plain HTML rather
    # than the "[...]" repr of a find_all() result list.
    mvp_table = soup.find(id='mvp')
    if mvp_table is None:
        raise ValueError(f'No element with id="mvp" found in mvp_data/{year}.html')

    # read_html expects a path, URL or file-like object; passing literal HTML
    # as a string is deprecated, so wrap it in StringIO.
    table_df = pd.read_html(StringIO(str(mvp_table)))[0]
    table_df['Year'] = year  # tag every row with its season
    tables.append(table_df)

In [5]:
# Preview the first parsed season to sanity-check the columns.
tables[0].head(n=5)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Magic Johnson,30,LAL,27.0,636.0,920,0.691,79,37.2,...,6.6,11.5,1.7,0.4,0.48,0.384,0.89,16.5,0.27,1990
1,2,Charles Barkley,26,PHI,38.0,614.0,920,0.667,79,39.1,...,11.5,3.9,1.9,0.6,0.6,0.217,0.749,17.3,0.269,1990
2,3,Michael Jordan,26,CHI,21.0,564.0,920,0.613,82,39.0,...,6.9,6.3,2.8,0.7,0.526,0.376,0.848,19.0,0.285,1990
3,4,Karl Malone,26,UTA,2.0,214.0,920,0.233,82,38.1,...,11.1,2.8,1.5,0.6,0.562,0.372,0.762,15.9,0.245,1990
4,5,Patrick Ewing,27,NYK,1.0,162.0,920,0.176,82,38.6,...,10.9,2.2,1.0,4.0,0.551,0.25,0.775,13.5,0.205,1990


In [6]:
# Stack the per-season tables row-wise into a single frame. Row labels are
# not renumbered, so each season keeps the labels it had in its own table.
mvps = pd.concat(tables, axis=0)
mvps.head(n=5)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Magic Johnson,30,LAL,27.0,636.0,920,0.691,79,37.2,...,6.6,11.5,1.7,0.4,0.48,0.384,0.89,16.5,0.27,1990
1,2,Charles Barkley,26,PHI,38.0,614.0,920,0.667,79,39.1,...,11.5,3.9,1.9,0.6,0.6,0.217,0.749,17.3,0.269,1990
2,3,Michael Jordan,26,CHI,21.0,564.0,920,0.613,82,39.0,...,6.9,6.3,2.8,0.7,0.526,0.376,0.848,19.0,0.285,1990
3,4,Karl Malone,26,UTA,2.0,214.0,920,0.233,82,38.1,...,11.1,2.8,1.5,0.6,0.562,0.372,0.762,15.9,0.245,1990
4,5,Patrick Ewing,27,NYK,1.0,162.0,920,0.176,82,38.6,...,10.9,2.2,1.0,4.0,0.551,0.25,0.775,13.5,0.205,1990


In [7]:
# Persist the combined MVP voting table; the row index is written as the
# first (unlabeled) column of the CSV.
mvps.to_csv(path_or_buf='mvps.csv', index=True)

#### Get all player stats in order to predict next season's MVP

In [13]:
from selenium import webdriver
import time

In [17]:
# Launch a Chrome session driven by selenium.
# NOTE(review): the browser is never closed in this cell; call driver.quit()
# once no later cell needs the session.
driver = webdriver.Chrome()

# open() does not create missing directories, so make sure the cache dir exists.
Path('player_data').mkdir(parents=True, exist_ok=True)

# Load each season's per-game stats page in the browser and cache the
# resulting HTML on disk.
for year in years:
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    driver.get(url)
    # Scroll down the page, then wait before grabbing the source
    # (presumably so late-loading content gets into the DOM - TODO confirm).
    driver.execute_script('window.scrollTo(1, 10000)')
    time.sleep(2)
    html = driver.page_source
    # NOTE(review): the file is written with the platform's default text
    # encoding; whatever reads it back must use the same encoding.
    with open(f'player_data/{year}.html', 'w+') as f:
        f.write(html)

In [23]:
# Parse the per-game stats table out of each cached season page and collect
# one DataFrame per season in `tables` (rebinding the name to a fresh list).
tables = []
for year in years:
    with open(f'player_data/{year}.html') as f:
        page = f.read()
    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first <tr class="thead"> row before parsing. find() returns
    # None when nothing matches, so guard the call.
    # NOTE(review): find() only returns the first match; if the page contains
    # several such rows, the remaining ones are still parsed as data rows -
    # confirm whether they need removing too.
    header_row = soup.find('tr', class_='thead')
    if header_row is not None:
        header_row.decompose()

    # find() yields the element itself, so str() below is plain HTML rather
    # than the "[...]" repr of a find_all() result list.
    players_table = soup.find(id='per_game_stats')
    if players_table is None:
        raise ValueError(
            f'No element with id="per_game_stats" found in player_data/{year}.html'
        )

    # read_html expects a path, URL or file-like object; passing literal HTML
    # as a string is deprecated, so wrap it in StringIO.
    table_df = pd.read_html(StringIO(str(players_table)))[0]
    table_df['Year'] = year  # tag every row with its season
    tables.append(table_df)

In [27]:
# Stack the per-season tables row-wise into a single frame. Row labels are
# not renumbered, so each season keeps the labels it had in its own table.
players = pd.concat(tables, axis=0)
players.head(n=5)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Mark Acres,C,27,ORL,80,50,21.1,1.7,3.6,...,1.9,3.5,5.4,0.8,0.5,0.3,0.9,3.1,4.5,1990
1,2,Michael Adams,PG,27,DEN,79,74,34.1,5.0,12.5,...,0.6,2.2,2.8,6.3,1.5,0.0,1.8,1.7,15.5,1990
2,3,Mark Aguirre,SF,30,DET,78,40,25.7,5.6,11.5,...,1.5,2.4,3.9,1.9,0.4,0.2,1.6,2.6,14.1,1990
3,4,Danny Ainge,PG,30,SAC,75,68,36.4,6.7,15.4,...,0.9,3.4,4.3,6.0,1.5,0.2,2.5,3.2,17.9,1990
4,5,Mark Alarie,PF,26,WSB,82,10,23.1,4.5,9.6,...,1.8,2.7,4.6,1.7,0.7,0.5,1.2,2.7,10.5,1990


In [28]:
# Persist the combined per-game stats table; the row index is written as the
# first (unlabeled) column of the CSV.
players.to_csv(path_or_buf='players.csv', index=True)

#### Get win-loss record for each team