In [1]:
import time
import re
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

def _stat_as_float(row, stat_name):
    """Return the ``data-stat`` cell of ``row`` as a float, or NaN if it is missing or blank."""
    cell = row.find('td', attrs={'data-stat': stat_name})
    text = cell.text.strip() if cell is not None else ""
    return float(text) if text else float("nan")


def _find_season_row(rows, row_id):
    """Return the first row whose ``id`` attribute equals ``row_id``, or None."""
    for row in rows:
        if row.attrs.get('id') == row_id:
            return row
    return None


def work_player_profile(param, season):
    """Scrape one season of per-game and advanced stats from a player page.

    Parameters
    ----------
    param : str
        Site-relative path of the player page; it is appended to
        "https://www.basketball-reference.com".
    season : str
        Ending year of the season as a string (e.g. "2019"). It is used to
        match the row ids "per_game.<season>" and "advanced.<season>".

    Returns
    -------
    dict
        Keys ``fga``, ``fg3a``, ``fta``, ``per``, ``ts_pct``, ``usg_pct``,
        ``bpm`` (floats) and ``season`` (label such as "2018-19"). A stat
        whose table, row or cell is missing or blank is NaN, so every call
        returns the same set of keys and callers that append per key keep
        their columns aligned.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    """
    url = "https://www.basketball-reference.com" + param
    # A timeout keeps one stalled request from hanging the whole scrape.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    # Naming the parser avoids bs4's guessed-parser warning and keeps the
    # parse tree the same on machines with different parsers installed.
    soup = BeautifulSoup(res.text, "html.parser")

    per_game_stats = {'fga': 'fga_per_g', 'fg3a': 'fg3a_per_g', 'fta': 'fta_per_g'}
    advanced_stats = ('per', 'ts_pct', 'usg_pct', 'bpm')

    # Pre-fill every key so a missing table/row never changes the dict's shape.
    data_dict = {key: float("nan") for key in (*per_game_stats, *advanced_stats)}
    data_dict['season'] = str(int(season) - 1) + "-" + season[-2:]

    per_game = soup.find(attrs={'id': 'all_per_game'})
    if per_game is not None:
        row = _find_season_row(per_game.findAll("tr"), "per_game." + season)
        if row is not None:
            for key, stat_name in per_game_stats.items():
                data_dict[key] = _stat_as_float(row, stat_name)

    advanced_table = soup.find(attrs={'id': 'all_advanced'})
    rows = []
    if advanced_table is not None:
        for child in advanced_table.children:
            # The table markup is looked for inside a *string* child of the
            # wrapper (presumably an HTML comment on this site) and re-parsed.
            if isinstance(child, str) and "table_outer_container" in child:
                rows = BeautifulSoup(child, "html.parser").findAll("tr")
        if not rows:
            # No embedded markup found: fall back to rows present in the DOM.
            rows = advanced_table.findAll("tr")

    row = _find_season_row(rows, "advanced." + season)
    if row is not None:
        for stat_name in advanced_stats:
            data_dict[stat_name] = _stat_as_float(row, stat_name)

    return data_dict

def _team_win_pct(team_href, default=0.5):
    """Fetch a team page and return wins / (wins + losses) from its "Record" paragraph.

    ``default`` is returned when no paragraph containing "Record" holds a
    "<wins>-<losses>" pair, or when the pair sums to zero games.
    """
    time.sleep(1)  # pause before each request, as the scrape does elsewhere
    page = requests.get("https://www.basketball-reference.com" + team_href, timeout=30)
    page.raise_for_status()
    team_soup = BeautifulSoup(page.text, "html.parser")

    for paragraph in team_soup.findAll("p"):
        if "Record" not in paragraph.text:
            continue
        match = re.search(r"\d+-\d+", paragraph.text)
        if match is None:
            continue
        wins, losses = (float(part) for part in match.group().split("-"))
        games = wins + losses
        return wins / games if games else default
    return default


def get_stats_of_voting(url, verbose=True):
    """Scrape the first stats table of an awards page into column-oriented lists.

    For every data row, each ``<td>`` carrying a ``data-stat`` attribute is
    handled by its stat name:

    * ``age`` is skipped;
    * ``team_id`` appends exactly one value to ``win_pct`` - the team's win
      fraction, or 0.5 when the cell has no team link or no record is found;
    * ``player`` appends the player's name plus every key returned by
      ``work_player_profile`` for that player and season;
    * anything else is appended as a float (blank text counts as 0).

    Parameters
    ----------
    url : str
        Awards page URL; the four characters before ".html" are taken as the
        season and passed on to ``work_player_profile``.
    verbose : bool, optional
        Print the season and per-row progress lines (default True).

    Returns
    -------
    collections.defaultdict
        Maps each column name to the list of values collected for it.

    Raises
    ------
    requests.HTTPError
        If the awards page or a team page returns an error status.
    ValueError
        If the page has no stats table, or no profile stats come back for a player.
    """
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    table = soup.find(attrs={'class': 'stats_table'})
    if table is None:
        raise ValueError(f"No stats table found at {url}")
    rows = table.findAll("tr")

    season = url.split(".html")[0][-4:]

    if verbose:
        print(f"Current season: {season}")

    players_stats = defaultdict(list)

    for index, row in enumerate(rows):
        if verbose:
            print(f"\tCurrent index: {index} of {len(rows)}")

        # Header-only rows have no <td> cells, so this loop simply does not run.
        for cell in row.findAll("td"):
            stat = cell.attrs.get('data-stat')
            if stat is None or stat == 'age':
                continue

            if stat == 'team_id':
                link = cell.find("a")
                if link is None or not link.get('href'):
                    players_stats['win_pct'].append(0.5)  # average
                else:
                    players_stats['win_pct'].append(_team_win_pct(link['href']))
            elif stat == 'player':
                time.sleep(1)
                advanced_dict = work_player_profile(cell.find("a")['href'], season)
                if advanced_dict is None:
                    # Fail loudly: skipping the row would misalign the columns.
                    raise ValueError(
                        f"No profile stats for {cell.getText()} in season {season}"
                    )
                for key, value in advanced_dict.items():
                    players_stats[key].append(value)
                players_stats[stat].append(cell.getText())
            else:
                text = cell.getText() or "0"
                players_stats[stat].append(float(text))
    return players_stats
    
# Scrape the awards page of every season from 1981 through 2019 and
# concatenate the per-season columns into one dict of lists.
seasons = range(1981, 2020)
mvp_data = defaultdict(list)

for season in seasons:
    full_url = f"https://www.basketball-reference.com/awards/awards_{season}.html"
    cur_season_dict = get_stats_of_voting(full_url)
    # In-place list concatenation: same effect as extending each column.
    for key in cur_season_dict:
        mvp_data[key] += cur_season_dict[key]

Current season: 1981
	Current index: 0 of 33
	Current index: 1 of 33
	Current index: 2 of 33
	Current index: 3 of 33
	Current index: 4 of 33
	Current index: 5 of 33
	Current index: 6 of 33
	Current index: 7 of 33
	Current index: 8 of 33
	Current index: 9 of 33
	Current index: 10 of 33
	Current index: 11 of 33
	Current index: 12 of 33
	Current index: 13 of 33
	Current index: 14 of 33
	Current index: 15 of 33
	Current index: 16 of 33
	Current index: 17 of 33
	Current index: 18 of 33
	Current index: 19 of 33
	Current index: 20 of 33
	Current index: 21 of 33
	Current index: 22 of 33
	Current index: 23 of 33
	Current index: 24 of 33
	Current index: 25 of 33
	Current index: 26 of 33
	Current index: 27 of 33
	Current index: 28 of 33
	Current index: 29 of 33
	Current index: 30 of 33
	Current index: 31 of 33
	Current index: 32 of 33
Current season: 1982
	Current index: 0 of 27
	Current index: 1 of 27
	Current index: 2 of 27
	Current index: 3 of 27
	Current index: 4 of 27
	Current index: 5 of 27

	Current index: 6 of 19
	Current index: 7 of 19
	Current index: 8 of 19
	Current index: 9 of 19
	Current index: 10 of 19
	Current index: 11 of 19
	Current index: 12 of 19
	Current index: 13 of 19
	Current index: 14 of 19
	Current index: 15 of 19
	Current index: 16 of 19
	Current index: 17 of 19
	Current index: 18 of 19
Current season: 1997
	Current index: 0 of 22
	Current index: 1 of 22
	Current index: 2 of 22
	Current index: 3 of 22
	Current index: 4 of 22
	Current index: 5 of 22
	Current index: 6 of 22
	Current index: 7 of 22
	Current index: 8 of 22
	Current index: 9 of 22
	Current index: 10 of 22
	Current index: 11 of 22
	Current index: 12 of 22
	Current index: 13 of 22
	Current index: 14 of 22
	Current index: 15 of 22
	Current index: 16 of 22
	Current index: 17 of 22
	Current index: 18 of 22
	Current index: 19 of 22
	Current index: 20 of 22
	Current index: 21 of 22
Current season: 1998
	Current index: 0 of 21
	Current index: 1 of 21
	Current index: 2 of 21
	Current index: 3 of 21
	

	Current index: 3 of 19
	Current index: 4 of 19
	Current index: 5 of 19
	Current index: 6 of 19
	Current index: 7 of 19
	Current index: 8 of 19
	Current index: 9 of 19
	Current index: 10 of 19
	Current index: 11 of 19
	Current index: 12 of 19
	Current index: 13 of 19
	Current index: 14 of 19
	Current index: 15 of 19
	Current index: 16 of 19
	Current index: 17 of 19
	Current index: 18 of 19
Current season: 2015
	Current index: 0 of 14
	Current index: 1 of 14
	Current index: 2 of 14
	Current index: 3 of 14
	Current index: 4 of 14
	Current index: 5 of 14
	Current index: 6 of 14
	Current index: 7 of 14
	Current index: 8 of 14
	Current index: 9 of 14
	Current index: 10 of 14
	Current index: 11 of 14
	Current index: 12 of 14
	Current index: 13 of 14
Current season: 2016
	Current index: 0 of 12
	Current index: 1 of 12
	Current index: 2 of 12
	Current index: 3 of 12
	Current index: 4 of 12
	Current index: 5 of 12
	Current index: 6 of 12
	Current index: 7 of 12
	Current index: 8 of 12
	Current 

NameError: name 'data_mvp' is not defined

In [18]:
# Scrape only the 2019 awards page (range(2019, 2020) yields just 2019)
# into its own dict of lists.
# NOTE(review): this repeats the scraping loop of the first cell with a
# different range and target name; a shared helper would avoid the copy.
seasons = range(2019, 2020)
new_data2 = defaultdict(list)

for season in seasons:
    full_url = f"https://www.basketball-reference.com/awards/awards_{season}.html"
    cur_season_dict = get_stats_of_voting(full_url)
    for key in cur_season_dict:
        new_data2[key] += cur_season_dict[key]

Current season: 2019
	Current index: 0 of 14
	Current index: 1 of 14
	Current index: 2 of 14
	Current index: 3 of 14
	Current index: 4 of 14
	Current index: 5 of 14
	Current index: 6 of 14
	Current index: 7 of 14
	Current index: 8 of 14
	Current index: 9 of 14
	Current index: 10 of 14
	Current index: 11 of 14
	Current index: 12 of 14
	Current index: 13 of 14


In [19]:
# Append the 2019 columns collected in `new_data2` onto the matching columns of `new_data`.
# NOTE(review): `new_data` is not defined in any cell visible here (the first cell
# builds `mvp_data`) - presumably it comes from a since-removed cell; confirm
# before relying on a fresh-kernel run.
# NOTE(review): re-running this cell extends `new_data` again and so appends the
# same rows a second time.
for key in new_data2:
    new_data[key].extend(new_data2[key])

In [24]:
# Build the DataFrame from `new_data`, one column per key.
# NOTE(review): assumes every list in `new_data` has the same length - TODO confirm;
# `new_data` itself is not defined in the cells visible here.
nba = pd.DataFrame(new_data)

In [25]:
# Save the assembled table to mvp_data.csv (path is relative to the working directory).
# NOTE(review): with default arguments pandas also writes the row index as the
# first column - confirm that downstream readers expect it.
nba.to_csv('mvp_data.csv')

In [26]:
# Display the distinct season labels present in the table.
nba["season"].unique()

array(['1980-81', '1981-82', '1982-83', '1983-84', '1984-85', '1985-86',
       '1986-87', '1987-88', '1988-89', '1989-90', '1990-91', '1991-92',
       '1992-93', '1993-94', '1994-95', '1995-96', '1996-97', '1997-98',
       '1998-99', '1999-00', '2000-01', '2001-02', '2002-03', '2003-04',
       '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10',
       '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16',
       '2016-17', '2017-18', '2018-19'], dtype=object)

In [27]:
# Print the column dtypes and non-null counts of the assembled frame.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 26 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   fga          649 non-null    float64
 1   fg3a         649 non-null    float64
 2   fta          649 non-null    float64
 3   per          649 non-null    float64
 4   ts_pct       649 non-null    float64
 5   usg_pct      649 non-null    float64
 6   bpm          649 non-null    float64
 7   season       649 non-null    object 
 8   player       649 non-null    object 
 9   win_pct      649 non-null    float64
 10  votes_first  649 non-null    float64
 11  points_won   649 non-null    float64
 12  points_max   649 non-null    float64
 13  award_share  649 non-null    float64
 14  g            649 non-null    float64
 15  mp_per_g     649 non-null    float64
 16  pts_per_g    649 non-null    float64
 17  trb_per_g    649 non-null    float64
 18  ast_per_g    649 non-null    float64
 19  stl_per_