In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import time

# Getting the data

The data covers every season from 1980-81 through the most recently completed one and is used for the train and validation splits. Earlier seasons are excluded because, before 1980-81, the voting was done by the players.

In [3]:
from collections import defaultdict

In [4]:
def work_player_profile(param, season):
    """Scrape one player's per-game and advanced statistics for one season.

    Parameters
    ----------
    param : str
        Site-relative path of the player page; it is appended to
        "https://www.basketball-reference.com".
    season : str
        Ending year of the season as a four-digit string (e.g. "2018").
        It is matched against the row ids "per_game.<season>" and
        "advanced.<season>".

    Returns
    -------
    dict or None
        A dict with the keys 'fga', 'fg3a', 'fta', 'per', 'ts_pct',
        'usg_pct', 'bpm' (floats, NaN when the cell is missing or blank) and
        'season' (label such as "2017-18"). None is returned when the page
        has no advanced-stats row for ``season``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = "https://www.basketball-reference.com" + param
    res = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    res.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.text, "html.parser")

    def find_season_row(container, row_id):
        """Return the <tr> whose id equals row_id inside container, or None."""
        if container is None:
            return None
        candidate_rows = []
        # A table may be shipped inside an HTML comment; such a child is a
        # string node, which has to be parsed again to reach its rows.
        for child in container.children:
            if isinstance(child, str) and "table_outer_container" in child:
                candidate_rows.extend(BeautifulSoup(child, "html.parser").findAll("tr"))
        # Rows that are part of the live document tree.
        candidate_rows.extend(container.findAll("tr"))
        for row in candidate_rows:
            if row.attrs.get('id') == row_id:
                return row
        return None

    def read_stat(row, stat_name):
        """Return the numeric value of a data-stat cell; NaN if absent or blank."""
        cell = row.find('td', attrs={'data-stat': stat_name})
        text = cell.text.strip() if cell is not None else ""
        return float(text) if text else float("nan")

    # Pre-fill the per-game keys so every returned dict has the same key set,
    # even when the per-game row for this season is missing.
    data_dict = {'fga': float("nan"), 'fg3a': float("nan"), 'fta': float("nan")}

    per_game_row = find_season_row(soup.find(attrs={'id': 'all_per_game'}), "per_game." + season)
    if per_game_row is not None:
        data_dict['fga'] = read_stat(per_game_row, 'fga_per_g')
        data_dict['fg3a'] = read_stat(per_game_row, 'fg3a_per_g')
        data_dict['fta'] = read_stat(per_game_row, 'fta_per_g')

    advanced_row = find_season_row(soup.find(attrs={'id': 'all_advanced'}), "advanced." + season)
    if advanced_row is None:
        return None

    data_dict.update(
        {
            'per': read_stat(advanced_row, 'per'),
            'ts_pct': read_stat(advanced_row, 'ts_pct'),
            'usg_pct': read_stat(advanced_row, 'usg_pct'),
            'bpm': read_stat(advanced_row, 'bpm'),
            # "2018" -> "2017-18"
            'season': str(int(season)-1) + "-" + season[-2:],
        }
    )
    return data_dict
        

In [4]:
def get_stats_of_voting(url):
    """Scrape the voting table of one awards page into column lists.

    The first element with class "stats_table" on the page is read row by
    row. For every player row the linked player page is fetched through
    ``work_player_profile`` (after a one-second pause) and its values are
    stored alongside the table's own cells.

    Parameters
    ----------
    url : str
        Address of the awards page. The four characters before ".html" are
        taken as the season's ending year.

    Returns
    -------
    collections.defaultdict
        Maps a column name to a list with one entry per kept player row.
        'player' holds the cell text; every other table cell is stored as a
        float (blank cells become 0.0); 'age' and 'team_id' are skipped.
        Rows whose player profile cannot be resolved are left out entirely
        so that all lists keep the same length.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no element with class "stats_table".
    """
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    item = soup.find(attrs={'class': 'stats_table'})
    if item is None:
        raise ValueError(f"No stats table found at {url}")

    season = url.split(".html")[0][-4:]

    print(f"Current season: {season}")

    players_stats = defaultdict(list)

    for row in item.findAll("tr"):
        td_cells = row.findAll("td")
        if not td_cells:
            # Header rows carry only <th> cells.
            continue

        # Buffer the row and commit it only when it is complete; a partial
        # row would leave the column lists with different lengths.
        row_values = []
        profile_found = True

        for cell in td_cells:
            stat_name = cell.attrs.get('data-stat')
            if stat_name is None or stat_name in ('age', 'team_id'):
                continue
            if stat_name == 'player':
                link = cell.find("a")
                advanced_dict = None
                if link is not None:
                    # Pause between page requests.
                    time.sleep(1)
                    advanced_dict = work_player_profile(link['href'], season)
                if advanced_dict is None:
                    profile_found = False
                    break
                row_values.extend(advanced_dict.items())
                row_values.append((stat_name, cell.getText()))
            else:
                text = cell.getText() or "0"
                row_values.append((stat_name, float(text)))

        if not profile_found:
            print(f"Skipping a row in season {season}: no player profile data")
            continue

        for key, value in row_values:
            players_stats[key].append(value)
    return players_stats

In [36]:
# Award pages are addressed by the season's ending year (1981 is the 1980-81 season).
seasons = range(1981, 2019)

data_mvp = defaultdict(list)

for season in seasons:
    full_url = f"https://www.basketball-reference.com/awards/awards_{season}.html"
    cur_season_dict = get_stats_of_voting(full_url)
    # Append this season's columns to the running per-column lists.
    for key, values in cur_season_dict.items():
        data_mvp[key].extend(values)

data_frame = pd.DataFrame(data_mvp)
# index=False keeps the row index out of the file; otherwise it comes back as an
# extra "Unnamed: 0" column when the CSV is read again.
# NOTE(review): the cell that reloads this file reads
# "../../../Data/mvp_data/mvp_votings.csv" (one more "../") — confirm both paths
# refer to the same file.
data_frame.to_csv("../../Data/mvp_data/mvp_votings.csv", index=False)

Current season: 1981
Current season: 1982
Current season: 1983
Current season: 1984
Current season: 1985
Current season: 1986
Current season: 1987
Current season: 1988
Current season: 1989
Current season: 1990
Current season: 1991
Current season: 1992
Current season: 1993
Current season: 1994
Current season: 1995
Current season: 1996
Current season: 1997
Current season: 1998
Current season: 1999
Current season: 2000
Current season: 2001
Current season: 2002
Current season: 2003
Current season: 2004
Current season: 2005
Current season: 2006
Current season: 2007
Current season: 2008
Current season: 2009
Current season: 2010
Current season: 2011
Current season: 2012
Current season: 2013
Current season: 2014
Current season: 2015
Current season: 2016
Current season: 2017
Current season: 2018


In [3]:
# Load the voting table from the CSV file.
# NOTE(review): this path has one more "../" than the path the scraping cell
# writes to ("../../Data/mvp_data/mvp_votings.csv") — confirm both point at the same file.
data_frame = pd.read_csv("../../../Data/mvp_data/mvp_votings.csv")

# Defining the pipeline

In [4]:
from sklearn.preprocessing import PolynomialFeatures, normalize, StandardScaler, MinMaxScaler
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge, Lasso
from sklearn.svm import SVR

In [5]:
from sklearn.pipeline import Pipeline

In [6]:
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

In [7]:
import logging

In [8]:
def pipeline(data_frame, estimators, params, should_scale, should_poly, random_state=None):
    """Pick the best regressor configuration by leave-one-season-out validation.

    Every parameter set of every estimator is trained once per season on all
    other seasons and scored (MSE) on the held-out season. The configuration
    with the lowest mean MSE over all seasons wins.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a 'season' column, the 17 feature columns listed in
        ``train_columns`` below and the 'award_share' target column.
    estimators : iterable of classes
        Estimator classes (not instances); each is instantiated with
        ``estimator(**cur_params)``.
    params : dict
        Maps ``estimator.__name__`` to a list of keyword-argument dicts.
    should_scale : bool
        Apply min-max scaling, fitted on the training fold only.
    should_poly : bool
        Expand features to degree-2 polynomial terms before (optional) scaling.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``; None keeps the previous
        unseeded behaviour.

    Returns
    -------
    estimator instance or None
        A new, unfitted instance built from the best parameter set, or None
        when no configuration was evaluated.

    Side effects
    ------------
    Removes all handlers from the root logger and reconfigures it to write to
    'log_no_scale_poly_2_no_inter.txt' (overwritten on each call). Progress is
    also printed.
    """
    unique_seasons = data_frame.season.unique()

    train_columns = [
        'fga', 'fg3a', 'fta', 'fg_pct', 'fg3_pct', 'ft_pct', 'per', 'ts_pct', 'usg_pct', 'bpm',
        'mp_per_g', 'pts_per_g', 'trb_per_g', 'ast_per_g', 'stl_per_g', 'blk_per_g', 'ws_per_48'
    ]
    target_columns = ['award_share']

    # basicConfig is a no-op while the root logger still has handlers, so drop
    # the existing ones to make the file configuration take effect on re-runs.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename='log_no_scale_poly_2_no_inter.txt', filemode='w', level=logging.INFO)

    minimal_error, best_estimator = None, None

    for estimator in estimators:

        print(f"Starting with estimator: {estimator.__name__}")
        logging.info(f"Starting with estimator: {estimator.__name__}")

        for cur_params in params[estimator.__name__]:

            regressor = estimator(**cur_params)

            # To collect MSE over each split
            errors = []

            for season in unique_seasons:

                train_data = data_frame.loc[data_frame.season != season]
                validation_data = data_frame.loc[data_frame.season == season]

                # Get train data (target flattened to 1-D)
                train_x = train_data[train_columns].to_numpy()
                train_y = train_data[target_columns].to_numpy().ravel()

                # Validate over one season only
                val_x = validation_data[train_columns].to_numpy()
                val_y = validation_data[target_columns].to_numpy().ravel()

                if should_poly:
                    poly_fit = PolynomialFeatures(degree=2, interaction_only=False)
                    train_x = poly_fit.fit_transform(train_x)
                    # Reuse the transformer fitted on the training fold.
                    val_x = poly_fit.transform(val_x)

                if should_scale:
                    min_max_scaler = MinMaxScaler()
                    train_x = min_max_scaler.fit_transform(train_x)
                    # Scale the held-out season with the training fold's
                    # min/max; re-fitting here would put the two sets on
                    # different scales and leak validation statistics.
                    val_x = min_max_scaler.transform(val_x)

                shuffle_x, shuffle_y = shuffle(train_x, train_y, random_state=random_state)

                regressor.fit(shuffle_x, shuffle_y)
                predicted_y = regressor.predict(val_x)

                curr_error = mean_squared_error(y_true=val_y, y_pred=predicted_y)
                errors.append(curr_error)

            mean_error = np.average(errors)
            logging.info(f"Params: {cur_params}, MSE over all splits is: {mean_error:.4f}, with scale: {should_scale}")
            print(f"Params: {cur_params}, MSE over all splits is: {mean_error:.4f}, with scale: {should_scale}")

            if minimal_error is None or mean_error < minimal_error:
                minimal_error = mean_error
                # Keyword-unpack the dict; a single '*' would pass the dict's
                # keys (strings) as positional arguments.
                best_estimator = estimator(**cur_params)

    return best_estimator
        

In [9]:
def define_rbf_svr():
    """Return the RBF-kernel SVR grid: one kwargs dict per (C, gamma) pair.

    C varies slowest, gamma fastest.
    """
    c_values = [0.1, 1, 10, 50, 100]
    gamma_values = [0.00001, 0.0001, 0.001, 0.01, 0.1]
    return [
        {'kernel': 'rbf', 'C': c_value, 'gamma': gamma_value}
        for c_value in c_values
        for gamma_value in gamma_values
    ]

def define_poly_svr():
    """Return the polynomial-kernel SVR grid: one kwargs dict per (C, gamma, degree).

    C varies slowest, then gamma, with degree varying fastest.
    """
    c_values = [0.1, 1, 10, 100]
    gamma_values = [0.0001, 0.001, 0.01, 0.1]
    degrees = [1, 2, 3]
    return [
        {'kernel': 'poly', 'C': c_value, 'gamma': gamma_value, 'degree': degree}
        for c_value in c_values
        for gamma_value in gamma_values
        for degree in degrees
    ]

def define_linear_svr():
    """Return the linear-kernel SVR grid: one kwargs dict per C value."""
    return [{'kernel': 'linear', 'C': c_value} for c_value in [0.1, 1, 10, 100]]

In [359]:
    # Scratch cell: the two blocks below are plain string literals holding earlier
    # parameter layouts. Nothing in them is evaluated as configuration; the cell
    # only displays the last string.
    """
    LinearRegression.__name__: [
        {
            'n_jobs': -1,
        },
        {
            'n_jobs': -1,
            'normalize': True
        },
        {
            'n_jobs': -1,
        },
        {
            'n_jobs': -1,
            'normalize': True
        }
    ],
    """
    
    """
    LinearRegression.__name__: [
        True,
        True,
        False,
        False,
    ],
    """

'\nLinearRegression.__name__: [\n    True,\n    True,\n    False,\n    False,\n],\n'

In [386]:
# Estimator classes to search over, in evaluation order (SVR is switched off).
estimators = [LinearRegression, Ridge, Lasso]
# estimators.append(SVR)

# Keyword-argument grids, keyed by estimator class name.
# NOTE(review): the 'normalize' argument of LinearRegression is not accepted by
# recent scikit-learn releases — confirm against the installed version.
params = {
    LinearRegression.__name__: [
        {'n_jobs': -1},
        {'n_jobs': -1, 'normalize': True},
    ],
    Ridge.__name__: [
        {'alpha': alpha} for alpha in (1.0, 10.0, 50.0, 100.0, 200.0, 500.0)
    ],
    Lasso.__name__: [
        {'alpha': alpha} for alpha in (1.0, 10.0, 100.0)
    ],
    # Disabled SVR grid:
    # SVR.__name__: [
    #     *define_poly_svr(),
    #     *define_linear_svr(),
    #     *define_rbf_svr(),
    # ]
}

In [387]:
# Run the search on unscaled features with the polynomial expansion switched on.
best_estimator = pipeline(data_frame, estimators, params, should_scale=False, should_poly=True)

Starting with estimator: LinearRegression
Params: {'n_jobs': -1}, MSE over all splits is: 0.0649, with scale: False
Params: {'n_jobs': -1, 'normalize': True}, MSE over all splits is: 0.0359, with scale: False
Starting with estimator: Ridge
Params: {'alpha': 1.0}, MSE over all splits is: 0.0312, with scale: False
Params: {'alpha': 10.0}, MSE over all splits is: 0.0309, with scale: False
Params: {'alpha': 50.0}, MSE over all splits is: 0.0306, with scale: False
Params: {'alpha': 100.0}, MSE over all splits is: 0.0306, with scale: False
Params: {'alpha': 200.0}, MSE over all splits is: 0.0310, with scale: False
Params: {'alpha': 500.0}, MSE over all splits is: 0.0322, with scale: False
Starting with estimator: Lasso
Params: {'alpha': 1.0}, MSE over all splits is: 0.0395, with scale: False
Params: {'alpha': 10.0}, MSE over all splits is: 0.0462, with scale: False
Params: {'alpha': 100.0}, MSE over all splits is: 0.0693, with scale: False


## Results Analysis

Based on the results of a couple of simple regressors, the best one is the Ridge regressor with degree-2 polynomial features using all polynomial terms (not only interactions). Alpha is set to 50 or 100.

# Test set

In [49]:
# Feature columns of the historical voting table, in the order the model sees them.
train_columns = [
    'fga', 'fg3a', 'fta',
    'fg_pct', 'fg3_pct', 'ft_pct',
    'per', 'ts_pct', 'usg_pct', 'bpm',
    'mp_per_g', 'pts_per_g', 'trb_per_g', 'ast_per_g', 'stl_per_g', 'blk_per_g',
    'ws_per_48',
]
# Single regression target.
target_columns = ['award_share']

In [50]:
# Column names of the test tables, listed position-for-position against
# train_columns; the order matters because the model is fed plain arrays.
test_columns = [
    'FGA', '3PA', 'FTA',
    'FG_pct', '3P_pct', 'FT_pct',
    'PER', 'TS_pct', 'USG_pct', 'BPM',
    'MP_x', 'PTS', 'TRB', 'AST', 'STL', 'BLK',
    'WS_48',
]

In [51]:
# Load the per-game player table used for the test set (path is relative to the working directory).
per_g_df = pd.read_csv('player_per_g.csv')

In [52]:
# Load the advanced-statistics player table used for the test set (path is relative to the working directory).
advanced_df = pd.read_csv('advanced.csv')

In [53]:
# Join the two tables on player id and name.
# NOTE(review): 'MP_x' in test_columns presumably comes from the suffix pandas adds to
# columns present in both tables — confirm against the CSV headers.
merged = pd.merge(per_g_df, advanced_df, on=['PlayerId', 'Player'])

In [84]:
# Order players by PER, highest first.
sorted_merged = merged.sort_values(by='PER', ascending=False)

In [85]:
# Keep the first 50 rows of the PER-sorted table as the candidate pool.
top_players = sorted_merged[:50]

In [152]:
# Select the feature columns in test_columns order.
test_x = top_players[test_columns]

Using top model

In [153]:
# Whole historical table as arrays; the target is flattened to 1-D first so that
# the shuffled copy has the same shape as train_y.
train_x = data_frame[train_columns].to_numpy()
train_y = data_frame[target_columns].to_numpy().ravel()

# NOTE(review): the following cells fit on train_x / train_y, not on these
# shuffled copies — confirm whether the shuffle is still needed.
shuffle_x, shuffle_y = shuffle(train_x, train_y)

In [154]:
# alpha=100 is one of the two best Ridge settings in the validation output above;
# the polynomial expansion is degree 2 with squared terms kept (interaction_only=False).
ridge = Ridge(alpha=100)
poly_fit = PolynomialFeatures(degree=2, interaction_only=False)

In [155]:
# Fit the polynomial expansion on the training features and replace train_x with the result.
# NOTE: running this cell a second time expands the already-expanded matrix again.
train_x = poly_fit.fit_transform(train_x)

In [156]:
# Train the ridge model on the full historical table (expanded features, 1-D target).
ridge.fit(train_x, train_y)

Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [157]:
# Convert the selected feature frame to a NumPy array.
# NOTE: not re-runnable on its own — after the first run test_x is already an array.
test_x = test_x.to_numpy()

In [158]:
# Replace NaN entries with 0 (np.nan_to_num default) so the model only sees finite numbers.
test_x = np.nan_to_num(test_x)

In [159]:
# Expand the test features with the transformer that was fitted on the training
# data; transform() (not fit_transform()) keeps the test set from re-fitting it
# and checks that the feature count matches what the model was trained on.
test_x = poly_fit.transform(test_x)

In [160]:
# Predicted award share for every candidate row.
predict_y = ridge.predict(test_x)

In [161]:
# Row positions ordered from highest to lowest prediction (ascending argsort, reversed).
sorted_indices = np.argsort(predict_y)[::-1]

In [162]:
# Predictions in descending order, aligned element-for-element with sorted_indices.
predictions = predict_y[sorted_indices]

In [150]:
# Print a 1-based ranking, leaving out players whose predicted share is negative.
rank = 1
for player_position, predicted_share in zip(sorted_indices, predictions):
    if not predicted_share < 0:
        player_name = top_players.iloc[player_position].Player
        print(f"{rank}. {player_name}: {predicted_share}")
        rank += 1

1. Giannis Antetokounmpo: 0.8411406251629865
2. James Harden: 0.5289575080559985
3. Nikola Jokic: 0.37276804561100846
4. Rudy Gobert: 0.2273300234762307
5. Kawhi Leonard: 0.18433245210820604
6. Kyrie Irving: 0.16419885310873972
7. Kevin Durant: 0.11770066232962462
8. Clint Capela: 0.10063022152486134
9. Damian Lillard: 0.09804011426326209
10. Russell Westbrook: 0.08824654886066852
11. Nikola Vucevic: 0.06581035736472052
12. Ben Simmons: 0.05633753587527501
13. Chris Paul: 0.047065930067474615
14. Joel Embiid: 0.03218541730778879
