# Workbook

In [49]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [93]:
# Load the data
data = pd.read_csv("./data/players_stats.csv")

# drop Player and Link columns
data = data.drop(columns=["mp", "g", "pts", "trb", "ast"])

# drop colinear columns
data = data.drop(columns=["Player", "Link"])

# data = data[["Salary", "Age", "ft_pct", "def_rtg", "plus_minus_per_200_poss", "mp_per_g"]]

# remove Nan values
data = data.dropna()

# "Salary" is the target variable
Y = data["Salary"]
X = data.drop(columns=["Salary"])

In [94]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 273 entries, 0 to 385
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      273 non-null    int64  
 1   Salary                   273 non-null    float64
 2   Height_cm                273 non-null    float64
 3   Weight_kg                273 non-null    float64
 4   gs                       273 non-null    float64
 5   fg                       273 non-null    float64
 6   fga                      273 non-null    float64
 7   fg3                      273 non-null    float64
 8   fg3a                     273 non-null    float64
 9   ft                       273 non-null    float64
 10  fta                      273 non-null    float64
 11  orb                      273 non-null    float64
 12  stl                      273 non-null    float64
 13  blk                      273 non-null    float64
 14  tov                      273 no

In [86]:
# create a test train split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [87]:
# fit a linear model
model = LinearRegression()
model.fit(X_train, Y_train)
# evaluate the model
score = model.score(X_test, Y_test)
print("R^2 score:", score)

R^2 score: 0.4141215298804247


In [88]:
# fit a linear model
model = Ridge()
model.fit(X_train, Y_train)
# evaluate the model
score = model.score(X_test, Y_test)
print("R^2 score:", score)

R^2 score: 0.4152294509066893


In [89]:
model = BayesianRidge()
model.fit(X_train, Y_train)
# evaluate the model
score = model.score(X_test, Y_test)
print("R^2 score:", score)

R^2 score: -0.027368766195444483


In [90]:
reg = GradientBoostingRegressor()
reg.fit(X_train, Y_train)
reg.score(X_test, Y_test)

0.6482571225777825

In [91]:
# Access feature importances
importances = reg.feature_importances_

print(len(importances))
# Print feature importance for each feature
for i, importance in enumerate(importances):
    if importance > 0.05:
        print(f"Feature {i}: {importance}")

5
Feature 0: 0.08679792155143644
Feature 1: 0.0764774288196061
Feature 3: 0.1037772286022248
Feature 4: 0.7072527944136429


In [92]:
from sklearn.inspection import permutation_importance

# Assuming you have the model and data
results = permutation_importance(reg, X, Y, n_repeats=100, random_state=42)

results_importances = results.importances_mean

# Display the importance of each feature
# Print feature importance for each feature
for i, importance in enumerate(results_importances):
    if importance > 0.03:
        print(f"Feature {i}: {importance}")

Feature 0: 0.12804227571138224
Feature 1: 0.07970245149111553
Feature 2: 0.04060114153669128
Feature 3: 0.10923952003886321
Feature 4: 1.2078911501478118
