# Workbook

In [269]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [270]:
# Load the data
data = pd.read_csv("./data/players_stats.csv")

# drop colinear columns
data = data.drop(columns=["Player", "Link"])
data = data.drop(columns=["mp_per_g", "pts_per_g", "trb_per_g", "ast_per_g", "g", "gs"])
data = data.drop(columns=["fta", "fga", "fg3a", "ft", "fg", "fg3", "fg_pct", "fg3_pct", "ft_pct"])
# data = data.drop(columns=["pf", "Weight_kg"])
data = data.drop(columns=["usg_pct", "orb", "off_rtg"])

# List of columns to normalize (Salary and columns from 7 to 20)
columns_to_normalize = ['Salary'] + data.columns[5:19].tolist()

print(columns_to_normalize)

# Divide the selected columns by the 'mp' column
data[columns_to_normalize] = data[columns_to_normalize].div(data['mp'], axis=0)
data = data.drop(columns=["mp"])


# features produced cutting edge R2
# data = data[["Salary", "Age", "ft_pct", "def_rtg", "plus_minus_per_200_poss", "mp_per_g"]]

# remove Nan values
data = data.dropna()

Y = data["Salary"]
X = data.drop(columns=["Salary"])
print(X.info())

['Salary', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'ts_pct', 'def_rtg', 'plus_minus_per_200_poss']
<class 'pandas.core.frame.DataFrame'>
Index: 331 entries, 0 to 385
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      331 non-null    int64  
 1   Height_cm                331 non-null    float64
 2   Weight_kg                331 non-null    float64
 3   trb                      331 non-null    float64
 4   ast                      331 non-null    float64
 5   stl                      331 non-null    float64
 6   blk                      331 non-null    float64
 7   tov                      331 non-null    float64
 8   pf                       331 non-null    float64
 9   pts                      331 non-null    float64
 10  ts_pct                   331 non-null    float64
 11  def_rtg                  331 non-null    float64
 12  plus_minus_per_200_poss  331 non

In [271]:
X.corr()

Unnamed: 0,Age,Height_cm,Weight_kg,trb,ast,stl,blk,tov,pf,pts,ts_pct,def_rtg,plus_minus_per_200_poss
Age,1.0,-0.115332,0.163446,-0.040534,0.173549,-0.075959,-0.086076,-0.061305,-0.129878,0.013011,-0.188052,-0.204963,0.071808
Height_cm,-0.115332,1.0,0.69382,0.623459,-0.321598,-0.063503,0.553111,0.051163,0.375415,0.075879,0.114603,0.085675,-0.01544
Weight_kg,0.163446,0.69382,1.0,0.551518,-0.217799,-0.085864,0.401284,0.080309,0.293116,0.098129,0.014779,-0.015771,0.024247
trb,-0.040534,0.623459,0.551518,1.0,0.041434,0.307616,0.759444,0.335075,0.682791,0.250734,0.291535,0.201583,0.008812
ast,0.173549,-0.321598,-0.217799,0.041434,1.0,0.233533,-0.073005,0.673852,-0.040423,0.403555,0.011296,-0.028002,0.021773
stl,-0.075959,-0.063503,-0.085864,0.307616,0.233533,1.0,0.310755,0.269736,0.384696,0.07117,0.398567,0.364603,-0.041194
blk,-0.086076,0.553111,0.401284,0.759444,-0.073005,0.310755,1.0,0.238864,0.646826,0.17724,0.242824,0.17722,-0.090349
tov,-0.061305,0.051163,0.080309,0.335075,0.673852,0.269736,0.238864,1.0,0.252339,0.581577,0.09695,0.083718,-0.147299
pf,-0.129878,0.375415,0.293116,0.682791,-0.040423,0.384696,0.646826,0.252339,1.0,0.019882,0.414165,0.361733,-0.107226
pts,0.013011,0.075879,0.098129,0.250734,0.403555,0.07117,0.17724,0.581577,0.019882,1.0,-0.030072,-0.104514,0.084072


In [281]:
correlation_matrix = X.corr()

mask = correlation_matrix.abs() > 0.5

correlation_pairs = correlation_matrix[mask].stack()

# Rename the columns for clarity
correlation_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']

# Remove self-correlations (pairs with the same feature)
# correlation_pairs = correlation_pairs[correlation_pairs['Feature_1'] != correlation_pairs['Feature_2']]

# Optionally sort the pairs by correlation value
# correlation_pairs = correlation_pairs.sort_values(by='Correlation', ascending=False)

correlation_pairs

KeyError: 'Feature_1'

In [272]:
data.head()

Unnamed: 0,Age,Salary,Height_cm,Weight_kg,trb,ast,stl,blk,tov,pf,pts,ts_pct,def_rtg,plus_minus_per_200_poss
0,35,18337.162666,211.0,108.0,0.177356,0.135435,0.024722,0.032605,0.087424,0.047653,0.728054,0.000224,0.041204,0.001827
1,31,28411.958121,193.0,93.0,0.131296,0.149972,0.029428,0.01528,0.074137,0.071873,0.545557,0.000344,0.06678,0.001698
2,27,20108.622803,198.0,93.0,0.125868,0.192889,0.024928,0.011443,0.072742,0.084185,0.75235,0.00025,0.048631,0.002575
3,30,8722.329163,213.0,131.0,0.402791,0.144851,0.03898,0.038499,0.084216,0.122233,0.398941,0.000269,0.052454,0.004572
4,28,6217.668126,193.0,89.0,0.11739,0.09033,0.027457,0.017907,0.037803,0.062475,0.403502,0.00027,0.046956,0.002029


In [273]:
# create a test train split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)


# # "Salary" is the target variable
scaler = StandardScaler().set_output(transform="pandas")

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [274]:
# fit a linear model
model = LinearRegression()
model.fit(X_train, Y_train)
# evaluate the model
score = model.score(X_test, Y_test)
print("R^2 score:", score)

R^2 score: 0.14905799790637875


In [275]:
# fit a linear model
model = Ridge()
model.fit(X_train, Y_train)
# evaluate the model
score = model.score(X_test, Y_test)
print("R^2 score:", score)

R^2 score: 0.14798495474206885


In [276]:
model = BayesianRidge()
model.fit(X_train, Y_train)
# evaluate the model
score = model.score(X_test, Y_test)
print("R^2 score:", score)

R^2 score: 0.1459377778223997


In [277]:
reg = GradientBoostingRegressor()
reg.fit(X_train, Y_train)
reg.score(X_test, Y_test)

0.2956996879458341

In [278]:
# Access feature importances
importances = reg.feature_importances_

print(len(importances))
# Print feature importance for each feature
for i, importance in enumerate(importances):
    # if importance > 0.01:
    print(f"Feature {i}: {importance}")

13
Feature 0: 0.04731427835045303
Feature 1: 0.004085442522173061
Feature 2: 0.08002911657344339
Feature 3: 0.08825191811028324
Feature 4: 0.02694038450256069
Feature 5: 0.012739485711002292
Feature 6: 0.01723659771007056
Feature 7: 0.05147366738222096
Feature 8: 0.015398561900726701
Feature 9: 0.12990925474950069
Feature 10: 0.27878558929572733
Feature 11: 0.21890070536401054
Feature 12: 0.02893499782782754


In [279]:
from sklearn.inspection import permutation_importance

# Assuming you have the model and data
results = permutation_importance(reg, X, Y, n_repeats=100, random_state=42)

results_importances = results.importances_mean

# Display the importance of each feature
# Print feature importance for each feature
for i, importance in enumerate(results_importances):
    # if importance > 0.03:
    print(f"Feature {i}: {importance}")

Feature 0: 0.0
Feature 1: 0.0
Feature 2: 0.0
Feature 3: -9.664360223583746e-05
Feature 4: 0.0
Feature 5: 0.0
Feature 6: 0.0
Feature 7: 0.0
Feature 8: 1.0322023079655241e-05
Feature 9: 0.15432049749423943
Feature 10: 0.0
Feature 11: 0.06457852546329707
Feature 12: 0.0025389394251925756
