## Load/Clean Data

Sources:

https://www.basketball-reference.com/players/

**Scrape multiple seasons**

In [695]:
import pandas as pd

PlayerID = "w/whiteco01"

def scrape_season_data(season):
    url = f'https://www.basketball-reference.com/players/{PlayerID}/gamelog/{season}'
    print(f'Scraped from url: {url}')

    html = pd.read_html(url, header=0)
    scraped_data = html[7]

    return scraped_data

# Main loop to scrape data for multiple seasons
all_data = pd.DataFrame()  # Master DataFrame to store all seasons

seasons_to_scrape = ['2023', '2024']  # which seasons to scrape

for i in seasons_to_scrape:
    season_data = scrape_season_data(i)
    all_data = pd.concat([all_data, season_data], ignore_index=True)


pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

df = all_data.reset_index(drop=True)
all_data

Scraped from url: https://www.basketball-reference.com/players/w/whiteco01/gamelog/2023
Scraped from url: https://www.basketball-reference.com/players/w/whiteco01/gamelog/2024


Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1,2022-10-19,22-245,CHI,@,MIA,W (+8),0,24:33,...,2,2,0,1,0,1,3,10,3.3,0
1,2,2,2022-10-21,22-247,CHI,@,WAS,L (-2),0,31:03,...,2,2,1,1,0,1,1,10,7.1,+17
2,3,3,2022-10-22,22-248,CHI,,CLE,L (-32),0,23:36,...,0,1,1,0,0,0,3,5,0.4,-30
3,4,4,2022-10-24,22-250,CHI,,BOS,W (+18),0,9:24,...,0,0,4,0,0,0,1,0,1.7,+5
4,5,5,2022-10-26,22-252,CHI,,IND,W (+15),0,11:56,...,3,3,0,1,0,0,1,6,4.1,+3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,33,33,2023-12-28,23-315,CHI,,IND,L (-16),1,33:04,...,5,5,2,0,0,2,3,16,4.4,-24
120,34,34,2023-12-30,23-317,CHI,,PHI,W (+13),1,39:30,...,6,8,2,0,0,5,3,20,10.3,+10
121,35,35,2024-01-02,23-320,CHI,@,PHI,L (-13),1,28:30,...,4,4,3,2,0,1,4,14,7.8,-26
122,36,36,2024-01-03,23-321,CHI,@,NYK,L (-16),1,37:42,...,4,4,8,0,1,3,1,26,20.4,-23


**Checking for games where they didn't play**

In [696]:
inactive_rows = df[df['GS'].isin(['Inactive', 'Did Not Dress', "Did Not Play"])]

print("Number of rows with 'Inactive' or 'Did Not Dress/Play' in 'GS':", len(inactive_rows))


Number of rows with 'Inactive' or 'Did Not Dress/Play' in 'GS': 8


**Data cleaning**

In [697]:
import numpy as np

df = df.drop(df[df.G == "G"].index)                          # deletes repeating headers in content
df = df.drop(inactive_rows.index)                            # deletes rows/games where he was inactive

df.rename(columns={'Rk': 'SeasonGame',
                   'Unnamed: 5': 'Home',
                   'PTS': 'LastPTS',
                   'GmSc': 'LastGmSc',
                   'MP': 'MPLast'}, inplace=True)            # renames columns to mean previous game stats

df = df.drop(columns=['SeasonGame',                          # deletes useless columns
                      'Date',                                
                      'G', # which game played in                                
                      'Tm', # team they play for
                      'Unnamed: 7',
                      'Age',
                      'GS', # did they start the game
                      'PF', # personal fouls
                      '+/-'])

df['Home'].replace({np.nan: '0', '@': '1'}, inplace=True)       # reformats "Home" values
df['MPLast'] = pd.to_datetime(df['MPLast'], format='%M:%S').dt.minute   # turns MPLast into a datetime type and rounds to nearest minute
df.fillna(0, inplace=True)                                      # replaces NaN with 0
df = df.reset_index(drop=True)                                  # resets index

df

Unnamed: 0,Home,Opp,MPLast,FG,FGA,FG%,3P,3PA,3P%,FT,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,LastPTS,LastGmSc
0,1,MIA,24,4,11,.364,2,7,.286,0,...,0,0,2,2,0,1,0,1,10,3.3
1,1,WAS,31,3,6,.500,2,4,.500,2,...,.500,0,2,2,1,1,0,1,10,7.1
2,0,CLE,23,2,8,.250,0,5,.000,1,...,1.000,1,0,1,1,0,0,0,5,0.4
3,0,BOS,9,0,1,.000,0,1,.000,0,...,0,0,0,0,4,0,0,0,0,1.7
4,0,IND,11,2,6,.333,2,6,.333,0,...,0,0,3,3,0,1,0,0,6,4.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0,IND,33,5,19,.263,2,6,.333,4,...,1.000,0,5,5,2,0,0,2,16,4.4
107,0,PHI,39,7,15,.467,1,5,.200,5,...,.833,2,6,8,2,0,0,5,20,10.3
108,1,PHI,28,5,15,.333,0,5,.000,4,...,.800,0,4,4,3,2,0,1,14,7.8
109,1,NYK,37,9,19,.474,6,9,.667,2,...,1.000,0,4,4,8,0,1,3,26,20.4


**Additional cleaning of data**

**creating columns that average stats from last 5 games**


In [698]:
# creating a column that is the average points from last 5 games
# .rolling() creates a rolling window of size 5, considering at least 1 value to calculate the average.
df['avgPTSLast5'] = df['LastPTS'].rolling(window=5, min_periods=1).mean()   

df['NextOpp'] = df['Opp'].shift(-1)                             # Shift 'Opp' values down by one and assign to 'NextOpp' and so on
df['NextPTS'] = df['LastPTS'].shift(-1)

df = df.drop(columns=['Opp', 'NextOpp'])                        # removing oppositions (categorical) and instead trying historical opponent data




mostRecentGame = df.iloc[-1:]                                   # keeps the last row for prediction later
df = df.drop(df.index[-1])                                      # drops the last row because the predictor variable (NextPTS) is 0 so that could mess up the training

df

Unnamed: 0,Home,MPLast,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB,TRB,AST,STL,BLK,TOV,LastPTS,LastGmSc,avgPTSLast5,NextPTS
0,1,24,4,11,.364,2,7,.286,0,0,...,2,2,0,1,0,1,10,3.3,10.000000,10
1,1,31,3,6,.500,2,4,.500,2,4,...,2,2,1,1,0,1,10,7.1,10.000000,5
2,0,23,2,8,.250,0,5,.000,1,1,...,0,1,1,0,0,0,5,0.4,8.333333,0
3,0,9,0,1,.000,0,1,.000,0,0,...,0,0,4,0,0,0,0,1.7,6.250000,6
4,0,11,2,6,.333,2,6,.333,0,0,...,3,3,0,1,0,0,6,4.1,6.200000,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,0,36,7,22,.318,0,7,.000,5,8,...,5,5,5,0,0,1,19,8.4,19.800000,16
106,0,33,5,19,.263,2,6,.333,4,4,...,5,5,2,0,0,2,16,4.4,18.200000,20
107,0,39,7,15,.467,1,5,.200,5,6,...,6,8,2,0,0,5,20,10.3,18.800000,14
108,1,28,5,15,.333,0,5,.000,4,5,...,4,4,3,2,0,1,14,7.8,17.200000,26


In [699]:
# Convert multiple columns to numeric
columns_to_convert = ['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 
                      'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
                      'STL', 'BLK', 'TOV', 'LastPTS', 
                      'LastGmSc', 'NextPTS']

for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0)

print(df.dtypes)

Home            object
MPLast           int32
FG               int64
FGA              int64
FG%            float64
3P               int64
3PA              int64
3P%            float64
FT               int64
FTA              int64
FT%            float64
ORB              int64
DRB              int64
TRB              int64
AST              int64
STL              int64
BLK              int64
TOV              int64
LastPTS          int64
LastGmSc       float64
avgPTSLast5    float64
NextPTS          int64
dtype: object


## Correlation Analysis of variables

correlation coefficients closer to 0 may contribute less or none at all to the prediction. If negative, it has an inverse relationship

In [700]:
correlation_matrix = df.corr()
correlations = correlation_matrix['NextPTS'].sort_values(ascending=False)
pd.DataFrame(correlations)
correlations

NextPTS        1.000000
avgPTSLast5    0.569505
LastPTS        0.474403
FG             0.441170
FGA            0.436362
MPLast         0.395923
LastGmSc       0.387658
3P             0.364980
3PA            0.360585
FT             0.308990
FTA            0.300610
TRB            0.284977
DRB            0.277350
AST            0.272421
TOV            0.241766
BLK            0.219965
FT%            0.208104
3P%            0.195041
FG%            0.185648
ORB            0.140835
Home           0.019523
STL           -0.101354
Name: NextPTS, dtype: float64

## Feature Importance (Run this)

Check those importance scores to understand which features have a higher impact on predictions. 

This cell must be run because the X and y dataframes are created here and may be altered in the next cell to remove columns before training

In [701]:
from sklearn.ensemble import RandomForestRegressor

X = df.drop(['NextPTS'], axis=1)
y = df['NextPTS']

model = RandomForestRegressor()
model.fit(X, y)

feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

feature_importances


avgPTSLast5    0.426512
LastGmSc       0.067710
MPLast         0.054355
FG%            0.048509
3P%            0.046480
AST            0.036379
STL            0.035718
TOV            0.033273
FGA            0.032405
3PA            0.027950
TRB            0.024901
BLK            0.024077
FG             0.023133
LastPTS        0.022883
DRB            0.022750
FTA            0.020621
3P             0.020351
ORB            0.008362
FT             0.008239
FT%            0.008222
Home           0.007170
dtype: float64

### OPTIONAL: 

remove certain columns if they may be causing excess noise, which may lead to poor generalization and overfitting

In [702]:
X = X.drop(columns=['ORB', 'Home']) 
print(X.columns)

Index(['MPLast', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'LastPTS', 'LastGmSc',
       'avgPTSLast5'],
      dtype='object')


## Random Forest

In [703]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd


# Separation is done in the Feature Importance cell
# X = df.drop("NextPTS", axis=1)
# y = df['NextPTS']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_rf_train_pred = rf.predict(X_train)
y_rf_test_pred = rf.predict(X_test)


### SKIP THIS IF RAN ABOVE

This is a random forest regressor that uses hyperparamters, takes longer to load, needs good gpu

**n_estimators**: The number of trees in the forest. Increasing the number of trees generally helps, but it may also lead to longer training times.

**max_depth**: The maximum depth of each tree in the forest. Controlling the depth can help prevent overfitting.

**min_samples_split**: The minimum number of samples required to split an internal node. Increasing this parameter can lead to a more robust model.

**min_samples_leaf**: The minimum number of samples required to be at a leaf node. Similar to min_samples_split, increasing this parameter can prevent overfitting.

**max_features**: The number of features to consider when looking for the best split. Limiting the number of features considered can be useful to reduce overfitting.

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd


# Separation is done in the Feature Importance cell
# X = df.drop("NextPTS", axis=1)
# y = df['NextPTS']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
    }

# Create a RandomForestRegressor
rf = RandomForestRegressor()

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a Random Forest Regressor with the best hyperparameters
best_rf = RandomForestRegressor(**best_params)

# Fit the model on the training data
best_rf.fit(X_train, y_train)

# Predictions
y_rf_train_pred = best_rf.predict(X_train) # this is predictions for the train set, which its already seen, so kinda useless. only for comparing the MSE
y_rf_test_pred = best_rf.predict(X_test)

**Actually Predicting**


In [704]:
from sklearn.metrics import mean_squared_error, r2_score

rf_train_mse = mean_squared_error(y_train, y_rf_train_pred)
rf_train_r2 = r2_score(y_train, y_rf_train_pred)

rf_test_mse = mean_squared_error(y_test, y_rf_test_pred)
rf_test_r2 = r2_score(y_test, y_rf_test_pred)

In [705]:
rf_results = pd.DataFrame(["Random Forest", rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]).transpose()
rf_results.columns = ["Method", "Training MSE", "Training R2", "Test MSE", "Test R2"]
rf_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Random Forest,5.051423,0.905054,60.848277,-0.080636


In [706]:
X_new = df
X_new = X_new.loc[[92]]
#X_new = mostRecentGame
X_new

Unnamed: 0,Home,MPLast,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB,TRB,AST,STL,BLK,TOV,LastPTS,LastGmSc,avgPTSLast5,NextPTS
92,1,29,7,12,0.583,3,5,0.6,2,2,...,1,1,3,0,0,0,19,15.8,19.8,23


In [707]:
# Ensure that X_new has the same order of columns as X used during training
X_new = X_new[X.columns]

next_game_predictions = rf.predict(X_new)

# Make predictions using the trained model w/ HYPERPARAMETERS
#next_game_predictions = best_rf.predict(X_new)

# Display
print("Predicted Points for Next Game:", next_game_predictions)


Predicted Points for Next Game: [21.81]


## One hot encoding explained

Takes a column which has categorical data, which has been label encoded and then splits the column into multiple columns. The numbers are replaced by 1s and 0s, depending on which column has what value.

This is better than label encoding because since there are different numbers in the same column, the model will misunderstand the data to be in some kind of order, 0 < 1 < 2

![Example One Hot Encoding](Data/one_hot_encoding_example.png)

# Decision Tree Classifier


In [710]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Convert points to categories
# Example: 0-5 points, 6-10 points, 11-15 points, and so on
bins = [-1, 5, 10, 15, 20, 25, 30, 35, 40, 45, float('inf')]
labels = ['0-5', '6-10', '11-15', '16-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46+']

# Step 2: Modify the target variable in the original DataFrame
df['PointsCategory'] = pd.cut(df['NextPTS'], bins=bins, labels=labels, right=False)

# Step 2: Apply one-hot encoding to the categorical columns in the entire DataFrame
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Step 3: Split data into features (X) and the new categorical target variable (y)
X = df_encoded.drop(['NextPTS', 'PointsCategory'], axis=1)
y = df_encoded['PointsCategory']

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=95)

# Step 5: Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Step 6: Make Predictions
predictions = dt_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, predictions))


Accuracy: 0.3181818181818182

Classification Report:
               precision    recall  f1-score   support

         0-5       0.00      0.00      0.00         1
       11-15       0.40      0.67      0.50         6
       16-20       0.20      0.25      0.22         4
       21-25       0.00      0.00      0.00         1
       26-30       0.00      0.00      0.00         1
        6-10       1.00      0.22      0.36         9

    accuracy                           0.32        22
   macro avg       0.27      0.19      0.18        22
weighted avg       0.55      0.32      0.33        22



In [711]:
predResults = pd.DataFrame({'y_test': y_test, 'predictions': predictions})
predResults['correct'] = predResults['y_test'] == predResults['predictions']
predResults

Unnamed: 0,y_test,predictions,correct
21,11-15,11-15,True
105,16-20,26-30,False
79,11-15,16-20,False
59,6-10,11-15,False
55,6-10,11-15,False
40,6-10,11-15,False
83,16-20,16-20,True
38,6-10,6-10,True
26,6-10,11-15,False
99,21-25,16-20,False
