## Load/Clean Data

Sources:

https://www.basketball-reference.com/leagues/NBA_2024_per_game.html

In [106]:
import pandas as pd

playerID = "g/gilgesh01"
url = "https://www.basketball-reference.com/players/" + playerID + "/gamelog/2023"

html = pd.read_html(url, header=0)
df = html[7]                              # selects desired table on website page
df

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1,2022-10-19,24-099,OKC,@,MIN,L (-7),1,36:47,...,4,6,5,3,2,2,1,32,28.8,-1
1,2,2,2022-10-22,24-102,OKC,@,DEN,L (-5),1,33:56,...,3,3,7,1,0,5,4,28,15.8,-6
2,3,,2022-10-23,24-103,OKC,,MIN,L (-10),Did Not Dress,Did Not Dress,...,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress
3,4,3,2022-10-25,24-105,OKC,,LAC,W (+14),1,36:44,...,4,5,8,3,3,2,0,33,32.6,+23
4,5,4,2022-10-27,24-107,OKC,,LAC,W (+8),1,34:01,...,4,5,6,3,0,3,4,24,16.4,+12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,79,66,2023-04-02,24-264,OKC,,PHO,L (-10),1,37:32,...,3,3,5,2,0,1,4,39,31.8,-6
82,80,67,2023-04-04,24-266,OKC,@,GSW,L (-11),1,33:41,...,4,5,7,0,0,3,5,32,25.9,-9
83,Rk,G,Date,Age,Tm,,Opp,,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
84,81,68,2023-04-06,24-268,OKC,@,UTA,W (+16),1,37:43,...,5,7,7,1,0,1,3,22,16.9,+7


**Data cleaning**

In [107]:
import numpy as np

df = df.drop(df[df.G == "G"].index)                          # deletes repeating headers in content
df = df.drop(df[df.FG == "Inactive"].index)                  # deletes games where he was inactive
df = df.drop(df[df.DRB == "Did Not Dress"].index)                  # deletes "did not dress"
df.rename(columns={'Unnamed: 5': 'Home'}, inplace=True)      # renames columns to previous game stats

df = df.drop(columns=["Date",                                # deletes useless columns
                      "Tm",
                      "Unnamed: 7",
                      "Age",
                      "GS",
                      "+/-"])
df['Home'].replace({np.nan: '1', '@': '0'}, inplace=True)       # reformats "Home" values
df['MP'] = pd.to_datetime(df['MP'], format='%M:%S').dt.minute   # turns MP into a datetime type and rounds to nearest minute
df['NextOpp'] = df['Opp'].shift(-1)                             # Shift 'Opp' values down by one and assign to 'NextOpp' and so on
df['NextPTS'] = df['PTS'].shift(-1)
df.fillna(0, inplace=True)                                      # replaces NaN with 0
df = df.reset_index(drop=True)                                  # resets index

df

Unnamed: 0,Rk,G,Home,Opp,MP,FG,FGA,FG%,3P,3PA,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextOpp,NextPTS
0,1,1,0,MIN,36,12,23,.522,2,6,...,6,5,3,2,2,1,32,28.8,DEN,28
1,2,2,0,DEN,33,11,24,.458,1,5,...,3,7,1,0,5,4,28,15.8,LAC,33
2,4,3,1,LAC,36,11,22,.500,2,2,...,5,8,3,3,2,0,33,32.6,LAC,24
3,5,4,1,LAC,34,10,23,.435,2,4,...,5,6,3,0,3,4,24,16.4,DAL,38
4,6,5,0,DAL,41,15,27,.556,0,2,...,6,9,2,1,3,2,38,32.9,ORL,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,75,64,0,POR,35,9,17,.529,0,1,...,2,3,4,0,3,1,31,26.0,IND,39
64,78,65,0,IND,38,11,22,.500,0,2,...,9,4,1,1,3,6,39,29.8,PHO,39
65,79,66,1,PHO,37,11,22,.500,0,0,...,3,5,2,0,1,4,39,31.8,GSW,32
66,80,67,0,GSW,33,11,17,.647,2,4,...,5,7,0,0,3,5,32,25.9,UTA,22


**creating a column that is the average points from last 5 games**


In [108]:
last5 = df['PTS']
last5 = pd.DataFrame(last5)
last5['averageLast5'] = last5['PTS'].shift(-1)
last5


Unnamed: 0,PTS,averageLast5
0,32,28
1,28,33
2,33,24
3,24,38
4,38,34
...,...,...
63,31,39
64,39,39
65,39,32
66,32,22


In [109]:
# Convert multiple columns to numeric
columns_to_convert = ['G', 'FG', 'FGA', '3P', '3PA', 'FT',
                      'FTA', 'ORB', 'DRB', 'TRB', 'AST',
                      'STL', 'BLK', 'TOV', 'PF', 'PTS', 'NextPTS']

for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0)

print(df.dtypes)

Rk         object
G           int64
Home       object
Opp        object
MP          int64
FG          int64
FGA         int64
FG%        object
3P          int64
3PA         int64
3P%        object
FT          int64
FTA         int64
FT%        object
ORB         int64
DRB         int64
TRB         int64
AST         int64
STL         int64
BLK         int64
TOV         int64
PF          int64
PTS         int64
GmSc       object
NextOpp    object
NextPTS     int64
dtype: object


In [110]:
df = df.drop(df.index[-1]) # drops the last row because the predictor variable (NextPTS) is 0 so that could mess up the training
df

Unnamed: 0,Rk,G,Home,Opp,MP,FG,FGA,FG%,3P,3PA,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextOpp,NextPTS
0,1,1,0,MIN,36,12,23,.522,2,6,...,6,5,3,2,2,1,32,28.8,DEN,28
1,2,2,0,DEN,33,11,24,.458,1,5,...,3,7,1,0,5,4,28,15.8,LAC,33
2,4,3,1,LAC,36,11,22,.500,2,2,...,5,8,3,3,2,0,33,32.6,LAC,24
3,5,4,1,LAC,34,10,23,.435,2,4,...,5,6,3,0,3,4,24,16.4,DAL,38
4,6,5,0,DAL,41,15,27,.556,0,2,...,6,9,2,1,3,2,38,32.9,ORL,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,74,63,0,LAL,35,9,22,.409,1,3,...,5,8,1,1,2,4,27,20.0,POR,31
63,75,64,0,POR,35,9,17,.529,0,1,...,2,3,4,0,3,1,31,26.0,IND,39
64,78,65,0,IND,38,11,22,.500,0,2,...,9,4,1,1,3,6,39,29.8,PHO,39
65,79,66,1,PHO,37,11,22,.500,0,0,...,3,5,2,0,1,4,39,31.8,GSW,32


## Random Forest

In [111]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

categorical_columns = ['Opp', 'NextOpp']

# Apply one-hot encoding to each categorical column
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df_encoded = df_encoded.drop(df.index[-1]) # drops the last row because the predictor variable (NextPTS) is 0 so that could mess up the training

# Separate features (X) and target variable (y)
X = df_encoded.drop("NextPTS", axis=1)
y = df_encoded['NextPTS']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Create and train the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_rf_train_pred = rf.predict(X_train)
y_rf_test_pred = rf.predict(X_test)


**Actually Predicting**


In [112]:
from sklearn.metrics import mean_squared_error, r2_score

rf_train_mse = mean_squared_error(y_train, y_rf_train_pred)
rf_train_r2 = r2_score(y_train, y_rf_train_pred)

rf_test_mse = mean_squared_error(y_test, y_rf_test_pred)
rf_test_r2 = r2_score(y_test, y_rf_test_pred)

In [113]:
rf_results = pd.DataFrame(["Random Forest", rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]).transpose()
rf_results.columns = ["Method", "Training MSE", "Training R2", "Test MSE", "Test R2"]
rf_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Random Forest,6.799906,0.853648,44.785571,-0.211089


In [114]:
X_new = df
X_new = X_new.loc[[32]]
X_new

Unnamed: 0,Rk,G,Home,Opp,MP,FG,FGA,FG%,3P,3PA,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextOpp,NextPTS
32,36,33,1,PHI,32,4,15,0.267,1,2,...,6,5,0,0,2,3,14,6.8,ORL,33


In [115]:
# Apply one-hot encoding to the categorical columns in X_new
X_new_encoded = pd.get_dummies(X_new, columns=categorical_columns, drop_first=True)

# Ensure that the order of columns in X_new_encoded is consistent with the order used during training
X_new_encoded = X_new_encoded.reindex(columns=X.columns, fill_value=0)

# Make predictions using the trained model
next_game_predictions = rf.predict(X_new_encoded)

# Display or use the predictions as needed
print("Predicted Points for Next Game:", next_game_predictions)

Predicted Points for Next Game: [24.16]
