## Load/Clean Data

Sources: 

https://www.basketball-reference.com/leagues/NBA_2024_per_game.html

https://www.basketball-reference.com/players/a/antetgi01/gamelog/2024

In [38]:
import pandas as pd

url = "https://www.basketball-reference.com/players/a/antetgi01/gamelog/2024"   # Giannis 23-24 season game-by-game stats

html = pd.read_html(url, header=0)
df = html[7]                              # selects desired table on website page
df

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1,2023-10-26,28-324,MIL,,PHI,W (+1),1,35:07,...,8,12,3,2,2,7,4,23,12.0,-13
1,2,2,2023-10-29,28-327,MIL,,ATL,L (-17),1,29:28,...,6,11,3,1,2,3,2,26,25.0,-11
2,3,3,2023-10-30,28-328,MIL,,MIA,W (+8),1,31:51,...,6,7,2,0,1,2,3,33,23.3,+18
3,4,4,2023-11-01,28-330,MIL,@,TOR,L (-19),1,27:41,...,1,4,2,1,0,3,3,16,9.7,-10
4,5,5,2023-11-03,28-332,MIL,,NYK,W (+5),1,35:47,...,7,8,6,0,1,7,3,22,14.5,+13
5,6,6,2023-11-06,28-335,MIL,@,BRK,W (+4),1,35:15,...,10,12,3,1,2,3,1,36,28.2,+15
6,7,7,2023-11-08,28-337,MIL,,DET,W (+2),1,21:52,...,9,9,5,0,3,1,1,15,11.9,+5
7,8,8,2023-11-09,28-338,MIL,@,IND,L (-2),1,36:29,...,10,12,3,0,0,8,4,54,40.2,+6
8,9,9,2023-11-11,28-340,MIL,@,ORL,L (-15),1,36:39,...,8,10,7,4,2,6,4,35,29.3,-11
9,10,10,2023-11-13,28-342,MIL,,CHI,W (+9),1,34:42,...,8,11,2,0,0,3,4,35,23.7,+13


**Data cleaning**

In [39]:
import numpy as np

df = df.drop(df[df.Date == "Date"].index)                    # deletes repeating headers in content
df = df.drop(df[df.GS == "Inactive"].index)                  # deletes games where he was inactive
df = df.drop(columns=["Date", "Unnamed: 7", "Age", "GS", "+/-"])            # deletes useless columns
df.rename(columns={'Unnamed: 5': 'Home'}, inplace=True)      # renames "Unnamed 5" to "Home"
df['Home'].replace({np.nan: 'Yes', '@': 'No'}, inplace=True) # reformats "Home" values
df['NextPTS'] = df['PTS'].shift(-1)                          # Shift 'PTS' values down by one and assign to 'NextPTS'
df.fillna(0, inplace=True)                                   # replaces NaN with 0
df = df.reset_index(drop=True)                               # resets index

df


Unnamed: 0,Rk,G,Tm,Home,Opp,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextPTS
0,1,1,MIL,Yes,PHI,35:07,10,21,0.476,0,...,8,12,3,2,2,7,4,23,12.0,26
1,2,2,MIL,Yes,ATL,29:28,9,14,0.643,0,...,6,11,3,1,2,3,2,26,25.0,33
2,3,3,MIL,Yes,MIA,31:51,12,21,0.571,1,...,6,7,2,0,1,2,3,33,23.3,16
3,4,4,MIL,No,TOR,27:41,5,11,0.455,0,...,1,4,2,1,0,3,3,16,9.7,22
4,5,5,MIL,Yes,NYK,35:47,6,10,0.6,3,...,7,8,6,0,1,7,3,22,14.5,36
5,6,6,MIL,No,BRK,35:15,15,27,0.556,1,...,10,12,3,1,2,3,1,36,28.2,15
6,7,7,MIL,Yes,DET,21:52,6,16,0.375,0,...,9,9,5,0,3,1,1,15,11.9,54
7,8,8,MIL,No,IND,36:29,19,25,0.76,0,...,10,12,3,0,0,8,4,54,40.2,35
8,9,9,MIL,No,ORL,36:39,15,22,0.682,0,...,8,10,7,4,2,6,4,35,29.3,35
9,10,10,MIL,Yes,CHI,34:42,13,22,0.591,1,...,8,11,2,0,0,3,4,35,23.7,16


In [40]:
print(df.dtypes)

Rk         object
G          object
Tm         object
Home       object
Opp        object
MP         object
FG         object
FGA        object
FG%        object
3P         object
3PA        object
3P%        object
FT         object
FTA        object
FT%        object
ORB        object
DRB        object
TRB        object
AST        object
STL        object
BLK        object
TOV        object
PF         object
PTS        object
GmSc       object
NextPTS    object
dtype: object


In [41]:
# Convert multiple columns to numeric
columns_to_convert = ['G', 'FG', 'FGA', '3P', '3PA', 'FT', 
                      'FTA', 'ORB', 'DRB', 'TRB', 'AST', 
                      'STL', 'BLK', 'TOV', 'PF', 'PTS', 'NextPTS']

for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0)

print(df.dtypes)

Rk         object
G           int64
Tm         object
Home       object
Opp        object
MP         object
FG          int64
FGA         int64
FG%        object
3P          int64
3PA         int64
3P%        object
FT          int64
FTA         int64
FT%        object
ORB         int64
DRB         int64
TRB         int64
AST         int64
STL         int64
BLK         int64
TOV         int64
PF          int64
PTS         int64
GmSc       object
NextPTS     int64
dtype: object


In [42]:
df

Unnamed: 0,Rk,G,Tm,Home,Opp,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextPTS
0,1,1,MIL,Yes,PHI,35:07,10,21,0.476,0,...,8,12,3,2,2,7,4,23,12.0,26
1,2,2,MIL,Yes,ATL,29:28,9,14,0.643,0,...,6,11,3,1,2,3,2,26,25.0,33
2,3,3,MIL,Yes,MIA,31:51,12,21,0.571,1,...,6,7,2,0,1,2,3,33,23.3,16
3,4,4,MIL,No,TOR,27:41,5,11,0.455,0,...,1,4,2,1,0,3,3,16,9.7,22
4,5,5,MIL,Yes,NYK,35:47,6,10,0.6,3,...,7,8,6,0,1,7,3,22,14.5,36
5,6,6,MIL,No,BRK,35:15,15,27,0.556,1,...,10,12,3,1,2,3,1,36,28.2,15
6,7,7,MIL,Yes,DET,21:52,6,16,0.375,0,...,9,9,5,0,3,1,1,15,11.9,54
7,8,8,MIL,No,IND,36:29,19,25,0.76,0,...,10,12,3,0,0,8,4,54,40.2,35
8,9,9,MIL,No,ORL,36:39,15,22,0.682,0,...,8,10,7,4,2,6,4,35,29.3,35
9,10,10,MIL,Yes,CHI,34:42,13,22,0.591,1,...,8,11,2,0,0,3,4,35,23.7,16


## Data Seperation/Splitting

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

categorical_columns = ['Tm', 'Home', 'Opp', 'MP']

# Apply one-hot encoding to each categorical column
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Separate features (X) and target variable (y)
X = df_encoded.drop("NextPTS", axis=1)
y = df_encoded['NextPTS']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Create and train the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Predictions 
y_rf_train_pred = rf.predict(X_train)
y_rf_test_pred = rf.predict(X_test)


Apply model to make prediction

In [48]:
from sklearn.metrics import mean_squared_error, r2_score

rf_train_mse = mean_squared_error(y_train, y_rf_train_pred)
rf_train_r2 = r2_score(y_train, y_rf_train_pred)

rf_test_mse = mean_squared_error(y_test, y_rf_test_pred)
rf_test_r2 = r2_score(y_test, y_rf_test_pred)

In [49]:
rf_results = pd.DataFrame(["Random Forest", rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]).transpose()
rf_results.columns = ["Method", "Training MSE", "Training R2", "Test MSE", "Test R2"]
rf_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Random Forest,14.410918,0.878021,154.081,0.246209


## Actually Predicting


In [62]:
X_new = df
X_new = X_new.loc[[34]]   
X_new

Unnamed: 0,Rk,G,Tm,Home,Opp,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextPTS
34,36,35,MIL,No,HOU,38:56,16,25,0.64,1,...,14,17,2,0,1,2,4,48,39.3,0


In [68]:
# Apply one-hot encoding to the categorical columns in X_new
X_new_encoded = pd.get_dummies(X_new, columns=categorical_columns, drop_first=True)

# Ensure that the order of columns in X_new_encoded is consistent with the order used during training
X_new_encoded = X_new_encoded.reindex(columns=X.columns, fill_value=0)

# Make predictions using the trained model
next_game_predictions = rf.predict(X_new_encoded)

# Display or use the predictions as needed
print("Predicted Points for Next Game:", next_game_predictions)

Predicted Points for Next Game: [27.46]
