## Load/Clean Data

Sources: 

https://www.basketball-reference.com/leagues/NBA_2024_per_game.html

https://www.basketball-reference.com/players/a/antetgi01/gamelog/2024

In [25]:
import pandas as pd

playerID = "r/reidna01"   
url = "https://www.basketball-reference.com/players/" + playerID + "/gamelog/2024"

html = pd.read_html(url, header=0)
df = html[7]                              # selects desired table on website page
df

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1,2023-10-25,24-060,MIN,@,TOR,L (-3),0,23:46,...,3,4,1,0,0,1,0,10,6.9,+1
1,2,2,2023-10-28,24-063,MIN,,MIA,W (+16),0,28:26,...,7,8,1,2,0,0,3,25,23.5,+30
2,3,3,2023-10-30,24-065,MIN,@,ATL,L (-14),0,23:58,...,1,1,0,0,0,1,1,13,5.5,-21
3,4,4,2023-11-01,24-067,MIN,,DEN,W (+21),0,19:19,...,4,5,1,1,0,1,0,16,14.7,+4
4,5,5,2023-11-04,24-070,MIN,,UTA,W (+28),0,22:07,...,3,5,1,0,0,1,0,16,13.3,+9
5,6,6,2023-11-06,24-072,MIN,,BOS,W (+5),0,25:06,...,3,4,2,2,2,3,0,14,13.1,-2
6,7,7,2023-11-08,24-074,MIN,,NOP,W (+21),0,21:46,...,3,4,0,1,0,0,2,9,6.8,+17
7,8,8,2023-11-10,24-076,MIN,@,SAS,W (+7),0,18:14,...,2,3,0,0,2,0,1,7,5.6,+7
8,9,9,2023-11-12,24-078,MIN,@,GSW,W (+6),0,18:41,...,5,6,2,0,0,2,2,10,6.4,+10
9,10,10,2023-11-14,24-080,MIN,@,GSW,W (+3),0,16:24,...,2,2,0,0,0,1,4,10,6.1,-8


**Data cleaning**

In [26]:
import numpy as np

df = df.drop(df[df.Date == "Date"].index)                    # deletes repeating headers in content
df = df.drop(df[df.GS == "Inactive"].index)                  # deletes games where he was inactive
df = df.drop(columns=["Date", "Unnamed: 7", "Age", "GS", "+/-"])            # deletes useless columns
df.rename(columns={'Unnamed: 5': 'Home'}, inplace=True)      # renames "Unnamed 5" to "Home"
df['Home'].replace({np.nan: 'Yes', '@': 'No'}, inplace=True) # reformats "Home" values
df['NextPTS'] = df['PTS'].shift(-1)                          # Shift 'PTS' values down by one and assign to 'NextPTS'
df.fillna(0, inplace=True)                                   # replaces NaN with 0
df = df.reset_index(drop=True)                               # resets index

df


Unnamed: 0,Rk,G,Tm,Home,Opp,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextPTS
0,1,1,MIN,No,TOR,23:46,3,8,0.375,0,...,3,4,1,0,0,1,0,10,6.9,25
1,2,2,MIN,Yes,MIA,28:26,10,14,0.714,4,...,7,8,1,2,0,0,3,25,23.5,13
2,3,3,MIN,No,ATL,23:58,5,12,0.417,3,...,1,1,0,0,0,1,1,13,5.5,16
3,4,4,MIN,Yes,DEN,19:19,6,9,0.667,1,...,4,5,1,1,0,1,0,16,14.7,16
4,5,5,MIN,Yes,UTA,22:07,5,9,0.556,3,...,3,5,1,0,0,1,0,16,13.3,14
5,6,6,MIN,Yes,BOS,25:06,5,9,0.556,2,...,3,4,2,2,2,3,0,14,13.1,9
6,7,7,MIN,Yes,NOP,21:46,4,8,0.5,1,...,3,4,0,1,0,0,2,9,6.8,7
7,8,8,MIN,No,SAS,18:14,3,7,0.429,1,...,2,3,0,0,2,0,1,7,5.6,10
8,9,9,MIN,No,GSW,18:41,3,8,0.375,2,...,5,6,2,0,0,2,2,10,6.4,10
9,10,10,MIN,No,GSW,16:24,4,5,0.8,2,...,2,2,0,0,0,1,4,10,6.1,13


In [13]:
print(df.dtypes)

Rk         object
G          object
Tm         object
Home       object
Opp        object
MP         object
FG         object
FGA        object
FG%        object
3P         object
3PA        object
3P%        object
FT         object
FTA        object
FT%        object
ORB        object
DRB        object
TRB        object
AST        object
STL        object
BLK        object
TOV        object
PF         object
PTS        object
GmSc       object
NextPTS    object
dtype: object


In [27]:
# Convert multiple columns to numeric
columns_to_convert = ['G', 'FG', 'FGA', '3P', '3PA', 'FT', 
                      'FTA', 'ORB', 'DRB', 'TRB', 'AST', 
                      'STL', 'BLK', 'TOV', 'PF', 'PTS', 'NextPTS']

for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0)

print(df.dtypes)

Rk         object
G           int64
Tm         object
Home       object
Opp        object
MP         object
FG          int64
FGA         int64
FG%        object
3P          int64
3PA         int64
3P%        object
FT          int64
FTA         int64
FT%        object
ORB         int64
DRB         int64
TRB         int64
AST         int64
STL         int64
BLK         int64
TOV         int64
PF          int64
PTS         int64
GmSc       object
NextPTS     int64
dtype: object


In [15]:
df

Unnamed: 0,Rk,G,Tm,Home,Opp,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextPTS
0,1,1,MIN,No,TOR,23:46,3,8,0.375,0,...,3,4,1,0,0,1,0,10,6.9,25
1,2,2,MIN,Yes,MIA,28:26,10,14,0.714,4,...,7,8,1,2,0,0,3,25,23.5,13
2,3,3,MIN,No,ATL,23:58,5,12,0.417,3,...,1,1,0,0,0,1,1,13,5.5,16
3,4,4,MIN,Yes,DEN,19:19,6,9,0.667,1,...,4,5,1,1,0,1,0,16,14.7,16
4,5,5,MIN,Yes,UTA,22:07,5,9,0.556,3,...,3,5,1,0,0,1,0,16,13.3,14
5,6,6,MIN,Yes,BOS,25:06,5,9,0.556,2,...,3,4,2,2,2,3,0,14,13.1,9
6,7,7,MIN,Yes,NOP,21:46,4,8,0.5,1,...,3,4,0,1,0,0,2,9,6.8,7
7,8,8,MIN,No,SAS,18:14,3,7,0.429,1,...,2,3,0,0,2,0,1,7,5.6,10
8,9,9,MIN,No,GSW,18:41,3,8,0.375,2,...,5,6,2,0,0,2,2,10,6.4,10
9,10,10,MIN,No,GSW,16:24,4,5,0.8,2,...,2,2,0,0,0,1,4,10,6.1,13


## Data Seperation/Splitting (Random Forest)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

categorical_columns = ['Tm', 'Home', 'Opp', 'MP'] # NEED TO FIX Minutes Played LATER TO ACTUALLY BE A NUMERIC

# Apply one-hot encoding to each categorical column
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df_encoded = df_encoded.drop(df.index[-1]) # drops the last row because the predictor variable (NextPTS) is 0 so that could mess up the training

# Separate features (X) and target variable (y)
X = df_encoded.drop("NextPTS", axis=1)
y = df_encoded['NextPTS']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Create and train the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Predictions 
y_rf_train_pred = rf.predict(X_train)
y_rf_test_pred = rf.predict(X_test)


Apply model to make prediction

In [29]:
from sklearn.metrics import mean_squared_error, r2_score

rf_train_mse = mean_squared_error(y_train, y_rf_train_pred)
rf_train_r2 = r2_score(y_train, y_rf_train_pred)

rf_test_mse = mean_squared_error(y_test, y_rf_test_pred)
rf_test_r2 = r2_score(y_test, y_rf_test_pred)

In [30]:
rf_results = pd.DataFrame(["Random Forest", rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]).transpose()
rf_results.columns = ["Method", "Training MSE", "Training R2", "Test MSE", "Test R2"]
rf_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Random Forest,3.713965,0.872795,56.651071,-1.309403


## Actually Predicting


In [38]:
X_new = df
X_new = X_new.loc[[8]]   
X_new

Unnamed: 0,Rk,G,Tm,Home,Opp,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextPTS
8,9,9,MIN,No,GSW,18:41,3,8,0.375,2,...,5,6,2,0,0,2,2,10,6.4,10


In [39]:
# Apply one-hot encoding to the categorical columns in X_new
X_new_encoded = pd.get_dummies(X_new, columns=categorical_columns, drop_first=True)

# Ensure that the order of columns in X_new_encoded is consistent with the order used during training
X_new_encoded = X_new_encoded.reindex(columns=X.columns, fill_value=0)

# Make predictions using the trained model
next_game_predictions = rf.predict(X_new_encoded)

# Display or use the predictions as needed
print("Predicted Points for Next Game:", next_game_predictions)

Predicted Points for Next Game: [11.52]


## Decision Tree (used for categorical predictions)

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Convert points to categories
# Example: 0-5 points, 6-10 points, 11-15 points, and so on
bins = [-1, 5, 10, 15, 20, 25, 30, 35, 40, 45, float('inf')]
labels = ['0-5', '6-10', '11-15', '16-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46+']

# Step 2: Modify the target variable in the original DataFrame
df['PointsCategory'] = pd.cut(df['NextPTS'], bins=bins, labels=labels, right=False)

# Step 2: Apply one-hot encoding to the categorical columns in the entire DataFrame
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df_encoded = df_encoded.drop(df.index[-1]) # drops the last row because the predictor variable (NextPTS) is 0 so that could mess up the training

# Step 3: Split data into features (X) and the new categorical target variable (y)
X = df_encoded.drop(['NextPTS', 'PointsCategory'], axis=1)
y = df_encoded['PointsCategory']

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Step 5: Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Step 6: Make Predictions
predictions = dt_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, predictions))


Accuracy: 0.2857142857142857

Classification Report:
               precision    recall  f1-score   support

       11-15       0.00      0.00      0.00         2
       16-20       0.00      0.00      0.00         0
       21-25       0.00      0.00      0.00         1
       26-30       0.00      0.00      0.00         0
        6-10       0.67      0.50      0.57         4

    accuracy                           0.29         7
   macro avg       0.13      0.10      0.11         7
weighted avg       0.38      0.29      0.33         7



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
