## Load/Clean Data

Sources: 

https://www.basketball-reference.com/leagues/NBA_2024_per_game.html

In [21]:
import pandas as pd

playerID = "d/doncilu01"   
url = "https://www.basketball-reference.com/players/" + playerID + "/gamelog/2024"

html = pd.read_html(url, header=0)
df = html[7]                              # selects desired table on website page
df

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1,2023-10-25,24-239,DAL,@,SAS,W (+7),1,34:14,...,12,13,10,2,0,4,1,33,28.4,+4
1,2,2,2023-10-27,24-241,DAL,,BRK,W (+5),1,36:02,...,9,10,7,0,0,0,1,49,45.0,+9
2,3,3,2023-10-30,24-244,DAL,@,MEM,W (+15),1,40:00,...,12,12,12,1,1,6,4,35,28.9,+6
3,4,4,2023-11-01,24-246,DAL,,CHI,W (+9),1,40:03,...,7,7,10,2,1,6,2,18,13.0,+11
4,5,5,2023-11-03,24-248,DAL,@,DEN,L (-11),1,38:16,...,7,10,8,0,1,9,2,34,21.9,-4
5,6,6,2023-11-05,24-250,DAL,,CHO,W (+6),1,36:25,...,11,12,9,0,1,4,3,23,18.3,+3
6,7,7,2023-11-06,24-251,DAL,@,ORL,W (+15),1,35:09,...,3,3,6,2,0,6,2,29,21.0,+7
7,8,8,2023-11-08,24-253,DAL,,TOR,L (-11),1,38:24,...,7,7,8,4,1,4,3,31,23.2,-7
8,9,9,2023-11-10,24-255,DAL,,LAC,W (+18),1,31:41,...,6,6,6,1,0,1,1,44,41.7,+32
9,10,10,2023-11-12,24-257,DAL,@,NOP,W (+12),1,27:59,...,4,4,9,1,0,2,0,30,27.9,+14


**Data cleaning**

In [22]:
import numpy as np

df = df.drop(df[df.G == "G"].index)                          # deletes repeating headers in content
df = df.drop(df[df.FG == "Inactive"].index)                  # deletes games where he was inactive
df.rename(columns={'Unnamed: 5': 'Home'}, inplace=True)      # renames columns to previous game stats
      
df = df.drop(columns=["Date",                                # deletes useless columns
                      "Tm",
                      "Unnamed: 7", 
                      "Age", 
                      "GS", 
                      "+/-"])       
df['Home'].replace({np.nan: '1', '@': '0'}, inplace=True)       # reformats "Home" values
df['MP'] = pd.to_datetime(df['MP'], format='%M:%S').dt.minute   # turns MP into a datetime type and rounds to nearest minute
df['NextOpp'] = df['Opp'].shift(-1)                             # Shift 'Opp' values down by one and assign to 'NextOpp' and so on
df['NextPTS'] = df['PTS'].shift(-1)                          
df.fillna(0, inplace=True)                                      # replaces NaN with 0
df = df.reset_index(drop=True)                                  # resets index

df

Unnamed: 0,Rk,G,Home,Opp,MP,FG,FGA,FG%,3P,3PA,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextOpp,NextPTS
0,1,1,0,SAS,34,13,25,0.52,3,11,...,13,10,2,0,4,1,33,28.4,BRK,49
1,2,2,1,BRK,36,16,25,0.64,9,14,...,10,7,0,0,0,1,49,45.0,MEM,35
2,3,3,0,MEM,40,11,22,0.5,6,12,...,12,12,1,1,6,4,35,28.9,CHI,18
3,4,4,1,CHI,40,5,16,0.313,1,8,...,7,10,2,1,6,2,18,13.0,DEN,34
4,5,5,0,DEN,38,11,24,0.458,6,11,...,10,8,0,1,9,2,34,21.9,CHO,23
5,6,6,1,CHO,36,8,19,0.421,1,9,...,12,9,0,1,4,3,23,18.3,ORL,29
6,7,7,0,ORL,35,9,17,0.529,5,10,...,3,6,2,0,6,2,29,21.0,TOR,31
7,8,8,1,TOR,38,11,26,0.423,2,10,...,7,8,4,1,4,3,31,23.2,LAC,44
8,9,9,1,LAC,31,17,21,0.81,6,9,...,6,6,1,0,1,1,44,41.7,NOP,30
9,10,10,0,NOP,27,11,18,0.611,4,9,...,4,9,1,0,2,0,30,27.9,NOP,16


In [None]:
#this is where i left off

In [83]:
# Convert multiple columns to numeric
columns_to_convert = ['G', 'FG', 'FGA', '3P', '3PA', 'FT', 
                      'FTA', 'ORB', 'DRB', 'TRB', 'AST', 
                      'STL', 'BLK', 'TOV', 'PF', 'PTS', 'NextPTS']

df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

print(df.dtypes)

Home         int64
G            int64
FG           int64
FGA          int64
3P           int64
3PA          int64
FT           int64
FTA          int64
GmSc       float64
PTS          int64
NextPTS      int64
dtype: object


In [84]:
df = df.drop(df.index[-1]) # drops the last row because the predictor variable (NextPTS) is 0 so that could mess up the training
df

Unnamed: 0,Home,G,FG,FGA,3P,3PA,FT,FTA,GmSc,PTS,NextPTS
0,0,1,13,25,3,11,4,7,28.4,33,49
1,1,2,16,25,9,14,8,10,45.0,49,35
2,0,3,11,22,6,12,7,10,28.9,35,18
3,1,4,5,16,1,8,7,9,13.0,18,34
4,0,5,11,24,6,11,6,7,21.9,34,23
5,1,6,8,19,1,9,6,7,18.3,23,29
6,0,7,9,17,5,10,6,6,21.0,29,31
7,1,8,11,26,2,10,7,10,23.2,31,44
8,1,9,17,21,6,9,4,4,41.7,44,30
9,0,10,11,18,4,9,4,5,27.9,30,16


## Linear Regression

In [85]:
from sklearn.model_selection import train_test_split

y = df['NextPTS']
X = df.drop('NextPTS', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=100)

y_test

13    30
27    39
1     35
25    39
5     29
Name: NextPTS, dtype: int64

In [86]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
# train the empty linear regression model on the following data set
lr.fit(X_train, y_train)
# applying the model to make prediction
y_lr_train_pred = lr.predict(X_train)
y_lr_test_pred = lr.predict(X_test)

y_lr_test_pred

array([38.70561126, 26.5279756 , 23.24342857, 47.96512052, 41.04765674])

In [87]:
from sklearn.metrics import mean_squared_error, r2_score

lr_train_mse = mean_squared_error(y_train, y_lr_train_pred)
lr_train_r2 = r2_score(y_train, y_lr_train_pred)

lr_test_mse = mean_squared_error(y_test, y_lr_test_pred)
lr_test_r2 = r2_score(y_test, y_lr_test_pred)



In [88]:
lr_results = pd.DataFrame(["Linear Regression", lr_train_mse, lr_train_r2, lr_test_mse, lr_test_r2]).transpose()
lr_results.columns = ["Method", "Training MSE", "Training R2", "Test MSE", "Test R2"]
lr_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Linear Regression,39.817186,0.474617,119.01509,-5.52495


**Actually Predicting**


In [79]:
X_new = df
X_new = X_new.loc[[29]]   
X_new

Unnamed: 0,Home,G,FG,FGA,3P,3PA,FT,FTA,GmSc,PTS,NextPTS
29,0,30,14,29,5,13,6,7,29.8,39,19


In [65]:
# Ensure that the order of columns in X_new_encoded is consistent with the order used during training
X_new_encoded = X_new_encoded.reindex(columns=X.columns, fill_value=0)

# Make predictions using the trained model
next_game_predictions = rf.predict(X_new_encoded)

# Display or use the predictions as needed
print("Predicted Points for Next Game:", next_game_predictions)

NameError: name 'X_new_encoded' is not defined

## Random Forest