v1 Uses only 'G', 'FG', 'FGA', 'PTS'

## Load/Clean Data

Sources: 

https://www.basketball-reference.com/leagues/NBA_2024_per_game.html

https://www.basketball-reference.com/players/a/antetgi01/gamelog/2024

In [1]:
import pandas as pd

playerID = "e/edwaran01"   
url = "https://www.basketball-reference.com/players/" + playerID + "/gamelog/2024"


html = pd.read_html(url, header=0)
df = html[7]                              # selects desired table on website page
df


Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1,2023-10-25,22-081,MIN,@,TOR,L (-3),1,38:18,...,9,14,1,0,0,2,1,26,14.8,0
1,2,2,2023-10-28,22-084,MIN,,MIA,W (+16),1,33:22,...,2,2,7,0,0,2,3,19,13.6,+21
2,3,3,2023-10-30,22-086,MIN,@,ATL,L (-14),1,37:47,...,5,5,7,1,0,5,4,31,21.9,-9
3,4,4,2023-11-01,22-088,MIN,,DEN,W (+21),1,31:50,...,2,2,1,2,0,4,1,24,14.5,+11
4,5,5,2023-11-04,22-091,MIN,,UTA,W (+28),1,35:44,...,8,8,6,1,0,6,0,31,23.7,+27
5,6,6,2023-11-06,22-093,MIN,,BOS,W (+5),1,38:08,...,9,9,7,1,0,3,5,38,29.7,+18
6,7,7,2023-11-08,22-095,MIN,,NOP,W (+21),1,33:04,...,3,3,8,3,2,3,1,26,21.7,+35
7,8,8,2023-11-10,22-097,MIN,@,SAS,W (+7),1,37:08,...,5,7,5,2,1,4,1,28,20.4,+3
8,9,9,2023-11-12,22-099,MIN,@,GSW,W (+6),1,37:47,...,5,6,7,2,0,4,0,33,23.6,+15
9,10,10,2023-11-14,22-101,MIN,@,GSW,W (+3),1,36:18,...,4,4,4,3,0,5,1,20,12.4,-9


**Data cleaning**

In [2]:
import numpy as np

df = df.drop(df[df.Date == "Date"].index)                    # deletes repeating headers in content
df = df.drop(df[df.GS == "Inactive"].index)                  # deletes games where he was inactive
df = df.drop(columns=["Unnamed: 7", "GS", "+/-"])            # deletes unnamed header and "games started" header
df.rename(columns={'Unnamed: 5': 'Home'}, inplace=True)      # renames "Unnamed 5" to "Home"
df['Home'].replace({np.nan: 'Yes', '@': 'No'}, inplace=True) # reformats "Home" values
df['NextPTS'] = df['PTS'].shift(-1)                          # Shift 'PTS' values down by one and assign to 'NextPTS'
df = df.reset_index(drop=True)                               # resets index

df


Unnamed: 0,Rk,G,Date,Age,Tm,Home,Opp,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextPTS
0,1,1,2023-10-25,22-081,MIN,No,TOR,38:18,8,27,...,9,14,1,0,0,2,1,26,14.8,19.0
1,2,2,2023-10-28,22-084,MIN,Yes,MIA,33:22,7,15,...,2,2,7,0,0,2,3,19,13.6,31.0
2,3,3,2023-10-30,22-086,MIN,No,ATL,37:47,13,21,...,5,5,7,1,0,5,4,31,21.9,24.0
3,4,4,2023-11-01,22-088,MIN,Yes,DEN,31:50,8,16,...,2,2,1,2,0,4,1,24,14.5,31.0
4,5,5,2023-11-04,22-091,MIN,Yes,UTA,35:44,13,19,...,8,8,6,1,0,6,0,31,23.7,38.0
5,6,6,2023-11-06,22-093,MIN,Yes,BOS,38:08,15,25,...,9,9,7,1,0,3,5,38,29.7,26.0
6,7,7,2023-11-08,22-095,MIN,Yes,NOP,33:04,10,22,...,3,3,8,3,2,3,1,26,21.7,28.0
7,8,8,2023-11-10,22-097,MIN,No,SAS,37:08,9,21,...,5,7,5,2,1,4,1,28,20.4,33.0
8,9,9,2023-11-12,22-099,MIN,No,GSW,37:47,11,27,...,5,6,7,2,0,4,0,33,23.6,20.0
9,10,10,2023-11-14,22-101,MIN,No,GSW,36:18,7,16,...,4,4,4,3,0,5,1,20,12.4,13.0


In [4]:
print(df.dtypes)

Rk         object
G          object
Date       object
Age        object
Tm         object
Home       object
Opp        object
MP         object
FG         object
FGA        object
FG%        object
3P         object
3PA        object
3P%        object
FT         object
FTA        object
FT%        object
ORB        object
DRB        object
TRB        object
AST        object
STL        object
BLK        object
TOV        object
PF         object
PTS        object
GmSc       object
NextPTS    object
dtype: object


In [5]:
# Convert multiple columns to numeric
columns_to_convert = ['G', 'FG', 'FGA', 'PTS', 'NextPTS']

for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0)

In [6]:
playerstats = df[['G', 'FG', 'FGA', 'PTS', 'NextPTS']]
print(playerstats.dtypes)

G            int64
FG           int64
FGA          int64
PTS          int64
NextPTS    float64
dtype: object


In [7]:
playerstats

Unnamed: 0,G,FG,FGA,PTS,NextPTS
0,1,8,27,26,19.0
1,2,7,15,19,31.0
2,3,13,21,31,24.0
3,4,8,16,24,31.0
4,5,13,19,31,38.0
5,6,15,25,38,26.0
6,7,10,22,26,28.0
7,8,9,21,28,33.0
8,9,11,27,33,20.0
9,10,7,16,20,13.0


## Data Seperation/Splitting

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = df[['G', 'FG', 'FGA', 'PTS']]
y = df['NextPTS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# create and train the model
lr = LinearRegression()
lr.fit(X_train, y_train)

Apply model to make prediction

In [9]:
y_lr_train_pred = lr.predict(X_train)
y_lr_test_pred = lr.predict(X_test)

In [10]:
from sklearn.metrics import mean_squared_error, r2_score

lr_train_mse = mean_squared_error(y_train, y_lr_train_pred)
lr_train_r2 = r2_score(y_train, y_lr_train_pred)

lr_test_mse = mean_squared_error(y_test, y_lr_test_pred)
lr_test_r2 = r2_score(y_test, y_lr_test_pred)

In [11]:
lr_results = pd.DataFrame(["Linear Regression", lr_train_mse, lr_train_r2, lr_test_mse, lr_test_r2]).transpose()
lr_results.columns = ["Method", "Training MSE", "Training R2", "Test MSE", "Test R2"]
lr_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Linear Regression,75.454156,0.163363,155.968349,-0.505012


## Actually Predicting


In [18]:
X_new = playerstats.drop(columns=["NextPTS"])
X_new = X_new.loc[[5]]   
X_new

Unnamed: 0,G,FG,FGA,PTS
5,6,15,27,36


In [19]:
predicted_points = lr.predict(X_new)
print("Predicted Points for Next Game:", predicted_points)

Predicted Points for Next Game: [24.64157854]
