## Load/Clean Data

Sources: 

https://www.basketball-reference.com/leagues/NBA_2024_per_game.html

https://www.basketball-reference.com/players/a/antetgi01/gamelog/2024

In [451]:
import pandas as pd

url = "https://www.basketball-reference.com/players/a/antetgi01/gamelog/2024"   # Giannis 23-24 season game-by-game stats

html = pd.read_html(url, header=0)
df = html[7]                              # selects desired table on website page
df

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1,2023-10-26,28-324,MIL,,PHI,W (+1),1,35:07,...,8,12,3,2,2,7,4,23,12.0,-13
1,2,2,2023-10-29,28-327,MIL,,ATL,L (-17),1,29:28,...,6,11,3,1,2,3,2,26,25.0,-11
2,3,3,2023-10-30,28-328,MIL,,MIA,W (+8),1,31:51,...,6,7,2,0,1,2,3,33,23.3,+18
3,4,4,2023-11-01,28-330,MIL,@,TOR,L (-19),1,27:41,...,1,4,2,1,0,3,3,16,9.7,-10
4,5,5,2023-11-03,28-332,MIL,,NYK,W (+5),1,35:47,...,7,8,6,0,1,7,3,22,14.5,+13
5,6,6,2023-11-06,28-335,MIL,@,BRK,W (+4),1,35:15,...,10,12,3,1,2,3,1,36,28.2,+15
6,7,7,2023-11-08,28-337,MIL,,DET,W (+2),1,21:52,...,9,9,5,0,3,1,1,15,11.9,+5
7,8,8,2023-11-09,28-338,MIL,@,IND,L (-2),1,36:29,...,10,12,3,0,0,8,4,54,40.2,+6
8,9,9,2023-11-11,28-340,MIL,@,ORL,L (-15),1,36:39,...,8,10,7,4,2,6,4,35,29.3,-11
9,10,10,2023-11-13,28-342,MIL,,CHI,W (+9),1,34:42,...,8,11,2,0,0,3,4,35,23.7,+13


**Data cleaning**

In [452]:
import numpy as np

df = df.drop(df[df.Date == "Date"].index)                    # deletes repeating headers in content
df = df.drop(df[df.GS == "Inactive"].index)                  # deletes games where he was inactive
df = df.drop(columns=["Unnamed: 7", "GS", "+/-"])            # deletes unnamed header and "games started" header
df.rename(columns={'Unnamed: 5': 'Home'}, inplace=True)      # renames "Unnamed 5" to "Home"
df['Home'].replace({np.nan: 'Yes', '@': 'No'}, inplace=True) # reformats "Home" values
df['NextPTS'] = df['PTS'].shift(-1)                          # Shift 'PTS' values down by one and assign to 'NextPTS'
df = df.reset_index(drop=True)                               # resets index

df


Unnamed: 0,Rk,G,Date,Age,Tm,Home,Opp,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,NextPTS
0,1,1,2023-10-26,28-324,MIL,Yes,PHI,35:07,10,21,...,8,12,3,2,2,7,4,23,12.0,26.0
1,2,2,2023-10-29,28-327,MIL,Yes,ATL,29:28,9,14,...,6,11,3,1,2,3,2,26,25.0,33.0
2,3,3,2023-10-30,28-328,MIL,Yes,MIA,31:51,12,21,...,6,7,2,0,1,2,3,33,23.3,16.0
3,4,4,2023-11-01,28-330,MIL,No,TOR,27:41,5,11,...,1,4,2,1,0,3,3,16,9.7,22.0
4,5,5,2023-11-03,28-332,MIL,Yes,NYK,35:47,6,10,...,7,8,6,0,1,7,3,22,14.5,36.0
5,6,6,2023-11-06,28-335,MIL,No,BRK,35:15,15,27,...,10,12,3,1,2,3,1,36,28.2,15.0
6,7,7,2023-11-08,28-337,MIL,Yes,DET,21:52,6,16,...,9,9,5,0,3,1,1,15,11.9,54.0
7,8,8,2023-11-09,28-338,MIL,No,IND,36:29,19,25,...,10,12,3,0,0,8,4,54,40.2,35.0
8,9,9,2023-11-11,28-340,MIL,No,ORL,36:39,15,22,...,8,10,7,4,2,6,4,35,29.3,35.0
9,10,10,2023-11-13,28-342,MIL,Yes,CHI,34:42,13,22,...,8,11,2,0,0,3,4,35,23.7,16.0


In [453]:
print(df.dtypes)

Rk         object
G          object
Date       object
Age        object
Tm         object
Home       object
Opp        object
MP         object
FG         object
FGA        object
FG%        object
3P         object
3PA        object
3P%        object
FT         object
FTA        object
FT%        object
ORB        object
DRB        object
TRB        object
AST        object
STL        object
BLK        object
TOV        object
PF         object
PTS        object
GmSc       object
NextPTS    object
dtype: object


In [457]:
# Convert multiple columns to numeric
columns_to_convert = ['G', 'FG', 'FGA', 'PTS', 'NextPTS']

for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0)

In [458]:
playerstats = df[['G', 'FG', 'FGA', 'PTS', 'NextPTS']]
print(playerstats.dtypes)

G            int64
FG           int64
FGA          int64
PTS          int64
NextPTS    float64
dtype: object


In [459]:
playerstats

Unnamed: 0,G,FG,FGA,PTS,NextPTS
0,1,10,21,23,26.0
1,2,9,14,26,33.0
2,3,12,21,33,16.0
3,4,5,11,16,22.0
4,5,6,10,22,36.0
5,6,15,27,36,15.0
6,7,6,16,15,54.0
7,8,19,25,54,35.0
8,9,15,22,35,35.0
9,10,13,22,35,16.0


## Data Seperation/Splitting

In [462]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = df[['G', 'FG', 'FGA', 'PTS']]
y = df['NextPTS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# create and train the model
lr = LinearRegression()
lr.fit(X_train, y_train)

Apply model to make prediction

In [463]:
y_lr_train_pred = lr.predict(X_train)
y_lr_test_pred = lr.predict(X_test)


In [467]:
from sklearn.metrics import mean_squared_error, r2_score

lr_train_mse = mean_squared_error(y_train, y_lr_train_pred)
lr_train_r2 = r2_score(y_train, y_lr_train_pred)

lr_test_mse = mean_squared_error(y_test, y_lr_test_pred)
lr_test_r2 = r2_score(y_test, y_lr_test_pred)l



In [468]:
lr_results = pd.DataFrame(["Linear Regression", lr_train_mse, lr_train_r2, lr_test_mse, lr_test_r2]).transpose()
lr_results.columns = ["Method", "Training MSE", "Training R2", "Test MSE", "Test R2"]
lr_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Linear Regression,104.205269,0.117972,206.125011,-0.008399


## Actually Predicting


In [482]:
X_new = playerstats.drop(columns=["NextPTS"])
X_new = X_new.loc[[34]]   
X_new

Unnamed: 0,G,FG,FGA,PTS
34,35,16,25,48


In [483]:
predicted_points = lr.predict(X_new)
print("Predicted Points for Next Game:", predicted_points)

Predicted Points for Next Game: [29.81350236]
