# Basketball player statistics analysis (and prediction)

In this notebook, I explore my `stats-RudyGobert.xlsx` file, which contains Rudy Gobert's main game statistics over his 10 seasons in the NBA. The final goal is to gain insight into his progression and to build a model capable of predicting his future performances (points, rebounds, turnovers, etc.).

In [71]:
import pandas as pd

## **Data import** - Basketball Reference

In [72]:
# Read every sheet of the workbook in one call: with sheet_name=None,
# read_excel returns a dict that maps each sheet name to its own DataFrame.
season2425 = pd.read_excel('stats-RudyGobert.xlsx', sheet_name=None)

# Stack the per-sheet frames into a single table (the sheet names become the
# outer index level) and give the two header-less columns readable names.
# NOTE(review): in the displayed output, MP shows values like "1 day, 11:20:00"
# and GmSc shows dates -- this looks like Excel cell formatting leaking into the
# parsed values; confirm against the workbook before using these columns.
stats_dataframe = pd.concat(season2425).rename(
    columns={"Unnamed: 5" : "LOC", "Unnamed: 7" : "RES"}
)

In [73]:
# Sanity check of the combined table: (rows, columns), then the column labels.
print(stats_dataframe.shape, stats_dataframe.columns, sep='\n')

(924, 30)
Index(['Rk', 'G', 'Date', 'Age', 'Tm', 'LOC', 'Opp', 'RES', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-'],
      dtype='object')


In [74]:
# Display the combined table as loaded (before any cleaning); pandas shows
# only the first and last rows of a long frame.
stats_dataframe

Unnamed: 0,Unnamed: 1,Rk,G,Date,Age,Tm,LOC,Opp,RES,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
2024-2025,0,1,1.0,2024-10-22,32-118,MIN,@,LAL,L (-7),1,"1 day, 11:20:00",5.0,8.0,0.625,0.0,0.0,,3.0,4.0,0.75,3.0,11.0,14.0,2.0,0.0,1.0,1.0,4.0,13.0,2025-09-13 00:00:00,-10.0
2024-2025,1,2,2.0,2024-10-24,32-120,MIN,@,SAC,W (+2),1,"1 day, 10:38:00",1.0,2.0,0.500,0.0,0.0,,1.0,2.0,0.50,1.0,10.0,11.0,1.0,2.0,0.0,3.0,4.0,3.0,2025-04-03 00:00:00,-6.0
2024-2025,2,3,3.0,2024-10-26,32-122,MIN,,TOR,W (+11),1,"1 day, 14:59:00",6.0,8.0,0.750,0.0,0.0,,3.0,4.0,0.75,2.0,10.0,12.0,1.0,1.0,4.0,1.0,3.0,15.0,2025-01-18 00:00:00,20.0
2024-2025,3,4,4.0,2024-10-29,32-125,MIN,,DAL,L (-6),1,"1 day, 7:11:00",2.0,5.0,0.400,0.0,0.0,,3.0,5.0,0.60,4.0,4.0,8.0,1.0,0.0,0.0,2.0,3.0,7.0,5.0,-8.0
2024-2025,4,5,5.0,2024-11-01,32-128,MIN,,DEN,W (+3),1,"1 day, 11:58:00",7.0,10.0,0.700,0.0,0.0,,3.0,4.0,0.75,4.0,10.0,14.0,3.0,1.0,2.0,2.0,3.0,17.0,2025-05-19 00:00:00,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-2014,77,78,,2014-04-08,21-286,UTA,,DAL,L (-12),Did Not Play,,,,,,,,,,,,,,,,,,,,,
2013-2014,78,79,42.0,2014-04-11,21-289,UTA,,POR,L (-12),0,10:33:00,3.0,7.0,0.429,0.0,0.0,,1.0,2.0,0.50,2.0,4.0,6.0,0.0,0.0,1.0,2.0,1.0,7.0,2025-08-03 00:00:00,-12.0
2013-2014,79,80,43.0,2014-04-12,21-290,UTA,@,DEN,L (-7),0,03:57:00,0.0,1.0,0.000,0.0,0.0,,0.0,0.0,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.4,-3.0
2013-2014,80,81,44.0,2014-04-14,21-292,UTA,,LAL,L (-15),0,02:25:00,0.0,1.0,0.000,0.0,0.0,,0.0,0.0,,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.1,5.0


## **Data preprocessing**

### Missing values

In [75]:
# Count the missing entries of each column and list the columns with the
# most gaps first.
missing_mask = stats_dataframe.isna()
nan_counts = missing_mask.sum().sort_values(ascending=False)

print(nan_counts)

3P%     908
LOC     460
FT%     210
FG%     142
+/-     126
FTA     125
STL     125
TRB     125
DRB     125
ORB     125
BLK     125
TOV     125
FT      125
G       125
3PA     125
3P      125
PF      125
FGA     125
FG      125
MP      125
PTS     125
GmSc    125
AST     125
Tm        0
Date      0
Opp       0
Age       0
GS        0
RES       0
Rk        0
dtype: int64


In [76]:
# Make the LOC column explicit in a single pass:
#   "@"  -> "away"
#   NaN  -> "home"
stats_dataframe['LOC'] = (
    stats_dataframe['LOC']
    .replace('@', 'away')
    .fillna('home')
)

In [77]:
# Handle the games he did not play: their game-stat cells are empty, so they
# are marked with a -1 sentinel.

# GS values that flag a game in which he did not play
conditions = ['Inactive', 'Did Not Play', 'Did Not Dress', 'Not With Team']

# Select those rows once, then replace every NaN they still contain with -1
not_played = stats_dataframe['GS'].isin(conditions)
stats_dataframe.loc[not_played] = stats_dataframe.loc[not_played].fillna(-1)

  stats_dataframe.loc[stats_dataframe['GS'].isin(conditions)] = stats_dataframe.loc[stats_dataframe['GS'].isin(conditions)].fillna(-1)


In [78]:
# Re-count the missing values per column (largest first) to see what is
# still left to handle.
remaining_nan_counts = stats_dataframe.isna().sum().sort_values(ascending=False)
print(remaining_nan_counts)

3P%     783
FT%      85
FG%      17
+/-       1
Tm        0
LOC       0
GmSc      0
PTS       0
PF        0
TOV       0
BLK       0
STL       0
AST       0
TRB       0
DRB       0
ORB       0
Date      0
FTA       0
FT        0
G         0
3PA       0
3P        0
Age       0
FGA       0
FG        0
MP        0
GS        0
RES       0
Opp       0
Rk        0
dtype: int64


In [81]:
# Treat the missing values in the percentage columns. In these cases, the player
# did play, but didn't attempt any 3-point shots, free throws or field goals, so
# the percentage is undefined in the source; it is stored as 0 here.
percentage_columns = ['3P%', 'FT%', 'FG%']
stats_dataframe[percentage_columns] = stats_dataframe[percentage_columns].fillna(0)

# Treat the single missing value left in the +/- column, which was maybe a
# mistake or an error in the source data.
# The fill value must be the number 4, not the string '4': a string would turn
# this numeric column into a mixed-type column and be exported as text.
# NOTE(review): 4 is the value this notebook hard-coded; confirm it against the
# actual box score of that game.
stats_dataframe['+/-'] = stats_dataframe['+/-'].fillna(4)


In [87]:
# Final check: per-column count of missing values, largest first.
final_nan_counts = stats_dataframe.isna().sum().sort_values(ascending=False)
print(final_nan_counts)

Rk      0
G       0
GmSc    0
PTS     0
PF      0
TOV     0
BLK     0
STL     0
AST     0
TRB     0
DRB     0
ORB     0
FT%     0
FTA     0
FT      0
3P%     0
3PA     0
3P      0
FG%     0
FGA     0
FG      0
MP      0
GS      0
RES     0
Opp     0
LOC     0
Tm      0
Age     0
Date    0
+/-     0
dtype: int64


Perfect! Now we don't have any missing values anymore. We can move on to the exploration phase.

### **Data exploration**

In [117]:
# Save the cleaned table to a new workbook; index=False leaves the row index
# out of the written file.
stats_dataframe.to_excel(excel_writer='stats-RudyGobert-cleaned.xlsx', index=False)

In [118]:
# Display the table after cleaning, to inspect the filled-in values.
stats_dataframe

Unnamed: 0,Unnamed: 1,Rk,G,Date,Age,Tm,LOC,Opp,RES,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
2024-2025,0,1,1.0,2024-10-22,32-118,MIN,away,LAL,L (-7),1,"1 day, 11:20:00",5.0,8.0,0.625,0.0,0.0,0.0,3.0,4.0,0.75,3.0,11.0,14.0,2.0,0.0,1.0,1.0,4.0,13.0,2025-09-13 00:00:00,-10.0
2024-2025,1,2,2.0,2024-10-24,32-120,MIN,away,SAC,W (+2),1,"1 day, 10:38:00",1.0,2.0,0.500,0.0,0.0,0.0,1.0,2.0,0.50,1.0,10.0,11.0,1.0,2.0,0.0,3.0,4.0,3.0,2025-04-03 00:00:00,-6.0
2024-2025,2,3,3.0,2024-10-26,32-122,MIN,home,TOR,W (+11),1,"1 day, 14:59:00",6.0,8.0,0.750,0.0,0.0,0.0,3.0,4.0,0.75,2.0,10.0,12.0,1.0,1.0,4.0,1.0,3.0,15.0,2025-01-18 00:00:00,20.0
2024-2025,3,4,4.0,2024-10-29,32-125,MIN,home,DAL,L (-6),1,"1 day, 7:11:00",2.0,5.0,0.400,0.0,0.0,0.0,3.0,5.0,0.60,4.0,4.0,8.0,1.0,0.0,0.0,2.0,3.0,7.0,5.0,-8.0
2024-2025,4,5,5.0,2024-11-01,32-128,MIN,home,DEN,W (+3),1,"1 day, 11:58:00",7.0,10.0,0.700,0.0,0.0,0.0,3.0,4.0,0.75,4.0,10.0,14.0,3.0,1.0,2.0,2.0,3.0,17.0,2025-05-19 00:00:00,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-2014,77,78,-1.0,2014-04-08,21-286,UTA,home,DAL,L (-12),Did Not Play,-1,-1.0,-1.0,-1.000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.00,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0
2013-2014,78,79,42.0,2014-04-11,21-289,UTA,home,POR,L (-12),0,10:33:00,3.0,7.0,0.429,0.0,0.0,0.0,1.0,2.0,0.50,2.0,4.0,6.0,0.0,0.0,1.0,2.0,1.0,7.0,2025-08-03 00:00:00,-12.0
2013-2014,79,80,43.0,2014-04-12,21-290,UTA,away,DEN,L (-7),0,03:57:00,0.0,1.0,0.000,0.0,0.0,0.0,0.0,0.0,0.00,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.4,-3.0
2013-2014,80,81,44.0,2014-04-14,21-292,UTA,home,LAL,L (-15),0,02:25:00,0.0,1.0,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.1,5.0
