# Basketball player statistics analysis (and prediction)

In this notebook, I explore my stats-RudyGobert.xlsx file. This file contains the main statistics of Rudy Gobert during his 10 seasons in the NBA. The final goal is to gain insight into his progression and to build a model capable of predicting his future performances (points, rebounds, turnovers, etc.).

## **Data exportation** - Basketball Reference

In [198]:
import os
import pandas as pd

# Define the path to the data folder
data_folder = 'data'

# Name of the combined CSV that the last cell of this notebook saves into the
# same folder; it must not be read back in as if it were raw input.
combined_output_file = 'Rudy_Gobert_entire_stats.csv'

# List the raw CSV files in the data folder. os.listdir() returns entries in
# arbitrary order, so sort them to get the same row order on every run.
csv_files = sorted(
    f for f in os.listdir(data_folder)
    if f.endswith('.csv') and f != combined_output_file
)

# Read and concatenate all CSV files
dataframes = [pd.read_csv(os.path.join(data_folder, file)) for file in csv_files]
stats_dataframe = pd.concat(dataframes, ignore_index=True)

stats_dataframe

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1.0,2020-12-23,28-180,UTA,@,POR,W (+20),1,28:08:00,8.0,10.0,0.800,0.0,0.0,,4.0,6.0,0.667,4.0,13.0,17.0,2.0,1.0,0.0,2.0,2.0,20.0,21.7,15.0
1,2,2.0,2020-12-26,28-183,UTA,,MIN,L (-5),1,29:03:00,7.0,10.0,0.700,0.0,0.0,,4.0,6.0,0.667,7.0,9.0,16.0,2.0,0.0,1.0,4.0,4.0,18.0,17.1,7.0
2,3,3.0,2020-12-28,28-185,UTA,@,OKC,W (+1),1,36:49:00,6.0,8.0,0.750,0.0,0.0,,0.0,0.0,,0.0,10.0,10.0,4.0,1.0,4.0,1.0,1.0,12.0,17.0,5.0
3,4,4.0,2020-12-31,28-188,UTA,,PHO,L (-11),1,34:11:00,7.0,9.0,0.778,0.0,0.0,,4.0,8.0,0.500,2.0,12.0,14.0,0.0,1.0,1.0,1.0,3.0,18.0,17.4,-9.0
4,5,5.0,2021-01-01,28-189,UTA,,LAC,W (+6),1,24:28:00,5.0,10.0,0.500,0.0,0.0,,2.0,4.0,0.500,2.0,7.0,9.0,1.0,1.0,1.0,0.0,4.0,12.0,10.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,38,38.0,2025-01-11,32-199,MIN,,MEM,L (-2),1,28:40:00,6.0,6.0,1.000,0.0,0.0,,0.0,2.0,0.000,2.0,2.0,4.0,4.0,0.0,1.0,0.0,3.0,12.0,13.7,3.0
920,39,39.0,2025-01-13,32-201,MIN,@,WAS,W (+14),1,32:34:00,5.0,6.0,0.833,0.0,0.0,,1.0,2.0,0.500,5.0,6.0,11.0,2.0,1.0,1.0,0.0,1.0,11.0,16.4,11.0
921,40,40.0,2025-01-15,32-203,MIN,,GSW,L (-1),1,29:29:00,2.0,2.0,1.000,0.0,0.0,,3.0,5.0,0.600,2.0,8.0,10.0,1.0,0.0,2.0,1.0,2.0,7.0,9.7,-4.0
922,41,41.0,2025-01-17,32-205,MIN,@,NYK,W (+17),1,34:26:00,5.0,6.0,0.833,0.0,0.0,,1.0,2.0,0.500,1.0,5.0,6.0,1.0,0.0,2.0,1.0,2.0,11.0,10.9,9.0


In [199]:
# Report the size of the frame before touching the column labels
print(stats_dataframe.shape)

# Give the two 'Unnamed: N' columns readable names
unnamed_to_label = {'Unnamed: 5': 'LOC', 'Unnamed: 7': 'RES'}
stats_dataframe.rename(columns=unnamed_to_label, inplace=True)

print(stats_dataframe.columns)

(924, 30)
Index(['Rk', 'G', 'Date', 'Age', 'Tm', 'LOC', 'Opp', 'RES', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-'],
      dtype='object')


## **Data preprocessing**

### Missing values

In [200]:
# Count the missing values per column, largest count first
missing_per_column = stats_dataframe.isna().sum()
nan_counts = missing_per_column.sort_values(ascending=False)

print(nan_counts)

3P%     908
LOC     460
FT%     210
FG%     142
+/-     126
FTA     125
STL     125
TRB     125
DRB     125
ORB     125
BLK     125
TOV     125
FT      125
G       125
3PA     125
3P      125
PF      125
FGA     125
FG      125
MP      125
PTS     125
GmSc    125
AST     125
Tm        0
Date      0
Opp       0
Age       0
GS        0
RES       0
Rk        0
dtype: int64


In [201]:
# Treat the missing values in the LOC column

# "@" becomes "away"; any missing value becomes "home"
loc_column = stats_dataframe['LOC']
stats_dataframe['LOC'] = loc_column.replace('@', 'away').fillna('home')

In [202]:
# Treat the missing values in all the game stats columns for when he didn't play

# GS values used to select the rows to fill
conditions = ['Inactive', 'Did Not Play', 'Did Not Dress', 'Not With Team']

# Select the rows whose GS matches one of the conditions, then replace every NaN in them with -1
not_played = stats_dataframe['GS'].isin(conditions)
filled_rows = stats_dataframe.loc[not_played].infer_objects(copy=False).fillna(-1)
stats_dataframe.loc[not_played] = filled_rows

In [203]:
# Treat the missing values in the percentage columns (3P%, FT%, FG%). In these cases, the player
# did play, but didn't attempt any 3-point shots, free throws or field goals.
for pct_column in ['3P%', 'FT%', 'FG%']:
    stats_dataframe[pct_column] = stats_dataframe[pct_column].fillna(0)

# Treat the missing value in the +/- column, which was maybe a mistake or an error.
# Fill with the number 4 rather than the string '4', so no text ends up in a numeric column.
# NOTE(review): the value 4 is hard-coded; its source is not shown here - confirm it.
stats_dataframe['+/-'] = stats_dataframe['+/-'].fillna(4)


In [204]:
# Re-count the missing values per column and print them, largest count first
remaining_nans = stats_dataframe.isna().sum()
print(remaining_nans.sort_values(ascending=False))

Rk      0
G       0
GmSc    0
PTS     0
PF      0
TOV     0
BLK     0
STL     0
AST     0
TRB     0
DRB     0
ORB     0
FT%     0
FTA     0
FT      0
3P%     0
3PA     0
3P      0
FG%     0
FGA     0
FG      0
MP      0
GS      0
RES     0
Opp     0
LOC     0
Tm      0
Age     0
Date    0
+/-     0
dtype: int64


### Other preprocessing

In [205]:
# Age column (from 31-128 to 31)
# Keep only the part before the first hyphen and store it as an int
stats_dataframe['Age'] = stats_dataframe['Age'].map(lambda age: int(age.partition('-')[0]))
stats_dataframe.Age


0      28
1      28
2      28
3      28
4      28
       ..
919    32
920    32
921    32
922    32
923    32
Name: Age, Length: 924, dtype: int64

In [206]:
# RES column (from W (+2) to 2)
import re

# Extract the (optionally signed) number in parentheses from the RES column
margin_pattern = r'\(([-+]?\d+)\)'
stats_dataframe['RES'] = stats_dataframe['RES'].str.extract(margin_pattern).astype(int)
stats_dataframe.RES

0      20
1      -5
2       1
3     -11
4       6
       ..
919    -2
920    14
921    -1
922    17
923    -7
Name: RES, Length: 924, dtype: int64

In [207]:
# Display the frame as it stands after the Age and RES conversions
stats_dataframe

Unnamed: 0,Rk,G,Date,Age,Tm,LOC,Opp,RES,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1.0,2020-12-23,28,UTA,away,POR,20,1,28:08:00,8.0,10.0,0.800,0.0,0.0,0.0,4.0,6.0,0.667,4.0,13.0,17.0,2.0,1.0,0.0,2.0,2.0,20.0,21.7,15.0
1,2,2.0,2020-12-26,28,UTA,home,MIN,-5,1,29:03:00,7.0,10.0,0.700,0.0,0.0,0.0,4.0,6.0,0.667,7.0,9.0,16.0,2.0,0.0,1.0,4.0,4.0,18.0,17.1,7.0
2,3,3.0,2020-12-28,28,UTA,away,OKC,1,1,36:49:00,6.0,8.0,0.750,0.0,0.0,0.0,0.0,0.0,0.000,0.0,10.0,10.0,4.0,1.0,4.0,1.0,1.0,12.0,17.0,5.0
3,4,4.0,2020-12-31,28,UTA,home,PHO,-11,1,34:11:00,7.0,9.0,0.778,0.0,0.0,0.0,4.0,8.0,0.500,2.0,12.0,14.0,0.0,1.0,1.0,1.0,3.0,18.0,17.4,-9.0
4,5,5.0,2021-01-01,28,UTA,home,LAC,6,1,24:28:00,5.0,10.0,0.500,0.0,0.0,0.0,2.0,4.0,0.500,2.0,7.0,9.0,1.0,1.0,1.0,0.0,4.0,12.0,10.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,38,38.0,2025-01-11,32,MIN,home,MEM,-2,1,28:40:00,6.0,6.0,1.000,0.0,0.0,0.0,0.0,2.0,0.000,2.0,2.0,4.0,4.0,0.0,1.0,0.0,3.0,12.0,13.7,3.0
920,39,39.0,2025-01-13,32,MIN,away,WAS,14,1,32:34:00,5.0,6.0,0.833,0.0,0.0,0.0,1.0,2.0,0.500,5.0,6.0,11.0,2.0,1.0,1.0,0.0,1.0,11.0,16.4,11.0
921,40,40.0,2025-01-15,32,MIN,home,GSW,-1,1,29:29:00,2.0,2.0,1.000,0.0,0.0,0.0,3.0,5.0,0.600,2.0,8.0,10.0,1.0,0.0,2.0,1.0,2.0,7.0,9.7,-4.0
922,41,41.0,2025-01-17,32,MIN,away,NYK,17,1,34:26:00,5.0,6.0,0.833,0.0,0.0,0.0,1.0,2.0,0.500,1.0,5.0,6.0,1.0,0.0,2.0,1.0,2.0,11.0,10.9,9.0


In [213]:
def convert_to_minutes_seconds(mp):
    """Convert a colon-separated time string into a single integer.

    The first field is multiplied by 60 and the second field is added to it;
    any further fields are ignored. For example '28:08:00' gives 1688.
    NOTE(review): presumably the first two fields are minutes and seconds, so
    the result is total seconds played - confirm against the data export.

    Args:
        mp: The value to convert. Only strings are converted.

    Returns:
        The computed integer for a string input; any non-string input
        is returned unchanged.
    """
    # Non-string entries are passed through untouched
    if not isinstance(mp, str):
        return mp

    fields = mp.split(':')
    first_field = int(fields[0])
    second_field = int(fields[1])
    return first_field * 60 + second_field

# Run every MP entry through convert_to_minutes_seconds, then show the column
minutes_played = stats_dataframe['MP']
stats_dataframe['MP'] = minutes_played.map(convert_to_minutes_seconds)
stats_dataframe['MP']

0      1688.0
1      1743.0
2      2209.0
3      2051.0
4      1468.0
        ...  
919    1720.0
920    1954.0
921    1769.0
922    2066.0
923    1878.0
Name: MP, Length: 924, dtype: float64

In [219]:
# Show the first 10 rows of the frame
stats_dataframe.head(10)

Unnamed: 0,Rk,G,Date,Age,Tm,LOC,Opp,RES,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1.0,2020-12-23,28,UTA,away,POR,20,1,1688.0,8.0,10.0,0.8,0.0,0.0,0.0,4.0,6.0,0.667,4.0,13.0,17.0,2.0,1.0,0.0,2.0,2.0,20.0,21.7,15.0
1,2,2.0,2020-12-26,28,UTA,home,MIN,-5,1,1743.0,7.0,10.0,0.7,0.0,0.0,0.0,4.0,6.0,0.667,7.0,9.0,16.0,2.0,0.0,1.0,4.0,4.0,18.0,17.1,7.0
2,3,3.0,2020-12-28,28,UTA,away,OKC,1,1,2209.0,6.0,8.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,10.0,4.0,1.0,4.0,1.0,1.0,12.0,17.0,5.0
3,4,4.0,2020-12-31,28,UTA,home,PHO,-11,1,2051.0,7.0,9.0,0.778,0.0,0.0,0.0,4.0,8.0,0.5,2.0,12.0,14.0,0.0,1.0,1.0,1.0,3.0,18.0,17.4,-9.0
4,5,5.0,2021-01-01,28,UTA,home,LAC,6,1,1468.0,5.0,10.0,0.5,0.0,0.0,0.0,2.0,4.0,0.5,2.0,7.0,9.0,1.0,1.0,1.0,0.0,4.0,12.0,10.5,0.0
5,6,6.0,2021-01-03,28,UTA,away,SAS,21,1,1647.0,2.0,7.0,0.286,0.0,0.0,0.0,3.0,6.0,0.5,2.0,14.0,16.0,1.0,0.0,6.0,4.0,1.0,7.0,7.8,18.0
6,7,7.0,2021-01-05,28,UTA,away,BRK,-34,1,1630.0,3.0,10.0,0.3,0.0,0.0,0.0,4.0,5.0,0.8,6.0,5.0,11.0,3.0,0.0,2.0,0.0,1.0,10.0,12.6,-15.0
7,8,8.0,2021-01-06,28,UTA,away,NYK,-12,1,2100.0,7.0,9.0,0.778,0.0,0.0,0.0,0.0,3.0,0.0,2.0,9.0,11.0,2.0,0.0,5.0,0.0,1.0,14.0,17.9,-18.0
8,9,9.0,2021-01-08,28,UTA,away,MIL,13,1,1986.0,4.0,9.0,0.444,0.0,0.0,0.0,1.0,4.0,0.25,4.0,10.0,14.0,0.0,1.0,4.0,2.0,1.0,9.0,10.3,13.0
9,10,10.0,2021-01-10,28,UTA,away,DET,10,1,2014.0,1.0,5.0,0.2,0.0,0.0,0.0,2.0,6.0,0.333,6.0,13.0,19.0,1.0,0.0,4.0,4.0,2.0,4.0,6.1,7.0


Perfect! Now we don't have any missing values anymore. We can move on to the exploration phase.

### **Data exploration**

In [None]:
# Save the frame as a single CSV (the row index is not written).
# NOTE(review): this file lands in 'data/', the same folder the loading cell scans for
# '.csv' files - confirm a re-run does not read this combined file back in as raw input.
stats_dataframe.to_csv('data/Rudy_Gobert_entire_stats.csv', index=False)