# NBA 2024-25: Utilizing Roles
## Notebook 03: Data Acquisition, Pt. 3
This notebook loads per-game and advanced player statistics from Basketball-Reference for the 2024-25 NBA regular season and the previous five seasons (2019-20 through 2023-24), and merges the two tables within each season. The five previous seasons are then combined into a single DataFrame, while the 2024-25 season is kept as its own DataFrame.

In [2]:
# Import libraries
import pandas as pd

___
## Load

In [17]:
# Read the per-game stats exports, one DataFrame per season: 2024-25 first,
# followed by the five previous seasons (2023-24 back to 2019-20).
per_game_paths = [
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_per_game_stats_2024_25.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_per_game_stats_2023_24.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_per_game_stats_2022_23.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_per_game_stats_2021_22.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_per_game_stats_2020_21.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_per_game_stats_2019_20.txt",
]

# The files are read in list order, so the names below line up with the paths above.
(
    per_game_2024_25,
    per_game_2023_24,
    per_game_2022_23,
    per_game_2021_22,
    per_game_2020_21,
    per_game_2019_20,
) = [pd.read_csv(csv_path) for csv_path in per_game_paths]

In [19]:
# Read the advanced stats exports, one DataFrame per season: 2024-25 first,
# followed by the five previous seasons (2023-24 back to 2019-20).
advanced_paths = [
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_advanced_stats_2024_25.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_advanced_stats_2023_24.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_advanced_stats_2022_23.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_advanced_stats_2021_22.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_advanced_stats_2020_21.txt",
    r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\01_raw_data\NBA_advanced_stats_2019_20.txt",
]

# The files are read in list order, so the names below line up with the paths above.
(
    advanced_2024_25,
    advanced_2023_24,
    advanced_2022_23,
    advanced_2021_22,
    advanced_2020_21,
    advanced_2019_20,
) = [pd.read_csv(csv_path) for csv_path in advanced_paths]

___
## Merge & Concatenate

In [None]:
# --- Only keep necessary columns from advanced stats DataFrame ---
advanced_2024_25_keep_cols = ["Player", "USG%"]
advanced_2023_24_keep_cols = ["Player", "USG%"]
advanced_2022_23_keep_cols = ["Player", "USG%"]
advanced_2021_22_keep_cols = ["Player", "USG%"]
advanced_2020_21_keep_cols = ["Player", "USG%"]
advanced_2019_20_keep_cols = ["Player", "USG%"]

advanced_2024_25 = advanced_2024_25[advanced_2024_25_keep_cols].copy()
advanced_2023_24 = advanced_2023_24[advanced_2023_24_keep_cols].copy()
advanced_2022_23 = advanced_2022_23[advanced_2022_23_keep_cols].copy()
advanced_2021_22 = advanced_2021_22[advanced_2021_22_keep_cols].copy()
advanced_2020_21 = advanced_2020_21[advanced_2020_21_keep_cols].copy()
advanced_2019_20 = advanced_2019_20[advanced_2019_20_keep_cols].copy()

In [None]:
# Confirm columns for per game stats
per_game_2024_25.columns

In [None]:
# Confirm columns for advanced stats
advanced_2024_25.columns

In [None]:
# Merge each season individually
merged_2024_25 = per_game_2024_25.merge(advanced_2024_25, on="Player", how="inner")
merged_2023_24 = per_game_2023_24.merge(advanced_2023_24, on="Player", how="inner")
merged_2022_23 = per_game_2022_23.merge(advanced_2022_23, on="Player", how="inner")
merged_2021_22 = per_game_2021_22.merge(advanced_2021_22, on="Player", how="inner")
merged_2020_21 = per_game_2020_21.merge(advanced_2020_21, on="Player", how="inner")
merged_2019_20 = per_game_2019_20.merge(advanced_2019_20, on="Player", how="inner")

In [None]:
# Tag every merged DataFrame with the label of the season it covers
season_frames = (
    ("2024-25", merged_2024_25),
    ("2023-24", merged_2023_24),
    ("2022-23", merged_2022_23),
    ("2021-22", merged_2021_22),
    ("2020-21", merged_2020_21),
    ("2019-20", merged_2019_20),
)
for season_label, season_frame in season_frames:
    # Assigning through the loop variable adds the column to the same
    # DataFrame object the merged_* name refers to.
    season_frame["Season"] = season_label

In [None]:
# Inspect the columns of the merged 2024-25 DataFrame (per-game columns plus
# the columns brought in from the advanced stats and the added "Season")
merged_2024_25.columns

In [None]:
# --- Trim the 2024-25 DataFrame down to the columns that are needed ---
merged_2024_25_keep_cols = [
    "Player",
    "Season",
    "Age",
    "Team",
    "G",
    "Pos",
    "PTS",
    "TRB",
    "AST",
    "USG%",
]

# Select by label in the order listed above; .copy() makes the result an
# independent DataFrame rather than a slice of the wider one.
merged_2024_25 = merged_2024_25.loc[:, merged_2024_25_keep_cols].copy()

In [None]:
# Confirm structure of 2024-25 merged by previewing its first 5 rows
merged_2024_25.head(5)

In [None]:
# Stack the five previous seasons (2023-24 back to 2019-20) into one DataFrame
prior_season_frames = [
    merged_2023_24,
    merged_2022_23,
    merged_2021_22,
    merged_2020_21,
    merged_2019_20,
]

# ignore_index=True discards each season's own row labels and renumbers the
# combined rows from 0.
per_game_prev_5_seasons = pd.concat(prior_season_frames, ignore_index=True)

In [None]:
# Remove the columns that are not needed in the combined DataFrame
nonessential_cols = ["Rk", "Player-additional"]
per_game_prev_5_seasons = per_game_prev_5_seasons.drop(nonessential_cols, axis="columns")

In [None]:
# Confirm structure of the combined previous-seasons DataFrame by previewing
# its first 5 rows
per_game_prev_5_seasons.head(5)

In [None]:
# Confirm the columns that remain in the combined previous-seasons DataFrame
# after dropping "Rk" and "Player-additional"
per_game_prev_5_seasons.columns

___
## Save

In [None]:
# Write both result DataFrames to parquet files. The file names carry no
# directory, so they are resolved against the current working directory.
parquet_outputs = (
    (per_game_prev_5_seasons, "NBA_Per_Game_2019_2024.parquet"),
    (merged_2024_25, "NBA_Per_Game_2024_25.parquet"),
)
for result_frame, parquet_name in parquet_outputs:
    result_frame.to_parquet(parquet_name)