In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark


In [2]:
import os

# Point this process (and anything it launches) at the Java 8 install and the
# unpacked Spark distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"
os.environ["HADOOP_HOME"] = "/content/spark-3.4.1-bin-hadoop3"  # Same directory as SPARK_HOME, for consistency with Spark
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"  # Use Python3 explicitly

# Append each bin directory to PATH only when it is not already listed.
# This keeps the cell idempotent (re-running it does not grow PATH) and avoids
# adding the same directory twice, since SPARK_HOME and HADOOP_HOME are identical.
path_entries = os.environ.get("PATH", "").split(os.pathsep)
for home_var in ("SPARK_HOME", "HADOOP_HOME"):
    bin_dir = os.environ[home_var] + "/bin"
    if bin_dir not in path_entries:
        path_entries.append(bin_dir)
os.environ["PATH"] = os.pathsep.join(path_entries)


In [3]:
# Install PySpark
!pip install -q findspark

In [4]:
# Create (or reuse) the local Spark session used by the rest of the notebook.
import os

import findspark

# findspark.init() is called before pyspark is imported.
findspark.init()
from pyspark.sql import SparkSession

# Glob handed to the driver classpath; built from HADOOP_HOME in the environment.
hadoop_conf_glob = os.environ["HADOOP_HOME"] + "/etc/hadoop/*"

spark = (
    SparkSession.builder
    .appName("NBA-Analysis2")
    .master("local[*]")  # run locally, one worker thread per available core
    .config("spark.driver.extraClassPath", hadoop_conf_glob)
    .getOrCreate()
)

# Leave the session as the cell's output value.
spark

In [5]:
# Load dataset2.csv: column names come from the header row and column types
# are inferred by Spark.
# NOTE(review): the inferred schema types PCTIMESTRING as timestamp and GAME_ID
# as int — confirm both are what the analysis needs.
df = spark.read.csv(path="dataset2.csv", header=True, inferSchema=True)

# Print the DataFrame's repr (column names and types).
print(df)

DataFrame[EVENTID: int, EVENTNUM: int, GAME_ID: int, HOMEDESCRIPTION: string, PCTIMESTRING: timestamp, PERIOD: int, PLAYER1_ID: int, PLAYER1_NAME: string, PLAYER1_TEAM_ABBREVIATION: string, PLAYER1_TEAM_CITY: string, PLAYER1_TEAM_ID: int, PLAYER1_TEAM_NICKNAME: string, PLAYER2_ID: int, PLAYER2_NAME: string, PLAYER2_TEAM_ABBREVIATION: string, PLAYER2_TEAM_CITY: string, PLAYER2_TEAM_ID: int, PLAYER2_TEAM_NICKNAME: string, PLAYER3_ID: int, PLAYER3_NAME: string, PLAYER3_TEAM_ABBREVIATION: string, PLAYER3_TEAM_CITY: string, PLAYER3_TEAM_ID: int, PLAYER3_TEAM_NICKNAME: string, SCORE: string, SCOREMARGIN: string, VISITORDESCRIPTION: string]


In [6]:
# Echo the DataFrame's repr (column names and types) as the cell output.
df

DataFrame[EVENTID: int, EVENTNUM: int, GAME_ID: int, HOMEDESCRIPTION: string, PCTIMESTRING: timestamp, PERIOD: int, PLAYER1_ID: int, PLAYER1_NAME: string, PLAYER1_TEAM_ABBREVIATION: string, PLAYER1_TEAM_CITY: string, PLAYER1_TEAM_ID: int, PLAYER1_TEAM_NICKNAME: string, PLAYER2_ID: int, PLAYER2_NAME: string, PLAYER2_TEAM_ABBREVIATION: string, PLAYER2_TEAM_CITY: string, PLAYER2_TEAM_ID: int, PLAYER2_TEAM_NICKNAME: string, PLAYER3_ID: int, PLAYER3_NAME: string, PLAYER3_TEAM_ABBREVIATION: string, PLAYER3_TEAM_CITY: string, PLAYER3_TEAM_ID: int, PLAYER3_TEAM_NICKNAME: string, SCORE: string, SCOREMARGIN: string, VISITORDESCRIPTION: string]

In [7]:
# Print the inferred schema as a tree (name, type and nullability per column).
df.printSchema()
# Return the list of column names as the cell's output value.
df.columns

root
 |-- EVENTID: integer (nullable = true)
 |-- EVENTNUM: integer (nullable = true)
 |-- GAME_ID: integer (nullable = true)
 |-- HOMEDESCRIPTION: string (nullable = true)
 |-- PCTIMESTRING: timestamp (nullable = true)
 |-- PERIOD: integer (nullable = true)
 |-- PLAYER1_ID: integer (nullable = true)
 |-- PLAYER1_NAME: string (nullable = true)
 |-- PLAYER1_TEAM_ABBREVIATION: string (nullable = true)
 |-- PLAYER1_TEAM_CITY: string (nullable = true)
 |-- PLAYER1_TEAM_ID: integer (nullable = true)
 |-- PLAYER1_TEAM_NICKNAME: string (nullable = true)
 |-- PLAYER2_ID: integer (nullable = true)
 |-- PLAYER2_NAME: string (nullable = true)
 |-- PLAYER2_TEAM_ABBREVIATION: string (nullable = true)
 |-- PLAYER2_TEAM_CITY: string (nullable = true)
 |-- PLAYER2_TEAM_ID: integer (nullable = true)
 |-- PLAYER2_TEAM_NICKNAME: string (nullable = true)
 |-- PLAYER3_ID: integer (nullable = true)
 |-- PLAYER3_NAME: string (nullable = true)
 |-- PLAYER3_TEAM_ABBREVIATION: string (nullable = true)
 |-- PLAY

['EVENTID',
 'EVENTNUM',
 'GAME_ID',
 'HOMEDESCRIPTION',
 'PCTIMESTRING',
 'PERIOD',
 'PLAYER1_ID',
 'PLAYER1_NAME',
 'PLAYER1_TEAM_ABBREVIATION',
 'PLAYER1_TEAM_CITY',
 'PLAYER1_TEAM_ID',
 'PLAYER1_TEAM_NICKNAME',
 'PLAYER2_ID',
 'PLAYER2_NAME',
 'PLAYER2_TEAM_ABBREVIATION',
 'PLAYER2_TEAM_CITY',
 'PLAYER2_TEAM_ID',
 'PLAYER2_TEAM_NICKNAME',
 'PLAYER3_ID',
 'PLAYER3_NAME',
 'PLAYER3_TEAM_ABBREVIATION',
 'PLAYER3_TEAM_CITY',
 'PLAYER3_TEAM_ID',
 'PLAYER3_TEAM_NICKNAME',
 'SCORE',
 'SCOREMARGIN',
 'VISITORDESCRIPTION']

In [8]:
# Preview the first five rows of the raw frame.
df.show(n=5)

+-------+--------+--------+--------------------+-------------------+------+----------+-------------+-------------------------+-----------------+---------------+---------------------+----------+---------------+-------------------------+-----------------+---------------+---------------------+----------+------------+-------------------------+-----------------+---------------+---------------------+------+-----------+--------------------+
|EVENTID|EVENTNUM| GAME_ID|     HOMEDESCRIPTION|       PCTIMESTRING|PERIOD|PLAYER1_ID| PLAYER1_NAME|PLAYER1_TEAM_ABBREVIATION|PLAYER1_TEAM_CITY|PLAYER1_TEAM_ID|PLAYER1_TEAM_NICKNAME|PLAYER2_ID|   PLAYER2_NAME|PLAYER2_TEAM_ABBREVIATION|PLAYER2_TEAM_CITY|PLAYER2_TEAM_ID|PLAYER2_TEAM_NICKNAME|PLAYER3_ID|PLAYER3_NAME|PLAYER3_TEAM_ABBREVIATION|PLAYER3_TEAM_CITY|PLAYER3_TEAM_ID|PLAYER3_TEAM_NICKNAME| SCORE|SCOREMARGIN|  VISITORDESCRIPTION|
+-------+--------+--------+--------------------+-------------------+------+----------+-------------+------------------------

In [9]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re

# Month abbreviation -> month number (as a string, without zero padding).
month_map = {
    'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4',
    'May': '5', 'Jun': '6', 'Jul': '7', 'Aug': '8',
    'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12'
}

def clean_score(score):
    """Normalise a raw SCORE value to the canonical ``"<a> - <b>"`` form.

    The result always uses exactly ``" - "`` as the separator, because later
    cells split the cleaned value on that exact string.

    Handled inputs, in order:

    1. ``None``, empty or whitespace-only -> ``None``.
    2. Two integers around a hyphen with any spacing (``"63-69"``,
       ``"63 - 69"``) -> the same two numbers joined by ``" - "``.
    3. A month abbreviation on one side of a single hyphen (``"Dec-19"``,
       ``"15-Apr"``) -> ``"<other side> - <month number>"``. The month number
       is always placed second, whichever side the abbreviation was on.
       NOTE(review): which of the two numbers came first in the original
       score cannot be recovered from such a value — verify this ordering.
    4. Anything else containing exactly two digit runs -> those two runs
       joined by ``" - "``.

    Args:
        score: Raw score text, or ``None``.

    Returns:
        The normalised score string, or ``None`` when nothing usable is found.
    """
    if score is None:
        return None

    score = score.strip()
    if not score:
        return None

    # Already a numeric pair: re-emit it with the canonical separator so that
    # "63-69" and "63 - 69" produce the same cleaned value.
    numeric_pair = re.match(r"^(\d+)\s*-\s*(\d+)$", score)
    if numeric_pair:
        return f"{numeric_pair.group(1)} - {numeric_pair.group(2)}"

    # Handle Month-Day format (e.g., Apr-6 → 6 - 4)
    for month_abbr, num in month_map.items():
        if month_abbr in score:
            parts = score.split('-')
            if len(parts) == 2:
                if month_abbr in parts[0]:
                    return f"{parts[1].strip()} - {num}"
                elif month_abbr in parts[1]:
                    return f"{parts[0].strip()} - {num}"

    # Generic fallback: accept the value only if it holds exactly two digit runs.
    digits = re.findall(r'\d+', score)
    if len(digits) == 2:
        return f"{digits[0]} - {digits[1]}"

    return None

# Wrap clean_score as a Spark UDF that returns a string column.
clean_score_udf = udf(f=clean_score, returnType=StringType())

# Add CLEANED_SCORE alongside the raw SCORE column, then list up to 30 distinct
# (raw, cleaned) pairs at full width to check the result by eye.
df_cleaned = df.withColumn("CLEANED_SCORE", clean_score_udf(df.SCORE))
(
    df_cleaned
    .select("SCORE", "CLEANED_SCORE")
    .distinct()
    .show(n=30, truncate=False)
)


+--------+-------------+
|SCORE   |CLEANED_SCORE|
+--------+-------------+
|63 - 69 |63 - 69      |
|83 - 106|83 - 106     |
|26 - 29 |26 - 29      |
|38 - 39 |38 - 39      |
|77 - 90 |77 - 90      |
|40 - 42 |40 - 42      |
|96 - 90 |96 - 90      |
|58 - 50 |58 - 50      |
|54 - 74 |54 - 74      |
|76 - 83 |76 - 83      |
|68 - 103|68 - 103     |
|103 - 83|103 - 83     |
|71 - 71 |71 - 71      |
|49 - 39 |49 - 39      |
|47 - 68 |47 - 68      |
|70 - 108|70 - 108     |
|24 - 44 |24 - 44      |
|25 - 31 |25 - 31      |
|74 - 112|74 - 112     |
|71 - 67 |71 - 67      |
|Dec-19  |19 - 12      |
|52 - 59 |52 - 59      |
|22 - 23 |22 - 23      |
|40 - 45 |40 - 45      |
|30 - 39 |30 - 39      |
|113 - 96|113 - 96     |
|27 - 18 |27 - 18      |
|15-Apr  |15 - 4       |
|48 - 63 |48 - 63      |
|81 - 98 |81 - 98      |
+--------+-------------+
only showing top 30 rows



In [10]:
# First five rows of the original frame; df is not reassigned by the cleaning cell.
df.show(n=5)

+-------+--------+--------+--------------------+-------------------+------+----------+-------------+-------------------------+-----------------+---------------+---------------------+----------+---------------+-------------------------+-----------------+---------------+---------------------+----------+------------+-------------------------+-----------------+---------------+---------------------+------+-----------+--------------------+
|EVENTID|EVENTNUM| GAME_ID|     HOMEDESCRIPTION|       PCTIMESTRING|PERIOD|PLAYER1_ID| PLAYER1_NAME|PLAYER1_TEAM_ABBREVIATION|PLAYER1_TEAM_CITY|PLAYER1_TEAM_ID|PLAYER1_TEAM_NICKNAME|PLAYER2_ID|   PLAYER2_NAME|PLAYER2_TEAM_ABBREVIATION|PLAYER2_TEAM_CITY|PLAYER2_TEAM_ID|PLAYER2_TEAM_NICKNAME|PLAYER3_ID|PLAYER3_NAME|PLAYER3_TEAM_ABBREVIATION|PLAYER3_TEAM_CITY|PLAYER3_TEAM_ID|PLAYER3_TEAM_NICKNAME| SCORE|SCOREMARGIN|  VISITORDESCRIPTION|
+-------+--------+--------+--------------------+-------------------+------+----------+-------------+------------------------

In [11]:
# First five rows of the frame that carries the added CLEANED_SCORE column.
df_cleaned.show(n=5)

+-------+--------+--------+--------------------+-------------------+------+----------+-------------+-------------------------+-----------------+---------------+---------------------+----------+---------------+-------------------------+-----------------+---------------+---------------------+----------+------------+-------------------------+-----------------+---------------+---------------------+------+-----------+--------------------+-------------+
|EVENTID|EVENTNUM| GAME_ID|     HOMEDESCRIPTION|       PCTIMESTRING|PERIOD|PLAYER1_ID| PLAYER1_NAME|PLAYER1_TEAM_ABBREVIATION|PLAYER1_TEAM_CITY|PLAYER1_TEAM_ID|PLAYER1_TEAM_NICKNAME|PLAYER2_ID|   PLAYER2_NAME|PLAYER2_TEAM_ABBREVIATION|PLAYER2_TEAM_CITY|PLAYER2_TEAM_ID|PLAYER2_TEAM_NICKNAME|PLAYER3_ID|PLAYER3_NAME|PLAYER3_TEAM_ABBREVIATION|PLAYER3_TEAM_CITY|PLAYER3_TEAM_ID|PLAYER3_TEAM_NICKNAME| SCORE|SCOREMARGIN|  VISITORDESCRIPTION|CLEANED_SCORE|
+-------+--------+--------+--------------------+-------------------+------+----------+----------

In [12]:
from pyspark.sql.functions import col, trim, upper, when

# Parse SCOREMARGIN (a string column) into the integer column SCOREMARGIN_INT.
# A plain cast turns every non-numeric value into null, and rows with a null
# SCOREMARGIN_INT are dropped later in the notebook.
# NOTE(review): NBA play-by-play exports usually write a level score as the
# literal "TIE" — confirm against dataset2.csv. Mapping it to 0 keeps those
# rows; if the literal never occurs, this is equivalent to the plain cast.
df_cleaned = df_cleaned.withColumn(
    "SCOREMARGIN_INT",
    when(upper(trim(col("SCOREMARGIN"))) == "TIE", 0)
    .otherwise(col("SCOREMARGIN").cast("int")),
)

In [13]:
from pyspark.sql.functions import split, col

# Split CLEANED_SCORE once on " - " and cast each half to int:
# item 0 becomes HOME_SCORE, item 1 becomes VISITOR_SCORE.
# NOTE(review): confirm that the first number really is the home score in this dataset.
score_parts = split(col("CLEANED_SCORE"), " - ")
df_cleaned = (
    df_cleaned
    .withColumn("HOME_SCORE", score_parts.getItem(0).cast("int"))
    .withColumn("VISITOR_SCORE", score_parts.getItem(1).cast("int"))
)


In [14]:
# First five rows, now including SCOREMARGIN_INT, HOME_SCORE and VISITOR_SCORE.
df_cleaned.show(n=5)

+-------+--------+--------+--------------------+-------------------+------+----------+-------------+-------------------------+-----------------+---------------+---------------------+----------+---------------+-------------------------+-----------------+---------------+---------------------+----------+------------+-------------------------+-----------------+---------------+---------------------+------+-----------+--------------------+-------------+---------------+----------+-------------+
|EVENTID|EVENTNUM| GAME_ID|     HOMEDESCRIPTION|       PCTIMESTRING|PERIOD|PLAYER1_ID| PLAYER1_NAME|PLAYER1_TEAM_ABBREVIATION|PLAYER1_TEAM_CITY|PLAYER1_TEAM_ID|PLAYER1_TEAM_NICKNAME|PLAYER2_ID|   PLAYER2_NAME|PLAYER2_TEAM_ABBREVIATION|PLAYER2_TEAM_CITY|PLAYER2_TEAM_ID|PLAYER2_TEAM_NICKNAME|PLAYER3_ID|PLAYER3_NAME|PLAYER3_TEAM_ABBREVIATION|PLAYER3_TEAM_CITY|PLAYER3_TEAM_ID|PLAYER3_TEAM_NICKNAME| SCORE|SCOREMARGIN|  VISITORDESCRIPTION|CLEANED_SCORE|SCOREMARGIN_INT|HOME_SCORE|VISITOR_SCORE|
+-------+-----

In [15]:
# spark.conf.set("spark.sql.execution.pandas.convertToPythonArrow", "false")

In [16]:
from pyspark.sql.functions import col, count, when

# Number of rows; used as the denominator for the percentages.
total_rows = df_cleaned.count()

# Build one aggregate per column. when() without otherwise() yields null unless
# the condition holds and count() skips nulls, so each count is the number of
# null cells in that column.
null_percentages = []
for column_name in df_cleaned.columns:
    null_count = count(when(col(column_name).isNull(), column_name))
    null_percentages.append((null_count / total_rows * 100).alias(column_name))
missing_data = df_cleaned.select(null_percentages)

# Print the single-row result without truncating the values.
missing_data.show(truncate=False)


+-------+--------+-------+-----------------+------------+------+----------+----------------+-------------------------+-----------------+----------------+---------------------+----------+----------------+-------------------------+-----------------+----------------+---------------------+----------+-----------------+-------------------------+-----------------+-----------------+---------------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------+
|EVENTID|EVENTNUM|GAME_ID|HOMEDESCRIPTION  |PCTIMESTRING|PERIOD|PLAYER1_ID|PLAYER1_NAME    |PLAYER1_TEAM_ABBREVIATION|PLAYER1_TEAM_CITY|PLAYER1_TEAM_ID |PLAYER1_TEAM_NICKNAME|PLAYER2_ID|PLAYER2_NAME    |PLAYER2_TEAM_ABBREVIATION|PLAYER2_TEAM_CITY|PLAYER2_TEAM_ID |PLAYER2_TEAM_NICKNAME|PLAYER3_ID|PLAYER3_NAME     |PLAYER3_TEAM_ABBREVIATION|PLAYER3_TEAM_CITY|PLAYER3_TEAM_ID  |PLAYER3_TEAM_NICKNAME|SCORE            |SCOREMARGIN      |VISITORDESCRIPTION|CLEANED_SCORE    |S

In [17]:
# Keep only rows in which every score-derived column is present.
required_columns = ["CLEANED_SCORE", "HOME_SCORE", "VISITOR_SCORE", "SCOREMARGIN_INT"]
df_cleaned_filtered = df_cleaned.na.drop(how="any", subset=required_columns)

In [18]:
from pyspark.sql.functions import col

# Cast each numeric score column to int, replacing the column in place.
for int_column in ("HOME_SCORE", "VISITOR_SCORE", "SCOREMARGIN_INT"):
    df_cleaned_filtered = df_cleaned_filtered.withColumn(
        int_column, col(int_column).cast("int")
    )


In [19]:
from pyspark.sql.functions import col, count, when

# Row count of the filtered frame; used as the denominator for the percentages.
total_rows = df_cleaned_filtered.count()

# Percentage of null cells per column, computed the same way as for the
# unfiltered frame: count() only counts rows where when() produced a value.
null_percentages = []
for column_name in df_cleaned_filtered.columns:
    null_count = count(when(col(column_name).isNull(), column_name))
    null_percentages.append((null_count / total_rows * 100).alias(column_name))
missing_data2 = df_cleaned_filtered.select(null_percentages)

# Print the single-row result without truncating the values.
missing_data2.show(truncate=False)


+-------+--------+-------+------------------+------------+------+----------+-----------------+-------------------------+-----------------+-----------------+---------------------+----------+-----------------+-------------------------+-----------------+-----------------+---------------------+----------+------------+-------------------------+-----------------+---------------+---------------------+-----+-----------+------------------+-------------+---------------+----------+-------------+
|EVENTID|EVENTNUM|GAME_ID|HOMEDESCRIPTION   |PCTIMESTRING|PERIOD|PLAYER1_ID|PLAYER1_NAME     |PLAYER1_TEAM_ABBREVIATION|PLAYER1_TEAM_CITY|PLAYER1_TEAM_ID  |PLAYER1_TEAM_NICKNAME|PLAYER2_ID|PLAYER2_NAME     |PLAYER2_TEAM_ABBREVIATION|PLAYER2_TEAM_CITY|PLAYER2_TEAM_ID  |PLAYER2_TEAM_NICKNAME|PLAYER3_ID|PLAYER3_NAME|PLAYER3_TEAM_ABBREVIATION|PLAYER3_TEAM_CITY|PLAYER3_TEAM_ID|PLAYER3_TEAM_NICKNAME|SCORE|SCOREMARGIN|VISITORDESCRIPTION|CLEANED_SCORE|SCOREMARGIN_INT|HOME_SCORE|VISITOR_SCORE|
+-------+--------+--

In [20]:
from pyspark.sql.functions import col

# Remove the raw SCORE and SCOREMARGIN columns; the derived columns stay.
raw_columns = ("SCORE", "SCOREMARGIN")
df_prepared = df_cleaned_filtered.drop(*raw_columns)

In [21]:
# First five rows after the raw score columns were removed.
df_prepared.show(n=5)

+-------+--------+--------+--------------------+-------------------+------+----------+-------------+-------------------------+-----------------+---------------+---------------------+----------+---------------+-------------------------+-----------------+---------------+---------------------+----------+------------+-------------------------+-----------------+---------------+---------------------+--------------------+-------------+---------------+----------+-------------+
|EVENTID|EVENTNUM| GAME_ID|     HOMEDESCRIPTION|       PCTIMESTRING|PERIOD|PLAYER1_ID| PLAYER1_NAME|PLAYER1_TEAM_ABBREVIATION|PLAYER1_TEAM_CITY|PLAYER1_TEAM_ID|PLAYER1_TEAM_NICKNAME|PLAYER2_ID|   PLAYER2_NAME|PLAYER2_TEAM_ABBREVIATION|PLAYER2_TEAM_CITY|PLAYER2_TEAM_ID|PLAYER2_TEAM_NICKNAME|PLAYER3_ID|PLAYER3_NAME|PLAYER3_TEAM_ABBREVIATION|PLAYER3_TEAM_CITY|PLAYER3_TEAM_ID|PLAYER3_TEAM_NICKNAME|  VISITORDESCRIPTION|CLEANED_SCORE|SCOREMARGIN_INT|HOME_SCORE|VISITOR_SCORE|
+-------+--------+--------+--------------------+----

In [22]:
# Give the processed columns the names of the raw columns they replace:
# CLEANED_SCORE -> SCORE and SCOREMARGIN_INT -> SCOREMARGIN.
df_prepared = (
    df_prepared
    .withColumnRenamed("CLEANED_SCORE", "SCORE")
    .withColumnRenamed("SCOREMARGIN_INT", "SCOREMARGIN")
)

In [23]:
# Print the schema of df_prepared as a tree (name, type and nullability per
# column) to check the result of the drop and rename steps.
df_prepared.printSchema()

root
 |-- EVENTID: integer (nullable = true)
 |-- EVENTNUM: integer (nullable = true)
 |-- GAME_ID: integer (nullable = true)
 |-- HOMEDESCRIPTION: string (nullable = true)
 |-- PCTIMESTRING: timestamp (nullable = true)
 |-- PERIOD: integer (nullable = true)
 |-- PLAYER1_ID: integer (nullable = true)
 |-- PLAYER1_NAME: string (nullable = true)
 |-- PLAYER1_TEAM_ABBREVIATION: string (nullable = true)
 |-- PLAYER1_TEAM_CITY: string (nullable = true)
 |-- PLAYER1_TEAM_ID: integer (nullable = true)
 |-- PLAYER1_TEAM_NICKNAME: string (nullable = true)
 |-- PLAYER2_ID: integer (nullable = true)
 |-- PLAYER2_NAME: string (nullable = true)
 |-- PLAYER2_TEAM_ABBREVIATION: string (nullable = true)
 |-- PLAYER2_TEAM_CITY: string (nullable = true)
 |-- PLAYER2_TEAM_ID: integer (nullable = true)
 |-- PLAYER2_TEAM_NICKNAME: string (nullable = true)
 |-- PLAYER3_ID: integer (nullable = true)
 |-- PLAYER3_NAME: string (nullable = true)
 |-- PLAYER3_TEAM_ABBREVIATION: string (nullable = true)
 |-- PLAY

In [24]:
# First five rows of the prepared frame, with the renamed SCORE / SCOREMARGIN columns.
df_prepared.show(n=5)

+-------+--------+--------+--------------------+-------------------+------+----------+-------------+-------------------------+-----------------+---------------+---------------------+----------+---------------+-------------------------+-----------------+---------------+---------------------+----------+------------+-------------------------+-----------------+---------------+---------------------+--------------------+------+-----------+----------+-------------+
|EVENTID|EVENTNUM| GAME_ID|     HOMEDESCRIPTION|       PCTIMESTRING|PERIOD|PLAYER1_ID| PLAYER1_NAME|PLAYER1_TEAM_ABBREVIATION|PLAYER1_TEAM_CITY|PLAYER1_TEAM_ID|PLAYER1_TEAM_NICKNAME|PLAYER2_ID|   PLAYER2_NAME|PLAYER2_TEAM_ABBREVIATION|PLAYER2_TEAM_CITY|PLAYER2_TEAM_ID|PLAYER2_TEAM_NICKNAME|PLAYER3_ID|PLAYER3_NAME|PLAYER3_TEAM_ABBREVIATION|PLAYER3_TEAM_CITY|PLAYER3_TEAM_ID|PLAYER3_TEAM_NICKNAME|  VISITORDESCRIPTION| SCORE|SCOREMARGIN|HOME_SCORE|VISITOR_SCORE|
+-------+--------+--------+--------------------+-------------------+------

In [25]:
# Compute summary statistics for df_prepared and print them as a table
# (one row per statistic, labelled in the "summary" column).
df_prepared.describe().show()

+-------+------------------+------------------+-------------------+--------------------+------------------+-----------------+------------+-------------------------+-----------------+--------------------+---------------------+------------------+------------+-------------------------+-----------------+--------------------+---------------------+----------+------------+-------------------------+-----------------+---------------+---------------------+--------------------+-------+------------------+------------------+------------------+
|summary|           EVENTID|          EVENTNUM|            GAME_ID|     HOMEDESCRIPTION|            PERIOD|       PLAYER1_ID|PLAYER1_NAME|PLAYER1_TEAM_ABBREVIATION|PLAYER1_TEAM_CITY|     PLAYER1_TEAM_ID|PLAYER1_TEAM_NICKNAME|        PLAYER2_ID|PLAYER2_NAME|PLAYER2_TEAM_ABBREVIATION|PLAYER2_TEAM_CITY|     PLAYER2_TEAM_ID|PLAYER2_TEAM_NICKNAME|PLAYER3_ID|PLAYER3_NAME|PLAYER3_TEAM_ABBREVIATION|PLAYER3_TEAM_CITY|PLAYER3_TEAM_ID|PLAYER3_TEAM_NICKNAME|  VISITORDESC

In [36]:
# Write df_prepared as CSV part files, each with a header row, into the
# "Cleaned-NBA" directory (relative to the current working directory).
# mode="overwrite" replaces an existing output directory; with the default
# mode the write fails when the directory exists, so the cell could not be re-run.
df_prepared.write.csv("Cleaned-NBA", header=True, mode="overwrite")

In [37]:
import os
import pandas as pd
from glob import glob

# Path to the folder where Spark saved the CSV parts
folder_path = '/content/Cleaned-NBA'  # Update if different

# Get list of part files (ignore _SUCCESS file)
part_files = sorted(glob(os.path.join(folder_path, 'part-*.csv')))
if not part_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No part-*.csv files found in {folder_path}")

# Read every part with header=0: the Spark write used header=True, so each part
# file starts with its own header row. Reading later parts with header=None
# would turn that header row into data and give those frames integer column
# labels that do not line up with the named columns in the concat.
# The frames are collected without binding the name `df`, so the Spark
# DataFrame of that name is not shadowed.
df_list = [pd.read_csv(part_file, header=0) for part_file in part_files]

# Concatenate all into a single DataFrame
df_combined = pd.concat(df_list, ignore_index=True)

# Save as one CSV
output_file = '/content/NBA-Cleaned-Merged.csv'
df_combined.to_csv(output_file, index=False)

print(f"✅ Merged CSV saved to: {output_file}")


✅ Merged CSV saved to: /content/NBA-Cleaned-Merged.csv
