# Combine data

In [79]:
import pandas as pd
import os

# Directory
directory = "Data"
directory2 = "partial_match_data"
combined_directory = os.path.join(directory, directory2)
os.makedirs(combined_directory, exist_ok=True)

# Import and combine
dfs = []
batch = 0

while True:
    try:
        filepath = os.path.join(combined_directory, f"match_data_{batch}.csv")
        df = pd.read_csv(filepath)
        dfs.append(df)
        batch += 1
    except FileNotFoundError:
        break

combined_df = pd.concat(dfs, ignore_index=True)
print(f"Combined {batch} files into `combined_df`")

Combined 13 files into `combined_df`


# Check data

In [80]:
combined_df.columns

Index(['date', 'visitor_name', 'visitor_pts', 'home_name', 'home_pts',
       'overtime', 'home_win', 'margin_of_victory', 'postseason', 'season',
       'league'],
      dtype='object')

In [81]:
combined_df.describe()

Unnamed: 0,visitor_pts,home_pts,overtime,margin_of_victory,season
count,75983.0,75983.0,75983.0,75983.0,75983.0
mean,102.210955,105.714094,0.066528,10.943909,1993.651869
std,14.479939,14.86215,0.292364,8.103389,20.119162
min,0.0,2.0,0.0,1.0,1947.0
25%,93.0,96.0,0.0,5.0,1978.0
50%,102.0,106.0,0.0,9.0,1995.0
75%,112.0,116.0,0.0,15.0,2011.0
max,186.0,184.0,6.0,73.0,2025.0


In [82]:
# Note: due to league expansions, changing amount of matches per
# season and uncertain length of the playoffs these are all rough estimates

total_matches = combined_df.shape[0]
print(total_matches, "total matches")

matches_per_season = total_matches / 67
print(round(matches_per_season), "average number of matches per season (rounded)")

matches_per_team_ps = matches_per_season / 30 * 2
print(round(matches_per_team_ps), "average number of matches per team per season (rounded + if there had always been 30 teams)")

75983 total matches
1134 average number of matches per season (rounded)
76 average number of matches per team per season (rounded + if there had always been 30 teams)


In [83]:
#Date
print("Data from:", combined_df["date"].min(), '-', combined_df["date"].max())

# Team name
print("Equal number of home and visiting teams check:", len(combined_df["visitor_name"].unique()) == len(combined_df["visitor_name"].unique()))
print("Number of teams in history:", len(combined_df["visitor_name"].unique()))

# Season
years_played = len(combined_df["season"].unique())
print("Total number of years played:", years_played)

# Postseason
values = combined_df["postseason"].unique()
print("Includes regular and post season games:", True in values and False in values)
print("Postseason in every year:", len(combined_df[combined_df["postseason"] == True]["season"].unique()) == years_played)

# Leagues
for league in combined_df["league"].unique():
    print(league, "games played:", combined_df[combined_df["league"] == league].shape[0])

Data from: 1946-11-01 - 2025-05-14
Equal number of home and visiting teams check: True
Number of teams in history: 96
Total number of years played: 79
Includes regular and post season games: True
Postseason in every year: True
NBA games played: 70889
ABA games played: 4149
BAA games played: 945


# Export combined data

In [84]:
directory = "Data"
filepath = os.path.join(directory, "match_data.csv")
combined_df.to_csv(filepath, index = False)