In [None]:
#Identifying Correct Path for Raw Data
from pathlib import Path
import pandas as pd

here = Path.cwd()
for parent in [here, *here.parents]:
    if (parent / "Data").exists():
        PROJECT_ROOT = parent
        break
else:
    raise FileNotFoundError("Could not locate 'Data' folder in parent directories.")

RAW_DIR = PROJECT_ROOT / "Data" / "Raw" / "NBA"
print("Scanning recursively in:", RAW_DIR.resolve())


Scanning recursively in: C:\Users\YOUNTZ\Desktop\Backtest\Sports-Analytics-Backtester\Data\Raw\NBA


In [None]:
# Load all Text and CSV Files found in the raw directory
files = list(RAW_DIR.rglob("*.txt")) + list(RAW_DIR.rglob("*.csv"))

print(f"Found {len(files)} files total under NBA/")

if not files:
    raise FileNotFoundError(f"No .txt or .csv found under {RAW_DIR}")


Found 21 files total under NBA/
 - 2012-13\raw_scores.txt
 - 2012-13\vegas.txt
 - 2012-13\vegas_playoff.txt
 - 2013-14\raw_scores.txt
 - 2013-14\vegas.txt
 - 2013-14\vegas_playoff.txt
 - 2014-15\raw_scores.txt
 - 2014-15\vegas.txt
 - 2014-15\vegas_playoff.txt
 - 2015-16\raw_scores.txt


In [18]:
#Splitting Files Into Score Info and Betting Info

vegas_files  = [f for f in files if "vegas" in f.name.lower()]
scores_files = [f for f in files if "score" in f.name.lower()]

print(f"vegas_files:  {len(vegas_files)}")
for f in vegas_files[:5]: print("  -", f.relative_to(RAW_DIR))
print(f"scores_files: {len(scores_files)}")
for f in scores_files[:5]: print("  -", f.relative_to(RAW_DIR))


vegas_files:  14
  - 2012-13\vegas.txt
  - 2012-13\vegas_playoff.txt
  - 2013-14\vegas.txt
  - 2013-14\vegas_playoff.txt
  - 2014-15\vegas.txt
scores_files: 7
  - 2012-13\raw_scores.txt
  - 2013-14\raw_scores.txt
  - 2014-15\raw_scores.txt
  - 2015-16\raw_scores.txt
  - 2016-17\raw_scores.txt


In [None]:
#Merge Using Game and Team ID
vegas_list = [pd.read_csv(f) for f in vegas_files]
vegas = pd.concat(vegas_list, ignore_index=True)

scores_list = [pd.read_csv(f) for f in scores_files]
scores = pd.concat(scores_list, ignore_index=True)

vegas["GameId"] = vegas["GameId"].astype(str)
vegas["TeamId"] = vegas["TeamId"].astype(str)
scores["GAME_ID"] = scores["GAME_ID"].astype(str)
scores["TEAM_ID"] = scores["TEAM_ID"].astype(str)

merged = scores.merge(
    vegas,
    left_on=["GAME_ID","TEAM_ID"],
    right_on=["GameId","TeamId"],
    how="inner"
)

merged = scores.merge(
    vegas,
    left_on=["GAME_ID","TEAM_ID"],
    right_on=["GameId","TeamId"],
    how="inner"
)

merged.head()

Merged shape: (17340, 87)


Unnamed: 0.1,Unnamed: 0,GAME_SEQUENCE,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY_NAME,TEAM_WINS_LOSSES,PTS_QTR1,PTS_QTR2,PTS_QTR3,...,Average_Line_OU,Average_Odds_OU,Best_Line_OU,Worst_Line_OU,Best_Odds_OU,Worst_Odds_OU,Pts,Spread,Result,Total
0,2012-10-30,1,21200001,1610612764,WAS,Washington,0-1,24,15,23,...,190.33,-108.17,191.0,187.0,-104.0,-110.0,84.0,-10.0,L,178.0
1,2012-10-30,1,21200001,1610612739,CLE,Cleveland,1-0,31,19,24,...,190.33,-110.17,191.0,187.0,-106.0,-115.0,94.0,10.0,W,178.0
2,2012-10-30,2,21200002,1610612738,BOS,Boston,0-1,25,29,22,...,186.58,-107.33,188.0,182.0,-104.0,-110.0,107.0,-13.0,L,227.0
3,2012-10-30,2,21200002,1610612748,MIA,Miami,1-0,31,31,31,...,186.58,-109.33,188.0,182.0,-105.0,-115.0,120.0,13.0,W,227.0
4,2012-10-30,3,21200003,1610612742,DAL,Dallas,1-0,25,23,26,...,186.0,-107.5,187.0,185.0,-100.0,-110.0,99.0,8.0,W,190.0


In [23]:
merged_flat = merged[["Date","TEAM_ABBREVIATION","Pinnacle_ML","Result"]].copy()
merged_flat = merged_flat.rename(columns={
    "Date": "date",
    "TEAM_ABBREVIATION": "selection",
    "Pinnacle_ML": "odds",
    "Result": "result"
})
merged_flat["date"] = pd.to_datetime(merged_flat["date"], errors="coerce")
merged_flat["result"] = merged_flat["result"].map({"W": 1, "L": 0})

merged_flat.head

<bound method NDFrame.head of             date selection   odds  result
0     2012-10-30       WAS  210.0       0
1     2012-10-30       CLE -235.0       1
2     2012-10-30       BOS  244.0       0
3     2012-10-30       MIA -275.0       1
4     2012-10-30       DAL  393.0       1
...          ...       ...    ...     ...
17335 2019-04-10       DEN -794.0       1
17336 2019-04-10       UTA  249.0       0
17337 2019-04-10       LAC -281.0       1
17338 2019-04-10       SAC  135.0       0
17339 2019-04-10       POR -149.0       1

[17340 rows x 4 columns]>

In [None]:
#Save to Data Processed
here = Path.cwd()
for parent in [here, *here.parents]:
    if (parent / "Data").exists():
        PROJECT_ROOT = parent
        break
else:
    raise FileNotFoundError("Could not locate 'Data' folder in any parent directories.")

out_dir = PROJECT_ROOT / "Data" / "Processed"
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir / "games_clean.csv"
merged_flat.to_csv(out_path, index=False)

print("Saved:", out_path.resolve())

Saved: C:\Users\YOUNTZ\Desktop\Backtest\Sports-Analytics-Backtester\Data\Processed\games_clean.csv
