In [1]:
# --- Find the Dynasty repo root (must contain BOTH src/models and data/Bakery) ---

from pathlib import Path
import sys
import pandas as pd
import numpy as np  # Add numpy for SHAP analysis functions
import pickle  # Add pickle for model loading if needed
import json  # Add json for metadata handling

import warnings
from sklearn.exceptions import ConvergenceWarning

# Globally ignore these four warning categories for the rest of the session
# (presumably to keep the long tuning output readable).
for _ignored_category in (UserWarning, RuntimeWarning, FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that looks like the repo root.

    A directory qualifies when both ``src/models`` and ``data/Bakery`` exist
    beneath it. ``start`` itself is checked first, then each parent in order.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its parents qualifies.
    """
    markers = (Path("src", "models"), Path("data", "Bakery"))
    root = next(
        (
            candidate
            for candidate in (start, *start.parents)
            if all((candidate / marker).exists() for marker in markers)
        ),
        None,
    )
    if root is None:
        raise FileNotFoundError(
            "Could not locate the Dynasty repo root (needs both 'src/models' and 'data/Bakery')."
        )
    return root

# Anchor every path on the repo root discovered from the current working directory.
REPO_ROOT = find_repo_root(Path.cwd())
print("‚úÖ REPO_ROOT:", REPO_ROOT)

# Put the repo root on sys.path (once) so the `src` package is importable.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# Project imports; these must come after the sys.path adjustment above.
from src.models.gradientboosting_tuning import run_seed_for_subsets
from src.utils import default_out_dir, default_csv_for_position

# ---- Config ----
position = "WR"            # RB / WR / TE / QB
seeds = [12, 34, 56]       # passed as `seeds` to run_seed_for_subsets
subset_grid = [20, 30, 40] # each value is used as `n_subsets` for one batch

# Optional constraints (leave empty if none)
must_feats  = ["DOM+"]           # e.g. ["DOM+", "YPC"]
ban_feats   = ["Conference Rank", "Draft Age", "Wide%", "Slot%"]           # e.g. ["aDOT"]
must_inters = ["SpeedxBMI"]           # e.g. ["SpeedxBMI"]
ban_inters  = ["Wide%xSlot%"]           # e.g. ["Wide%xSlot%"]
hierarchy   = "none"       # "strong" | "weak" | "none"

# Confirm CSV location (under REPO_ROOT/data/Bakery/...)
csv_path = default_csv_for_position(REPO_ROOT, position)
print("CSV path:", csv_path)
# Raise explicitly instead of using `assert`: assertions are removed under
# `python -O`, which would let a missing file slip through to pd.read_csv.
if not csv_path.exists():
    raise FileNotFoundError(f"CSV not found at {csv_path}")

# Load the raw CSV only to preview its shape, columns, and first rows.
df = pd.read_csv(csv_path)
print(f"Loaded CSV from: {csv_path}")
print(f"Shape: {df.shape}")
print("Columns:", df.columns.tolist())
print("First 5 rows:")
print(df.head())

# Run one tuning batch per subset count. A failing batch is reported (with its
# traceback) and skipped so the remaining configurations still run.
import traceback  # local import: only needed to report failures from this loop

all_runs = []
for n in subset_grid:
    try:
        res = run_seed_for_subsets(
            position=position,
            project_root=REPO_ROOT,
            n_subsets=n,
            seeds=seeds,
            max_base_feats=13,
            max_interactions=3,
            n_iter_per_model=20,
            cv_folds=15,
            test_size=0.15,
            must_feats=must_feats,
            ban_feats=ban_feats,
            must_inters=must_inters,
            ban_inters=ban_inters,
            interaction_hierarchy=hierarchy,
            draft_cap_cap=0.30,
            draft_cap_lower_q=0.05,
            draft_cap_upper_q=0.95,
            draft_cap_importance_cap=0.1,
            breakout_age_importance_cap=0.1,
            draft_age_importance_cap=None,
        )
    except UnboundLocalError as e:
        # Guards against the UnboundLocalError originating inside the imported function
        # that references `shap_feature_importance` before assignment. Skip this n.
        print(f"‚ö†Ô∏è Warning: UnboundLocalError for n={n}: {e}. Skipping this run.")
        traceback.print_exc()  # keep the origin of the failure visible
    except Exception as e:
        # Catch other exceptions so one failing configuration doesn't stop the whole batch.
        print(f"‚ö†Ô∏è Error running subsets={n}: {e}. Skipping this run.")
        traceback.print_exc()  # the message alone does not say where it failed
    else:
        # Each result is later combined with pd.concat, so it is presumably a DataFrame.
        all_runs.append(res)

# If no runs succeeded, create an empty DataFrame gracefully instead of failing on pd.concat
if all_runs:
    summary = pd.concat(all_runs, ignore_index=True)
else:
    summary = pd.DataFrame()
    print("‚ö†Ô∏è No successful runs completed; summary is an empty DataFrame.")

# Save summary under REPO_ROOT/data/Bakery/_derived/<POS>/
out_dir = default_out_dir(REPO_ROOT, position)
out_dir.mkdir(parents=True, exist_ok=True)

summary_path = out_dir / f"{position.lower()}_runtime_accuracy_summary.csv"
summary.to_csv(summary_path, index=False)

print("\n‚úÖ Run complete!")
print("Summary CSV:", summary_path)
print("üíæ Trained models saved as .pkl files in:", out_dir)

# Show available files
model_files = list(out_dir.glob("*.pkl"))
json_files = list(out_dir.glob("*_best_meta_*.json"))

if model_files:
    print("\nüìÅ Saved model files:")
    for model_file in sorted(model_files):
        print(f"  - {model_file.name}")
else:
    print("\n‚ö†Ô∏è No .pkl model files found yet (will be created after running)")

if json_files:
    print("\nüìÑ JSON metadata files with SHAP analysis:")
    for json_file in sorted(json_files):
        print(f"  - {json_file.name}")

        # Quick preview of SHAP data. This is best-effort: an unreadable or
        # unexpectedly shaped file is reported and skipped, never fatal.
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                metadata = json.load(f)
            if 'shap_analysis' in metadata and 'base_importance_sum' in metadata['shap_analysis']:
                # Named `shap_info` so it cannot shadow the `shap` library name.
                shap_info = metadata['shap_analysis']
                print(f"    üí° SHAP: Base importance = {shap_info['base_importance_sum']:.4f}, Interaction = {shap_info['interaction_importance_sum']:.4f}")
                if shap_info['top_5_features']:
                    print(f"    ‚≠ê Top feature: {shap_info['top_5_features'][0][0]} ({shap_info['top_5_features'][0][1]:.4f})")
        except (OSError, ValueError, KeyError, IndexError, TypeError) as exc:
            # ValueError covers json.JSONDecodeError and decoding errors; the
            # others cover missing keys or wrongly typed values in the metadata.
            print(f"    (SHAP preview unavailable: {type(exc).__name__}: {exc})")
else:
    print("\n‚ö†Ô∏è No JSON metadata files found yet (will be created after running)")

# Last expression of the cell: the notebook displays the first summary rows.
summary.head()

‚úÖ REPO_ROOT: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty
CSV path: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/WR/Bakery_WR_Overall.csv
Loaded CSV from: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/WR/Bakery_WR_Overall.csv
Shape: (618, 43)
Columns: ['Rank', 'Class Rk', 'Year', 'Player Name', 'Tier', 'WR Grade', 'Overall', 'Z-Score', 'School', 'Conf Rk', 'DOB', 'Age', 'Year.1', 'Draft Age', 'Type', 'Build', 'Separator', 'Height', 'Weight', 'BMI', '40 Time', 'Draft Cap', 'DOM1', 'DOM2', 'DOM3', 'DOM4', 'DOM5', 'TDOM', 'DOM+', 'Breakout', 'Y/RR', 'YAC/R', 'aDOT', 'EPA/P', 'aYPTPA', 'CTPRR', 'UCTPRR', 'Drop%', 'CC%', 'Wide %', 'Slot %', 'Speed', 'HaSS']
First 5 rows:
   Rank  Class Rk    Year          Player Name         Tier  WR Grade  \
0   1.0       1.0  2024.0  Marvin Harrison Jr.  Cornerstone     12.79   
1   2.0       1.0  2021.0        Ja'Marr Chase  Cornerstone     12.77   
2   3.0       2.0  2024.0         Malik Nabers  Cornersto

  from .autonotebook import tqdm as notebook_tqdm


üíæ Saved model: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/_derived/WR/wr_model_seed12_subs20.pkl
üìÑ Saved metadata with SHAP results: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/_derived/WR/wr_wide_best_meta_seed12_subs20.json

[WR] Seed 12 | n_subsets=20 ‚Üí R¬≤=0.8686 | MAE=0.685 | RMSE=0.810 | time=212.8s
üíæ Saved model: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/_derived/WR/wr_model_seed34_subs20.pkl
üìÑ Saved metadata with SHAP results: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/_derived/WR/wr_wide_best_meta_seed34_subs20.json

[WR] Seed 34 | n_subsets=20 ‚Üí R¬≤=0.8569 | MAE=0.681 | RMSE=0.869 | time=190.0s
üíæ Saved model: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/_derived/WR/wr_model_seed56_subs20.pkl
üìÑ Saved metadata with SHAP results: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/_derived/WR/wr_wide_best_meta_seed56_subs20.json

[WR] See

Unnamed: 0,position,seed,n_subsets,best_test_R2,best_test_MAE,best_test_RMSE,runtime_sec,leaderboard_csv,predictions_csv,metadata_json,model_pickle,best_model_tag,best_bases,best_interactions
0,WR,12,20,0.868584,0.684738,0.809986,212.836339,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|UCTPRR|aDOT|YAC/R|EPA/P|aYPTPA|CTPRR|Draf...,SpeedxBMI
1,WR,34,20,0.856903,0.681095,0.869047,190.001687,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|CTPRR|Y/RR|aDOT|YAC/R|Drop Rate|aYPTPA|Br...,SpeedxBMI
2,WR,56,20,0.896055,0.582347,0.727705,228.464857,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|Draft Capital|EPA/P|aYPTPA|aDOT|CC%|Break...,SpeedxBMI
3,WR,12,30,0.871336,0.648301,0.80146,328.700074,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|CTPRR|aDOT|CC%|Draft Capital|Breakout Age...,SpeedxBMI
4,WR,34,30,0.856903,0.681095,0.869047,322.478611,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|CTPRR|Y/RR|aDOT|YAC/R|Drop Rate|aYPTPA|Br...,SpeedxBMI


In [2]:
# --- Find the Dynasty repo root (must contain BOTH src/models and data/Bakery) ---

from pathlib import Path
import sys
import pandas as pd
import numpy as np  # Add numpy for SHAP analysis functions
import pickle  # Add pickle for model loading if needed
import json  # Add json for metadata handling

import warnings
from sklearn.exceptions import ConvergenceWarning

# Globally ignore these four warning categories for the rest of the session
# (presumably to keep the long tuning output readable).
for _ignored_category in (UserWarning, RuntimeWarning, FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that looks like the repo root.

    A directory qualifies when both ``src/models`` and ``data/Bakery`` exist
    beneath it. ``start`` itself is checked first, then each parent in order.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its parents qualifies.
    """
    markers = (Path("src", "models"), Path("data", "Bakery"))
    root = next(
        (
            candidate
            for candidate in (start, *start.parents)
            if all((candidate / marker).exists() for marker in markers)
        ),
        None,
    )
    if root is None:
        raise FileNotFoundError(
            "Could not locate the Dynasty repo root (needs both 'src/models' and 'data/Bakery')."
        )
    return root

# Anchor every path on the repo root discovered from the current working directory.
REPO_ROOT = find_repo_root(Path.cwd())
print("‚úÖ REPO_ROOT:", REPO_ROOT)

# Put the repo root on sys.path (once) so the `src` package is importable.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# Project imports; these must come after the sys.path adjustment above.
from src.models.gradientboosting_tuning import run_seed_for_subsets
from src.utils import default_out_dir, default_csv_for_position

# ---- Config ----
position = "RB"            # RB / WR / TE / QB
seeds = [12, 34, 56]       # passed as `seeds` to run_seed_for_subsets
subset_grid = [20, 30, 40] # each value is used as `n_subsets` for one batch

# Optional constraints (leave empty if none)
must_feats  = ["DOM+", "RDOM+"]           # e.g. ["DOM+", "YPC"]
ban_feats   = ["Conference Rank", "Draft Age", "ELU"]           # e.g. ["aDOT"]
must_inters = ["SpeedxBMI"]           # e.g. ["SpeedxBMI"]
ban_inters  = []           # e.g. ["Wide%xSlot%"]
hierarchy   = "none"       # "strong" | "weak" | "none"

# Confirm CSV location (under REPO_ROOT/data/Bakery/...)
csv_path = default_csv_for_position(REPO_ROOT, position)
print("CSV path:", csv_path)
# Raise explicitly instead of using `assert`: assertions are removed under
# `python -O`, which would let a missing file slip through to pd.read_csv.
if not csv_path.exists():
    raise FileNotFoundError(f"CSV not found at {csv_path}")

# Load the raw CSV only to preview its shape, columns, and first rows.
df = pd.read_csv(csv_path)
print(f"Loaded CSV from: {csv_path}")
print(f"Shape: {df.shape}")
print("Columns:", df.columns.tolist())
print("First 5 rows:")
print(df.head())

# Run one tuning batch per subset count. A failing batch is reported (with its
# traceback) and skipped so the remaining configurations still run.
import traceback  # local import: only needed to report failures from this loop

all_runs = []
for n in subset_grid:
    try:
        res = run_seed_for_subsets(
            position=position,
            project_root=REPO_ROOT,     # IMPORTANT: use the Dynasty repo root
            n_subsets=n,
            seeds=seeds,
            max_base_feats=13,
            max_interactions=3,
            n_iter_per_model=20,
            cv_folds=15,
            test_size=0.15,
            must_feats=must_feats,
            ban_feats=ban_feats,
            must_inters=must_inters,
            ban_inters=ban_inters,
            interaction_hierarchy=hierarchy,
            draft_cap_cap=0.30,          # try 0.40 first; lower to 0.30 / 0.20 if still dominant
            draft_cap_lower_q=0.05,
            draft_cap_upper_q=0.95,
            draft_cap_importance_cap=0.1,
            breakout_age_importance_cap=0.1,
            draft_age_importance_cap=None,
        )
    except UnboundLocalError as e:
        # Guards against the UnboundLocalError originating inside the imported function
        # that references `shap_feature_importance` before assignment. Skip this n.
        print(f"‚ö†Ô∏è Warning: UnboundLocalError for n={n}: {e}. Skipping this run.")
        traceback.print_exc()  # keep the origin of the failure visible
    except Exception as e:
        # Catch other exceptions so one failing configuration doesn't stop the whole batch.
        print(f"‚ö†Ô∏è Error running subsets={n}: {e}. Skipping this run.")
        traceback.print_exc()  # the message alone does not say where it failed
    else:
        # Each result is later combined with pd.concat, so it is presumably a DataFrame.
        all_runs.append(res)

# If no runs succeeded, create an empty DataFrame gracefully instead of failing on pd.concat
if all_runs:
    summary = pd.concat(all_runs, ignore_index=True)
else:
    summary = pd.DataFrame()
    print("‚ö†Ô∏è No successful runs completed; summary is an empty DataFrame.")

# Save summary under REPO_ROOT/data/Bakery/_derived/<POS>/
out_dir = default_out_dir(REPO_ROOT, position)
out_dir.mkdir(parents=True, exist_ok=True)

summary_path = out_dir / f"{position.lower()}_runtime_accuracy_summary.csv"
summary.to_csv(summary_path, index=False)

print("\n‚úÖ Run complete!")
print("Summary CSV:", summary_path)
print("üíæ Trained models saved as .pkl files in:", out_dir)

# Show available files
model_files = list(out_dir.glob("*.pkl"))
json_files = list(out_dir.glob("*_best_meta_*.json"))

if model_files:
    print("\nüìÅ Saved model files:")
    for model_file in sorted(model_files):
        print(f"  - {model_file.name}")
else:
    print("\n‚ö†Ô∏è No .pkl model files found yet (will be created after running)")

if json_files:
    print("\nüìÑ JSON metadata files with SHAP analysis:")
    for json_file in sorted(json_files):
        print(f"  - {json_file.name}")

        # Quick preview of SHAP data. This is best-effort: an unreadable or
        # unexpectedly shaped file is reported and skipped, never fatal.
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                metadata = json.load(f)
            if 'shap_analysis' in metadata and 'base_importance_sum' in metadata['shap_analysis']:
                # Named `shap_info` so it cannot shadow the `shap` library name.
                shap_info = metadata['shap_analysis']
                print(f"    üí° SHAP: Base importance = {shap_info['base_importance_sum']:.4f}, Interaction = {shap_info['interaction_importance_sum']:.4f}")
                if shap_info['top_5_features']:
                    print(f"    ‚≠ê Top feature: {shap_info['top_5_features'][0][0]} ({shap_info['top_5_features'][0][1]:.4f})")
        except (OSError, ValueError, KeyError, IndexError, TypeError) as exc:
            # ValueError covers json.JSONDecodeError and decoding errors; the
            # others cover missing keys or wrongly typed values in the metadata.
            print(f"    (SHAP preview unavailable: {type(exc).__name__}: {exc})")
else:
    print("\n‚ö†Ô∏è No JSON metadata files found yet (will be created after running)")

# Last expression of the cell: the notebook displays the first summary rows.
summary.head()

‚úÖ REPO_ROOT: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty
CSV path: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/RB/Bakery_RB_Overall.csv
Loaded CSV from: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/RB/Bakery_RB_Overall.csv
Shape: (328, 42)
Columns: ['Rank', 'Class Rk', 'Year', 'Player Name', 'Tier', 'RB Grade', 'Overall', 'Z-Score', 'School', 'Conf Rk', 'DOB', 'Age', 'Class', 'Draft Age', 'Build', 'Type', 'Height', 'Weight', 'BMI', '40 Time', 'Draft Cap', 'RDOM1', 'RDOM2', 'RDOM3', 'RDOM4', 'RDOM5', 'TDOM', 'RDOM+', 'DOM1', 'DOM2', 'ADOM', 'DOM+', 'BOUT', 'Speed', 'BMI.1', 'MTF/A', 'YPC', 'YPR', 'RPT', 'ELU', 'YCO/A', 'Break %']
First 5 rows:
   Rank  Class Rk    Year     Player Name         Tier  RB Grade  Overall  \
0   1.0       1.0  2023.0  Bijan Robinson  Cornerstone     15.17     99.6   
1   2.0       1.0  2015.0     Todd Gurley  Cornerstone     14.39     99.3   
2   3.0       1.0  2026.0  Jeremiyah Love  Cornerstone     14.39  

Unnamed: 0,position,seed,n_subsets,best_test_R2,best_test_MAE,best_test_RMSE,runtime_sec,leaderboard_csv,predictions_csv,metadata_json,model_pickle,best_model_tag,best_bases,best_interactions
0,RB,12,20,0.945033,0.590681,0.718798,159.805577,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|RDOM+|MTF/A|Breakout Age|YPC|YPR|Draft Ca...,SpeedxBMI
1,RB,34,20,0.868981,0.752069,0.903997,163.769816,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|RDOM+|Break%|MTF/A|YPC|Draft Capital|YPR|...,SpeedxBMI
2,RB,56,20,0.878781,0.706132,0.855439,170.321055,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|RDOM+|YPC|YCO/A|Break%|Draft Capital|Brea...,SpeedxBMI
3,RB,12,30,0.945033,0.590681,0.718798,227.174733,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|RDOM+|MTF/A|Breakout Age|YPC|YPR|Draft Ca...,SpeedxBMI
4,RB,34,30,0.868981,0.752069,0.903997,229.328391,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|RDOM+|Break%|MTF/A|YPC|Draft Capital|YPR|...,SpeedxBMI


In [3]:
# --- Find the Dynasty repo root (must contain BOTH src/models and data/Bakery) ---

from pathlib import Path
import sys
import pandas as pd
import numpy as np  # Add numpy for SHAP analysis functions
import pickle  # Add pickle for model loading if needed
import json  # Add json for metadata handling

import warnings
from sklearn.exceptions import ConvergenceWarning

# Globally ignore these four warning categories for the rest of the session
# (presumably to keep the long tuning output readable).
for _ignored_category in (UserWarning, RuntimeWarning, FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that looks like the repo root.

    A directory qualifies when both ``src/models`` and ``data/Bakery`` exist
    beneath it. ``start`` itself is checked first, then each parent in order.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its parents qualifies.
    """
    markers = (Path("src", "models"), Path("data", "Bakery"))
    root = next(
        (
            candidate
            for candidate in (start, *start.parents)
            if all((candidate / marker).exists() for marker in markers)
        ),
        None,
    )
    if root is None:
        raise FileNotFoundError(
            "Could not locate the Dynasty repo root (needs both 'src/models' and 'data/Bakery')."
        )
    return root

# Anchor every path on the repo root discovered from the current working directory.
REPO_ROOT = find_repo_root(Path.cwd())
print("‚úÖ REPO_ROOT:", REPO_ROOT)

# Put the repo root on sys.path (once) so the `src` package is importable.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# Project imports; these must come after the sys.path adjustment above.
from src.models.gradientboosting_tuning import run_seed_for_subsets
from src.utils import default_out_dir, default_csv_for_position

# ---- Config ----
position = "TE"            # RB / WR / TE / QB
seeds = [12, 34, 56]       # passed as `seeds` to run_seed_for_subsets
subset_grid = [20, 30, 40] # each value is used as `n_subsets` for one batch

# Optional constraints (leave empty if none)
must_feats  = ["DOM+", "Draft Capital"]           # e.g. ["DOM+", "YPC"]
ban_feats   = ["Conference Rank", "Draft Age"]           # e.g. ["aDOT"]
must_inters = ["SpeedxBMI"]           # e.g. ["SpeedxBMI"]
ban_inters  = []           # e.g. ["Wide%xSlot%"]
hierarchy   = "none"       # "strong" | "weak" | "none"

# Confirm CSV location (under REPO_ROOT/data/Bakery/...)
csv_path = default_csv_for_position(REPO_ROOT, position)
print("CSV path:", csv_path)
# Raise explicitly instead of using `assert`: assertions are removed under
# `python -O`, which would let a missing file slip through to pd.read_csv.
if not csv_path.exists():
    raise FileNotFoundError(f"CSV not found at {csv_path}")

# Load the raw CSV only to preview its shape, columns, and first rows.
df = pd.read_csv(csv_path)
print(f"Loaded CSV from: {csv_path}")
print(f"Shape: {df.shape}")
print("Columns:", df.columns.tolist())
print("First 5 rows:")
print(df.head())

# Run one tuning batch per subset count. A failing batch is reported (with its
# traceback) and skipped so the remaining configurations still run.
import traceback  # local import: only needed to report failures from this loop

all_runs = []
for n in subset_grid:
    try:
        res = run_seed_for_subsets(
            position=position,
            project_root=REPO_ROOT,     # IMPORTANT: use the Dynasty repo root
            n_subsets=n,
            seeds=seeds,
            max_base_feats=13,
            max_interactions=3,
            n_iter_per_model=30,
            cv_folds=15,
            test_size=0.15,
            must_feats=must_feats,
            ban_feats=ban_feats,
            must_inters=must_inters,
            ban_inters=ban_inters,
            interaction_hierarchy=hierarchy,
            draft_cap_cap=0.30,          # try 0.40 first; lower to 0.30 / 0.20 if still dominant
            draft_cap_lower_q=0.05,
            draft_cap_upper_q=0.95,
            draft_cap_importance_cap=0.1,
            breakout_age_importance_cap=0.1,
            draft_age_importance_cap=None,
        )
    except UnboundLocalError as e:
        # Guards against the UnboundLocalError originating inside the imported function
        # that references `shap_feature_importance` before assignment. Skip this n.
        print(f"‚ö†Ô∏è Warning: UnboundLocalError for n={n}: {e}. Skipping this run.")
        traceback.print_exc()  # keep the origin of the failure visible
    except Exception as e:
        # Catch other exceptions so one failing configuration doesn't stop the whole batch.
        print(f"‚ö†Ô∏è Error running subsets={n}: {e}. Skipping this run.")
        traceback.print_exc()  # the message alone does not say where it failed
    else:
        # Each result is later combined with pd.concat, so it is presumably a DataFrame.
        all_runs.append(res)

# If no runs succeeded, create an empty DataFrame gracefully instead of failing on pd.concat
if all_runs:
    summary = pd.concat(all_runs, ignore_index=True)
else:
    summary = pd.DataFrame()
    print("‚ö†Ô∏è No successful runs completed; summary is an empty DataFrame.")

# Save summary under REPO_ROOT/data/Bakery/_derived/<POS>/
out_dir = default_out_dir(REPO_ROOT, position)
out_dir.mkdir(parents=True, exist_ok=True)

summary_path = out_dir / f"{position.lower()}_runtime_accuracy_summary.csv"
summary.to_csv(summary_path, index=False)

print("\n‚úÖ Run complete!")
print("Summary CSV:", summary_path)
print("üíæ Trained models saved as .pkl files in:", out_dir)

# Show available files
model_files = list(out_dir.glob("*.pkl"))
json_files = list(out_dir.glob("*_best_meta_*.json"))

if model_files:
    print("\nüìÅ Saved model files:")
    for model_file in sorted(model_files):
        print(f"  - {model_file.name}")
else:
    print("\n‚ö†Ô∏è No .pkl model files found yet (will be created after running)")

if json_files:
    print("\nüìÑ JSON metadata files with SHAP analysis:")
    for json_file in sorted(json_files):
        print(f"  - {json_file.name}")

        # Quick preview of SHAP data. This is best-effort: an unreadable or
        # unexpectedly shaped file is reported and skipped, never fatal.
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                metadata = json.load(f)
            if 'shap_analysis' in metadata and 'base_importance_sum' in metadata['shap_analysis']:
                # Named `shap_info` so it cannot shadow the `shap` library name.
                shap_info = metadata['shap_analysis']
                print(f"    üí° SHAP: Base importance = {shap_info['base_importance_sum']:.4f}, Interaction = {shap_info['interaction_importance_sum']:.4f}")
                if shap_info['top_5_features']:
                    print(f"    ‚≠ê Top feature: {shap_info['top_5_features'][0][0]} ({shap_info['top_5_features'][0][1]:.4f})")
        except (OSError, ValueError, KeyError, IndexError, TypeError) as exc:
            # ValueError covers json.JSONDecodeError and decoding errors; the
            # others cover missing keys or wrongly typed values in the metadata.
            print(f"    (SHAP preview unavailable: {type(exc).__name__}: {exc})")
else:
    print("\n‚ö†Ô∏è No JSON metadata files found yet (will be created after running)")

# Last expression of the cell: the notebook displays the first summary rows.
summary.head()

‚úÖ REPO_ROOT: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty
CSV path: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/TE/Bakery_TE_Overall.csv
Loaded CSV from: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/TE/Bakery_TE_Overall.csv
Shape: (213, 37)
Columns: ['Rank', 'Class Rk', 'Year', 'Player Name', 'Tier', 'TE Grade', 'Overall', 'Z-Score', 'School', 'Conf Rk', 'DOB', 'Age', 'Year.1', 'Draft Age', 'Height', 'Weight', 'BMI', '40 Time', 'Draft', 'DOM1', 'DOM2', 'DOM3', 'DOM4', 'DOM5', 'TDOM', 'DOM+', 'BOUT', 'Y/RR', 'Y/REC', 'YAC/R', 'aDOT', 'EPA/P', 'aYPTPA', 'CC%', 'Drop %', 'Speed', 'HaSS']
First 5 rows:
   Rank  Class Rk  Year     Player Name          Tier  TE Grade  Overall  \
0     1         1  2024    Brock Bowers   Cornerstone     15.12     99.5   
1     2         1  2021      Kyle Pitts   Cornerstone     15.00     99.0   
2     3         1  2017     Evan Engram         Elite     14.59     98.5   
3     4         1  2019  T.J. Hockenson

Unnamed: 0,position,seed,n_subsets,best_test_R2,best_test_MAE,best_test_RMSE,runtime_sec,leaderboard_csv,predictions_csv,metadata_json,model_pickle,best_model_tag,best_bases,best_interactions
0,TE,12,20,0.867737,0.760561,0.936975,162.405478,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|Draft Capital|YAC/R|CC%|Y/RR|aYPTPA|EPA/P...,SpeedxBMI
1,TE,34,20,0.753765,1.008069,1.144913,173.856077,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|Draft Capital|YAC/R|Breakout Age|Y/RR|Dro...,SpeedxBMI
2,TE,56,20,0.927969,0.752081,0.902318,191.626019,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|Draft Capital|CC%|aYPTPA|YAC/R|Y/RR|Break...,SpeedxBMI
3,TE,12,30,0.867737,0.760561,0.936975,221.547456,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|Draft Capital|YAC/R|CC%|Y/RR|aYPTPA|EPA/P...,SpeedxBMI
4,TE,34,30,0.756479,0.999001,1.138585,244.692836,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,DOM+|Draft Capital|YAC/R|Y/RR|CC%|Breakout Age...,SpeedxBMI


In [4]:
# --- Find the Dynasty repo root (must contain BOTH src/models and data/Bakery) ---

from pathlib import Path
import sys
import pandas as pd
import numpy as np  # Add numpy for SHAP analysis functions
import pickle  # Add pickle for model loading if needed
import json  # Add json for metadata handling

import warnings
from sklearn.exceptions import ConvergenceWarning

# Globally ignore these four warning categories for the rest of the session
# (presumably to keep the long tuning output readable).
for _ignored_category in (UserWarning, RuntimeWarning, FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that looks like the repo root.

    A directory qualifies when both ``src/models`` and ``data/Bakery`` exist
    beneath it. ``start`` itself is checked first, then each parent in order.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its parents qualifies.
    """
    markers = (Path("src", "models"), Path("data", "Bakery"))
    root = next(
        (
            candidate
            for candidate in (start, *start.parents)
            if all((candidate / marker).exists() for marker in markers)
        ),
        None,
    )
    if root is None:
        raise FileNotFoundError(
            "Could not locate the Dynasty repo root (needs both 'src/models' and 'data/Bakery')."
        )
    return root

# Anchor every path on the repo root discovered from the current working directory.
REPO_ROOT = find_repo_root(Path.cwd())
print("‚úÖ REPO_ROOT:", REPO_ROOT)

# Put the repo root on sys.path (once) so the `src` package is importable.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# Project imports; these must come after the sys.path adjustment above.
from src.models.gradientboosting_tuning import run_seed_for_subsets
from src.utils import default_out_dir, default_csv_for_position

# ---- Config ----
position = "QB"            # RB / WR / TE / QB
seeds = [12, 34, 56]       # passed as `seeds` to run_seed_for_subsets
subset_grid = [20, 30, 40] # each value is used as `n_subsets` for one batch

# Optional constraints (leave empty if none)
must_feats  = ["PDOM+", "RDOM+"]           # e.g. ["DOM+", "YPC"]
ban_feats   = ["Conference Rank", "Draft Age"]           # e.g. ["aDOT"]
must_inters = ["SpeedxBMI"]           # e.g. ["SpeedxBMI"]
ban_inters  = []           # e.g. ["Wide%xSlot%"]
hierarchy   = "none"       # "strong" | "weak" | "none"

# Confirm CSV location (under REPO_ROOT/data/Bakery/...)
csv_path = default_csv_for_position(REPO_ROOT, position)
print("CSV path:", csv_path)
# Raise explicitly instead of using `assert`: assertions are removed under
# `python -O`, which would let a missing file slip through to pd.read_csv.
if not csv_path.exists():
    raise FileNotFoundError(f"CSV not found at {csv_path}")

# Load the raw CSV only to preview its shape, columns, and first rows.
df = pd.read_csv(csv_path)
print(f"Loaded CSV from: {csv_path}")
print(f"Shape: {df.shape}")
print("Columns:", df.columns.tolist())
print("First 5 rows:")
print(df.head())

# Run one tuning batch per subset count. A failing batch is reported (with its
# traceback) and skipped so the remaining configurations still run.
import traceback  # local import: only needed to report failures from this loop

all_runs = []
for n in subset_grid:
    try:
        res = run_seed_for_subsets(
            position=position,
            project_root=REPO_ROOT,     # IMPORTANT: use the Dynasty repo root
            n_subsets=n,
            seeds=seeds,
            max_base_feats=13,
            max_interactions=3,
            n_iter_per_model=20,
            cv_folds=15,
            test_size=0.15,
            must_feats=must_feats,
            ban_feats=ban_feats,
            must_inters=must_inters,
            ban_inters=ban_inters,
            interaction_hierarchy=hierarchy,
            draft_cap_cap=0.30,          # try 0.40 first; lower to 0.30 / 0.20 if still dominant
            draft_cap_lower_q=0.05,
            draft_cap_upper_q=0.95,
            draft_cap_importance_cap=0.1,
            breakout_age_importance_cap=0.1,
            draft_age_importance_cap=None,
        )
    except UnboundLocalError as e:
        # Guards against the UnboundLocalError originating inside the imported function
        # that references `shap_feature_importance` before assignment. Skip this n.
        print(f"‚ö†Ô∏è Warning: UnboundLocalError for n={n}: {e}. Skipping this run.")
        traceback.print_exc()  # keep the origin of the failure visible
    except Exception as e:
        # Catch other exceptions so one failing configuration doesn't stop the whole batch.
        print(f"‚ö†Ô∏è Error running subsets={n}: {e}. Skipping this run.")
        traceback.print_exc()  # the message alone does not say where it failed
    else:
        # Each result is later combined with pd.concat, so it is presumably a DataFrame.
        all_runs.append(res)

# If no runs succeeded, create an empty DataFrame gracefully instead of failing on pd.concat
if all_runs:
    summary = pd.concat(all_runs, ignore_index=True)
else:
    summary = pd.DataFrame()
    print("‚ö†Ô∏è No successful runs completed; summary is an empty DataFrame.")

# Save summary under REPO_ROOT/data/Bakery/_derived/<POS>/
out_dir = default_out_dir(REPO_ROOT, position)
out_dir.mkdir(parents=True, exist_ok=True)

summary_path = out_dir / f"{position.lower()}_runtime_accuracy_summary.csv"
summary.to_csv(summary_path, index=False)

print("\n‚úÖ Run complete!")
print("Summary CSV:", summary_path)
print("üíæ Trained models saved as .pkl files in:", out_dir)

# Show available files
model_files = list(out_dir.glob("*.pkl"))
json_files = list(out_dir.glob("*_best_meta_*.json"))

if model_files:
    print("\nüìÅ Saved model files:")
    for model_file in sorted(model_files):
        print(f"  - {model_file.name}")
else:
    print("\n‚ö†Ô∏è No .pkl model files found yet (will be created after running)")

if json_files:
    print("\nüìÑ JSON metadata files with SHAP analysis:")
    for json_file in sorted(json_files):
        print(f"  - {json_file.name}")

        # Quick preview of SHAP data. This is best-effort: an unreadable or
        # unexpectedly shaped file is reported and skipped, never fatal.
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                metadata = json.load(f)
            if 'shap_analysis' in metadata and 'base_importance_sum' in metadata['shap_analysis']:
                # Named `shap_info` so it cannot shadow the `shap` library name.
                shap_info = metadata['shap_analysis']
                print(f"    üí° SHAP: Base importance = {shap_info['base_importance_sum']:.4f}, Interaction = {shap_info['interaction_importance_sum']:.4f}")
                if shap_info['top_5_features']:
                    print(f"    ‚≠ê Top feature: {shap_info['top_5_features'][0][0]} ({shap_info['top_5_features'][0][1]:.4f})")
        except (OSError, ValueError, KeyError, IndexError, TypeError) as exc:
            # ValueError covers json.JSONDecodeError and decoding errors; the
            # others cover missing keys or wrongly typed values in the metadata.
            print(f"    (SHAP preview unavailable: {type(exc).__name__}: {exc})")
else:
    print("\n‚ö†Ô∏è No JSON metadata files found yet (will be created after running)")

# Last expression of the cell: the notebook displays the first summary rows.
summary.head()

‚úÖ REPO_ROOT: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty
CSV path: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/QB/Bakery_QB_Overall.csv
Loaded CSV from: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/Bakery/QB/Bakery_QB_Overall.csv
Shape: (206, 42)
Columns: ['RK', 'Class Rk', 'Year', 'Player Name', 'Unnamed: 4', 'QB Grade', 'Overall', 'Z-Score', 'School', 'Conf Rk', 'DOB', 'Age', 'Year.1', 'Draft Age', 'Archetype', 'Height', 'Weight', 'BMI', '40 Time', 'Draft Cap', 'PDOM1', 'PDOM2', 'PDOM3', 'PDOM4', 'PDOM5', 'TDOM', 'PDOM+', 'RDOM1', 'RDOM2', 'ADOM', 'RDOM+', 'BOUT', 'Speed', 'Comp', 'YPC', 'ADJ%', 'BTT%', 'TWP%', 'DAA', 'YPA', 'MTF/A', 'YCO/A']
First 5 rows:
   RK  Class Rk  Year      Player Name   Unnamed: 4  QB Grade  Overall  \
0   1         1  2024   Jayden Daniels  Cornerstone     15.28     99.5   
1   2         1  2021  Trevor Lawrence  Cornerstone     13.05     99.0   
2   3         1  2020       Joe Burrow        Elite     12.99     

Unnamed: 0,position,seed,n_subsets,best_test_R2,best_test_MAE,best_test_RMSE,runtime_sec,leaderboard_csv,predictions_csv,metadata_json,model_pickle,best_model_tag,best_bases,best_interactions
0,QB,12,20,0.934425,0.759242,0.925335,120.293804,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,PDOM+|RDOM+|ADJ%|YPC|DAA|Draft Capital,SpeedxBMI
1,QB,34,20,0.891328,0.8841,1.037895,115.759147,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,PDOM+|RDOM+|Comp%|Breakout Age|YPA|TWP%|YPC|BT...,SpeedxBMI
2,QB,56,20,0.947423,0.655189,0.839391,124.281332,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,PDOM+|RDOM+|MTF/A|BTT%|Comp%|YPA|ADJ%|DAA|TWP%...,SpeedxBMI
3,QB,12,30,0.94397,0.618423,0.855344,177.171544,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,PDOM+|RDOM+|YCO/A|BTT%|YPC|Draft Capital|DAA|B...,SpeedxBMI
4,QB,34,30,0.891328,0.8841,1.037895,186.333766,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,/Users/chasesiegel/Desktop/Comp_Sci/Capstone/D...,GB,PDOM+|RDOM+|Comp%|Breakout Age|YPA|TWP%|YPC|BT...,SpeedxBMI
