# DT round regression (Ridge)

Predict draft round 1–8 (8 = undrafted) using combine+PFF features (same as Edges), KNN imputation, Ridge regression.
- Train: 2015–2023 (dt_training.csv; includes RAS, arm_length_inches, PFF pass rush + run defense).
- Test: dt_testing.csv filtered to 2024 and 2025 (drafted only; actual rounds 1–7).

In [94]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Same feature set as Edges: combine + RAS + arm_length + PFF (pass rush, run defense) + p4
FEATURES_WITH_COLLEGE = [
    'Broad Jump', 'Vertical', '40yd', 'Height', 'Weight',
    'speed_score', 'explosive_score', 'RAS', 'arm_length_inches',
    'true_pass_set_pass_rush_win_rate', 'pass_rush_win_rate', 'snap_counts_pass_rush', 'stop_percent', 'p4_conference'
]
CONTAINS_WITH_COLLEGE = [
    'contains_broad_jump', 'contains_vertical', 'contains_40yd', 'contains_height', 'contains_weight',
    'contains_speed_score', 'contains_explosive_score', 'contains_ras', 'contains_arm_length_inches',
    'contains_true_pass_set_pass_rush_win_rate', 'contains_pass_rush_win_rate', 'contains_snap_counts_pass_rush', 'contains_stop_percent',
    'contains_p4_conference'
]
FEATURES_WITH_COLLEGE_ALL = FEATURES_WITH_COLLEGE + CONTAINS_WITH_COLLEGE

In [95]:
# Load DT training (2015–2023); RAS and PFF already in dt_training.csv
df = pd.read_csv('../data/processed/dt_training.csv')
df = df[df['Year'].between(2015, 2023)].copy()
print('Train (2015–2023 DTs):', len(df))

Train (2015–2023 DTs): 154


In [96]:
# Print RAS and PFF availability (same as Edges)
total_count = len(df)
ras_count = df['RAS'].notna().sum()
print(f"Players with RAS score: {ras_count} out of {total_count} ({ras_count/total_count*100:.1f}%)")
print(f"Players with True Pass Set Win Rate: {df['true_pass_set_pass_rush_win_rate'].notna().sum()} out of {total_count}")
print(f"Players with Pass Rush Win Rate: {df['pass_rush_win_rate'].notna().sum()} out of {total_count}")
print(f"Players with Run Defense Stop %: {df['stop_percent'].notna().sum()} out of {total_count}")
arm_count = df['arm_length_inches'].notna().sum() if 'arm_length_inches' in df.columns else 0
print(f"Players with arm length: {arm_count} out of {total_count}")

Players with RAS score: 85 out of 154 (55.2%)
Players with True Pass Set Win Rate: 139 out of 154
Players with Pass Rush Win Rate: 139 out of 154
Players with Run Defense Stop %: 139 out of 154
Players with arm length: 0 out of 154


In [97]:
# Height to inches
def height_inches(h):
    if pd.isna(h): return np.nan
    if isinstance(h, (int, float)) and not (isinstance(h, float) and np.isnan(h)):
        return float(h)
    s = str(h).strip()
    if '-' in s:
        parts = s.split('-')
        return int(parts[0]) * 12 + int(parts[1])
    return np.nan
df['Height'] = df['Height'].apply(height_inches)

# Speed score
df['speed_score'] = np.where(
    df['40yd'].notna() & (df['40yd'] > 0),
    df['Weight'] * 200 / (df['40yd'] ** 4),
    np.nan
)

# Explosive score (z-scores from this pool)
mean_v = df['Vertical'].mean()
std_v = df['Vertical'].std()
mean_b = df['Broad Jump'].mean()
std_b = df['Broad Jump'].std()
if std_v == 0 or np.isnan(std_v): std_v = 1.0
if std_b == 0 or np.isnan(std_b): std_b = 1.0
df['explosive_score'] = (df['Vertical'] - mean_v).fillna(0) / std_v + (df['Broad Jump'] - mean_b).fillna(0) / std_b

# P4 conference mapping (hardcoded - no longer using defensive_stats CSV)
P4_WITH_PAC12 = {'SEC', 'Big Ten', 'Big 12', 'ACC', 'Pac-12'}
P4_NO_PAC12 = {'SEC', 'Big Ten', 'Big 12', 'ACC'}
school_alias = {
    'Ole Miss': 'Mississippi', 'Miami (FL)': 'Miami', 'Southern California': 'USC',
    'Central Florida': 'UCF', 'Brigham Young': 'BYU', 'Ohio St.': 'Ohio State',
    'Florida St.': 'Florida State', 'Kansas St.': 'Kansas State', 'Iowa St.': 'Iowa State',
    'Oklahoma St.': 'Oklahoma State', 'Penn St.': 'Penn State', 'San Diego St.': 'San Diego State',
}

# P4 schools (Power 4/5 conferences)
# SEC
SEC_SCHOOLS = {
    'Alabama', 'Arkansas', 'Auburn', 'Florida', 'Georgia', 'Kentucky', 'LSU', 'Mississippi',
    'Mississippi State', 'Missouri', 'South Carolina', 'Tennessee', 'Texas A&M', 'Vanderbilt',
    'Oklahoma', 'Texas'  # Added 2024
}
# Big Ten
BIG_TEN_SCHOOLS = {
    'Illinois', 'Indiana', 'Iowa', 'Maryland', 'Michigan', 'Michigan State', 'Minnesota',
    'Nebraska', 'Northwestern', 'Ohio State', 'Penn State', 'Purdue', 'Rutgers', 'Wisconsin',
    'UCLA', 'USC', 'Oregon', 'Washington'  # Added 2024
}
# Big 12
BIG_12_SCHOOLS = {
    'Baylor', 'Iowa State', 'Kansas', 'Kansas State', 'Oklahoma State', 'TCU', 'Texas Tech',
    'West Virginia', 'BYU', 'UCF', 'Cincinnati', 'Houston',  # Added 2023
    'Arizona', 'Arizona State', 'Colorado', 'Utah'  # Added 2024
}
# ACC
ACC_SCHOOLS = {
    'Boston College', 'Clemson', 'Duke', 'Florida State', 'Georgia Tech', 'Louisville', 'Miami',
    'North Carolina', 'North Carolina State', 'NC State', 'Pittsburgh', 'Syracuse', 'Virginia',
    'Virginia Tech', 'Wake Forest', 'California', 'SMU', 'Stanford'  # Added 2024
}
# Pac-12 (pre-2024 only)
PAC12_SCHOOLS = {
    'Arizona', 'Arizona State', 'California', 'Colorado', 'Oregon', 'Oregon State', 'Stanford',
    'UCLA', 'USC', 'Utah', 'Washington', 'Washington State'
}

# Combine into sets
P4_SCHOOLS = SEC_SCHOOLS | BIG_TEN_SCHOOLS | BIG_12_SCHOOLS | ACC_SCHOOLS | PAC12_SCHOOLS
P4_SCHOOLS_NO_PAC12 = SEC_SCHOOLS | BIG_TEN_SCHOOLS | BIG_12_SCHOOLS | ACC_SCHOOLS

def is_p4(row):
    s = row.get('School')
    if pd.isna(s) or s == '': return 0
    sn = school_alias.get(s, s)
    year = row.get('Year', 2023)
    schools = P4_SCHOOLS if year <= 2023 else P4_SCHOOLS_NO_PAC12
    return 1 if sn in schools else 0
df['p4_conference'] = df.apply(is_p4, axis=1)

# Contains flags (same as Edges)
df['contains_broad_jump'] = df['Broad Jump'].notna().astype(int)
df['contains_vertical'] = df['Vertical'].notna().astype(int)
df['contains_40yd'] = df['40yd'].notna().astype(int)
df['contains_height'] = df['Height'].notna().astype(int)
df['contains_weight'] = df['Weight'].notna().astype(int)
df['contains_speed_score'] = df['speed_score'].notna().astype(int)
df['contains_explosive_score'] = 1
df['contains_true_pass_set_pass_rush_win_rate'] = df['true_pass_set_pass_rush_win_rate'].notna().astype(int)
df['contains_pass_rush_win_rate'] = df['pass_rush_win_rate'].notna().astype(int)
df['contains_snap_counts_pass_rush'] = df['snap_counts_pass_rush'].notna().astype(int)
df['contains_stop_percent'] = df['stop_percent'].notna().astype(int) if 'stop_percent' in df.columns else 0
df['contains_ras'] = df['RAS'].notna().astype(int)
if 'arm_length_inches' not in df.columns:
    df['arm_length_inches'] = np.nan
df['contains_arm_length_inches'] = df['arm_length_inches'].notna().astype(int)
df['contains_p4_conference'] = df['School'].notna().astype(int)

In [99]:
# Target: round 1–7 if drafted, 8 if undrafted
y = np.where(
    df['Drafted'].astype(bool),
    np.clip(df['Round'].fillna(1).astype(int), 1, 7),
    8
)
X_raw = df[FEATURES_WITH_COLLEGE_ALL].copy()

# KNN imputation + scale
imputer = KNNImputer(n_neighbors=10)
X = imputer.fit_transform(X_raw)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Ridge regression
ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(X_scaled, y)

y_pred_train = np.clip(ridge.predict(X_scaled), 1, 8)
print('Train MAE (round 1–8):', round(mean_absolute_error(y, y_pred_train), 4))
print('Train samples:', len(y))

Train MAE (round 1–8): 1.3032
Train samples: 154


In [101]:
def prepare_test_dt(ddf):
    """Build same features as train (Height inches, speed_score, explosive_score, p4, PFF/arm contains_*)."""
    if ddf['Height'].dtype == object or (ddf['Height'].astype(str).str.contains('-', na=False).any()):
        ddf['Height'] = ddf['Height'].apply(height_inches)
    else:
        ddf['Height'] = pd.to_numeric(ddf['Height'], errors='coerce')
    ddf['speed_score'] = np.where(
        ddf['40yd'].notna() & (ddf['40yd'] > 0),
        ddf['Weight'] * 200 / (ddf['40yd'] ** 4),
        np.nan
    )
    ddf['explosive_score'] = (ddf['Vertical'] - mean_v).fillna(0) / std_v + (ddf['Broad Jump'] - mean_b).fillna(0) / std_b
    ddf['p4_conference'] = ddf.apply(is_p4, axis=1)
    ddf['contains_broad_jump'] = ddf['Broad Jump'].notna().astype(int)
    ddf['contains_vertical'] = ddf['Vertical'].notna().astype(int)
    ddf['contains_40yd'] = ddf['40yd'].notna().astype(int)
    ddf['contains_height'] = ddf['Height'].notna().astype(int)
    ddf['contains_weight'] = ddf['Weight'].notna().astype(int)
    ddf['contains_speed_score'] = ddf['speed_score'].notna().astype(int)
    ddf['contains_explosive_score'] = 1
    ddf['contains_true_pass_set_pass_rush_win_rate'] = ddf['true_pass_set_pass_rush_win_rate'].notna().astype(int) if 'true_pass_set_pass_rush_win_rate' in ddf.columns else 0
    ddf['contains_pass_rush_win_rate'] = ddf['pass_rush_win_rate'].notna().astype(int) if 'pass_rush_win_rate' in ddf.columns else 0
    ddf['contains_snap_counts_pass_rush'] = ddf['snap_counts_pass_rush'].notna().astype(int) if 'snap_counts_pass_rush' in ddf.columns else 0
    ddf['contains_stop_percent'] = ddf['stop_percent'].notna().astype(int) if 'stop_percent' in ddf.columns else 0
    ddf['contains_ras'] = ddf['RAS'].notna().astype(int)
    if 'arm_length_inches' not in ddf.columns:
        ddf['arm_length_inches'] = np.nan
    ddf['contains_arm_length_inches'] = ddf['arm_length_inches'].notna().astype(int)
    ddf['contains_p4_conference'] = ddf['School'].notna().astype(int)
    return ddf

# 2024 and 2025 from dt_testing.csv (same columns as train; PFF/RAS/arm already in)
dt_testing = pd.read_csv('../data/processed/dt_testing.csv')
dt_2024 = dt_testing[dt_testing['Year'] == 2024].copy()
dt_2025 = dt_testing[dt_testing['Year'] == 2025].copy()
dt_2024 = prepare_test_dt(dt_2024)
dt_2025 = prepare_test_dt(dt_2025)

X_24_raw = dt_2024[FEATURES_WITH_COLLEGE_ALL].copy()
X_25_raw = dt_2025[FEATURES_WITH_COLLEGE_ALL].copy()
X_24 = imputer.transform(X_24_raw)
X_25 = imputer.transform(X_25_raw)
X_24_scaled = scaler.transform(X_24)
X_25_scaled = scaler.transform(X_25)

pred_24 = np.clip(ridge.predict(X_24_scaled), 1, 8)
pred_25 = np.clip(ridge.predict(X_25_scaled), 1, 8)

actual_24 = dt_2024['Round'].astype(int).values
actual_25 = dt_2025['Round'].astype(int).values

def eval_metrics(actual, pred, label):
    mae = mean_absolute_error(actual, pred)
    rmse = np.sqrt(mean_squared_error(actual, pred))
    r2 = r2_score(actual, pred)
    exact = (np.round(pred) == actual).mean()
    within_1 = (np.abs(np.round(pred) - actual) <= 1).mean()
    print(f'{label} (n={len(actual)}): MAE={mae:.4f}, RMSE={rmse:.4f}, R²={r2:.4f}, Exact={exact:.2%}, Within-1={within_1:.2%}')

print('2024 DTs:')
eval_metrics(actual_24, pred_24, '2024')
print('2025 DTs:')
eval_metrics(actual_25, pred_25, '2025')

2024 DTs:
2024 (n=13): MAE=1.2414, RMSE=1.4882, R²=0.2661, Exact=15.38%, Within-1=69.23%
2025 DTs:
2025 (n=11): MAE=1.6266, RMSE=1.9502, R²=-3.6961, Exact=18.18%, Within-1=36.36%


In [102]:
# Dataframes: players with actual round, model prediction, tier label, and interpretation
def pred_round_to_tier(p):
    if p < 1.75: return ('Round 1 Tier', 'True 1st-round grade')
    if p < 2.75: return ('Round 2 Tier', 'Early Day 2')
    if p < 3.75: return ('Round 3 Tier', 'Late Day 2')
    if p < 4.75: return ('Round 4 Tier', 'Early Day 3')
    if p < 5.75: return ('Round 5 Tier', 'Mid Day 3')
    if p < 6.75: return ('Round 6 Tier', 'Late Day 3')
    return ('Round 7 / UDFA Tier', 'Fringe draftable')

dt_2024_display = dt_2024[['Round', 'Pick', 'Player', 'School', 'Year']].copy()
dt_2024_display['predicted_round'] = pred_24
dt_2024_display['tier_label'] = [pred_round_to_tier(x)[0] for x in pred_24]
dt_2024_display['interpretation'] = [pred_round_to_tier(x)[1] for x in pred_24]
dt_2024_display['Round'] = dt_2024_display['Round'].astype(int)

dt_2025_display = dt_2025[['Round', 'Pick', 'Player', 'School', 'Year']].copy()
dt_2025_display['predicted_round'] = pred_25
dt_2025_display['tier_label'] = [pred_round_to_tier(x)[0] for x in pred_25]
dt_2025_display['interpretation'] = [pred_round_to_tier(x)[1] for x in pred_25]
dt_2025_display['Round'] = dt_2025_display['Round'].astype(int)

print('2024 drafted DTs')
display(dt_2024_display)
print('2025 drafted DTs')
display(dt_2025_display)

2024 drafted DTs


Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1,16,Byron Murphy II,Texas,2024,2.091066,Round 2 Tier,Early Day 2
1,2,35,Ruke Orhorhoro,Clemson,2024,5.020187,Round 5 Tier,Mid Day 3
2,2,36,Johnny Newton,Illinois,2024,2.006664,Round 2 Tier,Early Day 2
3,2,38,T'Vondre Sweat,Texas,2024,1.0,Round 1 Tier,True 1st-round grade
4,2,39,Braden Fiske,Florida State,2024,3.719911,Round 3 Tier,Late Day 2
5,3,73,Kris Jenkins,Michigan,2024,3.745167,Round 3 Tier,Late Day 2
6,3,80,McKinnley Jackson,Texas A&M,2024,5.591153,Round 5 Tier,Mid Day 3
7,4,109,Maason Smith,LSU,2024,3.351625,Round 3 Tier,Late Day 2
8,4,111,Justin Eboigbe,Alabama,2024,4.720833,Round 4 Tier,Early Day 3
9,5,139,Logan Lee,Iowa,2024,5.449872,Round 5 Tier,Mid Day 3


2025 drafted DTs


Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
13,1,5,Mason Graham,Michigan,2025,2.668577,Round 2 Tier,Early Day 2
14,1,13,Kenneth Grant,Michigan,2025,2.745109,Round 2 Tier,Early Day 2
15,1,16,Walter Nolen,Ole Miss,2025,3.052453,Round 3 Tier,Late Day 2
16,1,21,Derrick Harmon,Oregon,2025,1.421597,Round 1 Tier,True 1st-round grade
17,1,28,Tyleik Williams,Ohio State,2025,2.907142,Round 3 Tier,Late Day 2
18,2,41,T.J. Sanders,South Carolina,2025,4.218352,Round 4 Tier,Early Day 3
19,2,43,Alfred Collins,Texas,2025,3.928776,Round 4 Tier,Early Day 3
20,3,62,Shemar Turner,Texas A&M,2025,3.651342,Round 3 Tier,Late Day 2
21,3,63,Omarr Norman-Lott,Tennessee,2025,7.275681,Round 7 / UDFA Tier,Fringe draftable
22,3,65,Darius Alexander,Toledo,2025,3.746063,Round 3 Tier,Late Day 2


In [103]:
# 2026 predictions (dt_drafted_2026.csv has PFF/RAS/arm from data_cleaning)
dt_2026 = pd.read_csv('dt_drafted_2026.csv')
dt_2026['Year'] = 2026
dt_2026 = prepare_test_dt(dt_2026)

X_26_raw = dt_2026[FEATURES_WITH_COLLEGE_ALL].copy()
X_26 = imputer.transform(X_26_raw)
X_26_scaled = scaler.transform(X_26)

pred_26 = np.clip(ridge.predict(X_26_scaled), 1, 8)

# Check if Round column exists and has valid values for evaluation
if 'Round' in dt_2026.columns and dt_2026['Round'].notna().any():
    actual_26 = dt_2026['Round'].astype(int).values
    print('2026 DTs:')
    eval_metrics(actual_26, pred_26, '2026')
else:
    print(f'2026 DTs (n={len(pred_26)}): Predictions generated (no actual rounds available)')

# Display 2026 predictions
dt_2026_display = dt_2026[['Round', 'Pick', 'Player', 'School', 'Year']].copy()
dt_2026_display['predicted_round'] = pred_26
dt_2026_display['tier_label'] = [pred_round_to_tier(x)[0] for x in pred_26]
dt_2026_display['interpretation'] = [pred_round_to_tier(x)[1] for x in pred_26]
if 'Round' in dt_2026_display.columns and dt_2026_display['Round'].notna().any():
    dt_2026_display['Round'] = dt_2026_display['Round'].astype(int)

print('\n2026 drafted DTs')
display(dt_2026_display)

2026 DTs:
2026 (n=26): MAE=1.9953, RMSE=2.3663, R²=-0.2732, Exact=11.54%, Within-1=34.62%

2026 drafted DTs


Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1,3,Peter Woods,Clemson,2026,1.813948,Round 2 Tier,Early Day 2
1,1,14,Caleb Banks,Florida,2026,4.26397,Round 4 Tier,Early Day 3
2,1,21,Christen Miller,Georgia,2026,4.730665,Round 4 Tier,Early Day 3
3,2,39,A'Mauri Washington,Oregon,2026,3.708243,Round 3 Tier,Late Day 2
4,2,47,Kayden McDonald,Ohio State,2026,4.842359,Round 5 Tier,Mid Day 3
5,2,58,Dontay Corleone,Cincinnati,2026,6.900476,Round 7 / UDFA Tier,Fringe draftable
6,2,61,Domonique Orange,Iowa State,2026,5.529832,Round 5 Tier,Mid Day 3
7,3,79,C.J. Fite,Arizona State,2026,6.694004,Round 6 Tier,Late Day 3
8,3,82,Darrell Jackson Jr.,Florida State,2026,2.59579,Round 2 Tier,Early Day 2
9,4,114,Skyler Gill-Howard,Texas Tech,2026,6.004661,Round 6 Tier,Late Day 3


In [104]:
# Model results on entire training set (2017–2023), ordered by predicted_round
train_display = df[['Round', 'Pick', 'Player', 'School', 'Year']].copy()
train_display['predicted_round'] = y_pred_train
train_display['tier_label'] = [pred_round_to_tier(x)[0] for x in y_pred_train]
train_display['interpretation'] = [pred_round_to_tier(x)[1] for x in y_pred_train]
train_display = train_display.sort_values('predicted_round').reset_index(drop=True)
train_display

Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1.0,6.0,Leonard Williams,USC,2015,1.000000,Round 1 Tier,True 1st-round grade
1,1.0,12.0,Vita Vea,Washington,2018,1.000000,Round 1 Tier,True 1st-round grade
2,1.0,13.0,Jordan Davis,Georgia,2022,1.000000,Round 1 Tier,True 1st-round grade
3,2.0,37.0,Chris Jones,Mississippi State,2016,1.306975,Round 1 Tier,True 1st-round grade
4,1.0,17.0,Jonathan Allen,Alabama,2017,1.342204,Round 1 Tier,True 1st-round grade
...,...,...,...,...,...,...,...,...
149,,,LaBryan Ray,Alabama,2022,8.000000,Round 7 / UDFA Tier,Fringe draftable
150,,,Dan Archibong,Temple,2021,8.000000,Round 7 / UDFA Tier,Fringe draftable
151,,,Naquan Jones,Michigan St.,2021,8.000000,Round 7 / UDFA Tier,Fringe draftable
152,,,Lorenzo Neal,Purdue,2021,8.000000,Round 7 / UDFA Tier,Fringe draftable
