# DT round regression (Ridge)

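Predict draft round on a 1–8 scale (8 = undrafted) for defensive tackles (DTs) using combine, RAS, and final-season college features, with KNN imputation followed by Ridge regression.
- Train: 2017–2023 DTs (dt_training_data.csv + dt_testing_data.csv combined).
- Test: dt_drafted_2024.csv, dt_drafted_2025.csv, and dt_drafted_2026.csv (drafted players only; actual rounds 1–7).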

In [16]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Relative Athletic Score (RAS) table from ras.csv, limited to defensive tackles.
# RAS is coerced to numeric (unparseable entries become NaN) and Year is cast to int.
ras_df = pd.read_csv('../data/raw/ras.csv')
ras_df = ras_df.loc[ras_df['Pos'] == 'DT'].assign(
    RAS=lambda t: pd.to_numeric(t['RAS'], errors='coerce'),
    Year=lambda t: t['Year'].astype(int),
)
# Keep a single row per (Name, Year): these two columns are the merge key used on the player tables.
ras_dt = ras_df.drop_duplicates(subset=['Name', 'Year'])[['Name', 'Year', 'RAS']]

# Model inputs: combine measurements, derived athletic scores, RAS,
# final-season college production, and the power-conference indicator.
FEATURES_WITH_COLLEGE = [
    'Broad Jump',
    'Vertical',
    '40yd',
    'Height',
    'Weight',
    'speed_score',
    'explosive_score',
    'RAS',
    'QB_Hurry_final_season',
    'TFL_final_season',
    'Sacks_final_season',
    'p4_conference',
]
# One availability flag per feature, named "contains_" + the feature name
# lower-cased with spaces replaced by underscores (same order as the features).
CONTAINS_WITH_COLLEGE = [
    'contains_' + feature.lower().replace(' ', '_')
    for feature in FEATURES_WITH_COLLEGE
]
FEATURES_WITH_COLLEGE_ALL = [*FEATURES_WITH_COLLEGE, *CONTAINS_WITH_COLLEGE]

In [17]:
# Pool the processed train/test splits, then keep only 2017–2023 rows whose position is DT
train_dt = pd.read_csv('../data/processed/dt_training_data.csv')
test_dt = pd.read_csv('../data/processed/dt_testing_data.csv')
df = pd.concat([train_dt, test_dt], ignore_index=True)
_in_train_years = df['Year'].between(2017, 2023)
_is_dt = df['Pos'] == 'DT'
df = df[_in_train_years & _is_dt].copy()
# Attach RAS by (Player, Year); the left join keeps players without a RAS entry (RAS = NaN).
# The right-hand key column 'Name' is redundant after the merge and is dropped.
df = df.merge(
    ras_dt,
    left_on=['Player', 'Year'],
    right_on=['Name', 'Year'],
    how='left',
).drop(columns=['Name'], errors='ignore')
print('Train (2017–2023 DTs):', len(df))

Train (2017–2023 DTs): 100


In [18]:
# Report how many rows of the training pool have a non-missing RAS value
total_count = len(df)
ras_count = df['RAS'].notna().sum()
if total_count > 0:
    ras_pct = ras_count / total_count * 100
else:
    # Empty pool: avoid dividing by zero
    ras_pct = 0
print(f"Players with RAS score: {ras_count} out of {total_count} ({ras_pct:.1f}%)")

Players with RAS score: 79 out of 100 (79.0%)


In [19]:
# Height to inches
def height_inches(h):
    """Convert a height value to inches.

    Parameters
    ----------
    h : number, str, or missing
        - A number (Python or NumPy scalar) is taken to already be in inches.
        - A "feet-inches" string such as "6-3" is converted to 6 * 12 + 3.
        - A plain numeric string such as "75" is parsed as inches.

    Returns
    -------
    float
        Height in inches, or np.nan when the value is missing or cannot be parsed.
    """
    if pd.isna(h):
        return np.nan
    # np.integer / np.floating cover NumPy scalars (e.g. np.int64), which are not
    # instances of the built-in int and would otherwise fall through to NaN.
    if isinstance(h, (int, float, np.integer, np.floating)):
        return float(h)
    s = str(h).strip()
    try:
        if '-' in s:
            parts = s.split('-')
            return float(parts[0]) * 12 + float(parts[1])
        # No dash: accept a numeric string that is already in inches.
        return float(s)
    except ValueError:
        # A malformed entry ("6-x", "abc", "") becomes missing instead of aborting the whole column.
        return np.nan
df['Height'] = df['Height'].apply(height_inches)

# Speed score: Weight * 200 / 40yd^4, left as NaN when there is no positive 40yd time
_forty = df['40yd']
df['speed_score'] = (df['Weight'] * 200 / (_forty ** 4)).where(_forty.notna() & (_forty > 0))

# Explosive score: sum of Vertical and Broad Jump z-scores, standardised on this pool.
# mean_v / std_v / mean_b / std_b stay module-level because prepare_dt_df reuses them.
mean_v, std_v = df['Vertical'].mean(), df['Vertical'].std()
mean_b, std_b = df['Broad Jump'].mean(), df['Broad Jump'].std()
# A zero or undefined spread cannot be divided by; fall back to unit scale
if np.isnan(std_v) or std_v == 0:
    std_v = 1.0
if np.isnan(std_b) or std_b == 0:
    std_b = 1.0
# A missing jump contributes 0, i.e. it is treated as exactly the pool mean
_vertical_z = (df['Vertical'] - mean_v).fillna(0) / std_v
_broad_z = (df['Broad Jump'] - mean_b).fillna(0) / std_b
df['explosive_score'] = _vertical_z + _broad_z

# Power-conference membership, looked up from the defensive stats file.
# Two conference sets are kept: one that counts the Pac-12 and one that does not.
P4_WITH_PAC12 = {'SEC', 'Big Ten', 'Big 12', 'ACC', 'Pac-12'}
P4_NO_PAC12 = P4_WITH_PAC12 - {'Pac-12'}
# Alternate school spellings -> the spelling looked up in the school sets below
# (presumably the names used in the stats file's 'Team' column; verify against the data).
school_alias = {
    'Ole Miss': 'Mississippi',
    'Miami (FL)': 'Miami',
    'Southern California': 'USC',
    'Central Florida': 'UCF',
    'Brigham Young': 'BYU',
    'Ohio St.': 'Ohio State',
    'Florida St.': 'Florida State',
    'Kansas St.': 'Kansas State',
    'Iowa St.': 'Iowa State',
    'Oklahoma St.': 'Oklahoma State',
    'Penn St.': 'Penn State',
    'San Diego St.': 'San Diego State',
}
_stats = pd.read_csv('../data/processed/defensive_stats_2016_to_2025.csv')
# A team is included if any of its rows in the stats file lists a matching conference
_with_pac12_rows = _stats['Conference'].isin(P4_WITH_PAC12)
_no_pac12_rows = _stats['Conference'].isin(P4_NO_PAC12)
P4_SCHOOLS = set(_stats.loc[_with_pac12_rows, 'Team'].unique())
P4_SCHOOLS_NO_PAC12 = set(_stats.loc[_no_pac12_rows, 'Team'].unique())

def is_p4(row):
    """Return 1 if the row's school is in the power-conference school set, else 0.

    `row` must support `.get` (a DataFrame row or a dict). A missing or empty
    'School' yields 0. The school name is first translated through `school_alias`.
    Rows with Year <= 2023 (2023 is assumed when 'Year' is absent) are checked
    against the set that includes Pac-12 schools; later years use the set without them.
    """
    school = row.get('School')
    if pd.isna(school) or school == '':
        return 0
    canonical = school_alias.get(school, school)
    if row.get('Year', 2023) <= 2023:
        eligible = P4_SCHOOLS
    else:
        eligible = P4_SCHOOLS_NO_PAC12
    return int(canonical in eligible)
df['p4_conference'] = df.apply(is_p4, axis=1)

# Availability flags: 1 when the source column is present for the row, else 0.
# A source of None marks a flag that is always 1 (explosive_score is never NaN
# because missing jumps are filled with 0 when it is computed).
_FLAG_SOURCES = [
    ('contains_broad_jump', 'Broad Jump'),
    ('contains_vertical', 'Vertical'),
    ('contains_40yd', '40yd'),
    ('contains_height', 'Height'),
    ('contains_weight', 'Weight'),
    ('contains_speed_score', 'speed_score'),
    ('contains_explosive_score', None),
    ('contains_qb_hurry_final_season', 'QB_Hurry_final_season'),
    ('contains_tfl_final_season', 'TFL_final_season'),
    ('contains_sacks_final_season', 'Sacks_final_season'),
    ('contains_ras', 'RAS'),
    # p4_conference is derived from School, so its flag tracks School availability
    ('contains_p4_conference', 'School'),
]
for _flag, _source in _FLAG_SOURCES:
    df[_flag] = 1 if _source is None else df[_source].notna().astype(int)

In [20]:
# Target: actual round clamped to 1–7 for drafted players, 8 for undrafted ones.
# A missing Round is filled with 1 before clamping (only relevant for drafted rows).
_is_drafted = df['Drafted'].astype(bool)
_drafted_round = df['Round'].fillna(1).astype(int).clip(lower=1, upper=7)
y = np.where(_is_drafted, _drafted_round, 8)
X_raw = df.loc[:, FEATURES_WITH_COLLEGE_ALL].copy()

# Fill missing features from the 10 nearest neighbours, then standardise.
# NOTE(review): imputation runs on unscaled features, so neighbour distances are
# presumably dominated by large-magnitude columns — confirm this is intended.
imputer = KNNImputer(n_neighbors=10)
X = imputer.fit_transform(X_raw)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Ridge regression on the standardised matrix
ridge = Ridge(alpha=1.0, random_state=42).fit(X_scaled, y)

# In-sample predictions, clamped to the 1–8 target range
y_pred_train = ridge.predict(X_scaled).clip(1, 8)
print('Train MAE (round 1–8):', round(mean_absolute_error(y, y_pred_train), 4))
print('Train samples:', len(y))

Train MAE (round 1–8): 1.3991
Train samples: 100


In [21]:
def prepare_dt_df(csv_path, year):
    """Load a DT CSV and build the same feature columns as the training frame.

    Parameters
    ----------
    csv_path : str
        Path of the CSV to read.
    year : int
        Draft year written into the 'Year' column of every row.

    Returns
    -------
    pandas.DataFrame
        The loaded rows plus Height in inches, speed_score, explosive_score,
        p4_conference, RAS (left-merged on Player/Year) and the contains_* flags.

    Uses the module-level height_inches, is_p4, ras_dt and the training-pool
    statistics mean_v / std_v / mean_b / std_b.
    """
    ddf = pd.read_csv(csv_path)
    ddf['Year'] = year

    # Heights stored as text (or containing a dash) go through height_inches;
    # otherwise the column is coerced to numeric.
    raw_height = ddf['Height']
    needs_parsing = raw_height.dtype == object or raw_height.astype(str).str.contains('-', na=False).any()
    if needs_parsing:
        ddf['Height'] = raw_height.apply(height_inches)
    else:
        ddf['Height'] = pd.to_numeric(raw_height, errors='coerce')

    # Speed score is only defined for a positive 40yd time
    forty = ddf['40yd']
    ddf['speed_score'] = (ddf['Weight'] * 200 / (forty ** 4)).where(forty.notna() & (forty > 0))

    # z-scores use the training-pool mean/std; a missing jump contributes 0
    vertical_z = (ddf['Vertical'] - mean_v).fillna(0) / std_v
    broad_z = (ddf['Broad Jump'] - mean_b).fillna(0) / std_b
    ddf['explosive_score'] = vertical_z + broad_z

    ddf['p4_conference'] = ddf.apply(is_p4, axis=1)

    # Attach RAS; the right-hand key column 'Name' is dropped after the merge
    ddf = ddf.merge(
        ras_dt,
        left_on=['Player', 'Year'],
        right_on=['Name', 'Year'],
        how='left',
    ).drop(columns=['Name'], errors='ignore')

    # (flag column, source column); a source of None means the flag is always 1
    flag_sources = [
        ('contains_broad_jump', 'Broad Jump'),
        ('contains_vertical', 'Vertical'),
        ('contains_40yd', '40yd'),
        ('contains_height', 'Height'),
        ('contains_weight', 'Weight'),
        ('contains_speed_score', 'speed_score'),
        ('contains_explosive_score', None),
        ('contains_qb_hurry_final_season', 'QB_Hurry_final_season'),
        ('contains_tfl_final_season', 'TFL_final_season'),
        ('contains_sacks_final_season', 'Sacks_final_season'),
        ('contains_ras', 'RAS'),
        ('contains_p4_conference', 'School'),
    ]
    for flag, source in flag_sources:
        ddf[flag] = 1 if source is None else ddf[source].notna().astype(int)
    return ddf

# Held-out draft classes (drafted only; actual round 1–7).
# Each class goes through the already-fitted imputer, scaler and Ridge model.

# --- 2024 ---
dt_2024 = prepare_dt_df('dt_drafted_2024.csv', 2024)
X_24_raw = dt_2024[FEATURES_WITH_COLLEGE_ALL].copy()
X_24 = imputer.transform(X_24_raw)
X_24_scaled = scaler.transform(X_24)
pred_24 = ridge.predict(X_24_scaled).clip(1, 8)  # clamp to the 1–8 target range
actual_24 = dt_2024['Round'].astype(int).values

# --- 2025 ---
dt_2025 = prepare_dt_df('dt_drafted_2025.csv', 2025)
X_25_raw = dt_2025[FEATURES_WITH_COLLEGE_ALL].copy()
X_25 = imputer.transform(X_25_raw)
X_25_scaled = scaler.transform(X_25)
pred_25 = ridge.predict(X_25_scaled).clip(1, 8)
actual_25 = dt_2025['Round'].astype(int).values

def eval_metrics(actual, pred, label):
    """Print a one-line summary of regression and hit-rate metrics.

    Parameters
    ----------
    actual : array-like
        True rounds.
    pred : array-like
        Predicted (continuous) rounds.
    label : str
        Prefix for the printed line.

    Prints MAE, RMSE and R² on the raw predictions, plus the share of
    predictions that, after rounding with np.round, equal the actual round
    (Exact) or land within one round of it (Within-1). Returns None.
    """
    rounded = np.round(pred)
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(actual, pred)
    exact = (rounded == actual).mean()
    within_1 = (np.abs(rounded - actual) <= 1).mean()
    print(f'{label} (n={len(actual)}): MAE={mae:.4f}, RMSE={rmse:.4f}, R²={r2:.4f}, Exact={exact:.2%}, Within-1={within_1:.2%}')

# Header line followed by the metric summary, once per held-out draft class
for _label, _actual, _pred in (('2024', actual_24, pred_24), ('2025', actual_25, pred_25)):
    print(f'{_label} DTs:')
    eval_metrics(_actual, _pred, _label)

2024 DTs:
2024 (n=13): MAE=1.4445, RMSE=1.6507, R²=0.0970, Exact=15.38%, Within-1=46.15%
2025 DTs:
2025 (n=11): MAE=2.4720, RMSE=2.7064, R²=-8.0434, Exact=0.00%, Within-1=27.27%


In [22]:
# Dataframes: players with actual round, model prediction, tier label, and interpretation
def pred_round_to_tier(p):
    """Map a continuous predicted round to a (tier label, interpretation) tuple.

    Tiers are cut at x.75: a prediction below 1.75 is Round 1, below 2.75 is
    Round 2, and so on through Round 6 (below 6.75). Anything else — including
    values that compare below no cutoff — falls into the Round 7 / UDFA tier.
    """
    cutoffs = (
        (1.75, 'Round 1 Tier', 'True 1st-round grade'),
        (2.75, 'Round 2 Tier', 'Early Day 2'),
        (3.75, 'Round 3 Tier', 'Late Day 2'),
        (4.75, 'Round 4 Tier', 'Early Day 3'),
        (5.75, 'Round 5 Tier', 'Mid Day 3'),
        (6.75, 'Round 6 Tier', 'Late Day 3'),
    )
    # First cutoff the prediction falls strictly below decides the tier
    for upper, tier, meaning in cutoffs:
        if p < upper:
            return (tier, meaning)
    return ('Round 7 / UDFA Tier', 'Fringe draftable')

def _tier_display(frame, preds):
    """Build the display table for one draft class.

    Copies the identifying columns from `frame`, appends the predicted round with
    its tier label and interpretation, and casts the actual Round to int.
    """
    table = frame[['Round', 'Pick', 'Player', 'School', 'Year']].copy()
    tiers = [pred_round_to_tier(value) for value in preds]
    table['predicted_round'] = preds
    table['tier_label'] = [tier for tier, _ in tiers]
    table['interpretation'] = [meaning for _, meaning in tiers]
    table['Round'] = table['Round'].astype(int)
    return table


dt_2024_display = _tier_display(dt_2024, pred_24)
dt_2025_display = _tier_display(dt_2025, pred_25)

print('2024 drafted DTs')
display(dt_2024_display)
print('2025 drafted DTs')
display(dt_2025_display)

2024 drafted DTs


Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1,16,Byron Murphy II,Texas,2024,3.3186,Round 3 Tier,Late Day 2
1,2,35,Ruke Orhorhoro,Clemson,2024,3.537523,Round 3 Tier,Late Day 2
2,2,36,Johnny Newton,Illinois,2024,2.408132,Round 2 Tier,Early Day 2
3,2,38,T'Vondre Sweat,Texas,2024,2.589779,Round 2 Tier,Early Day 2
4,2,39,Braden Fiske,Florida State,2024,3.113617,Round 3 Tier,Late Day 2
5,3,73,Kris Jenkins,Michigan,2024,4.6117,Round 4 Tier,Early Day 3
6,3,80,McKinnley Jackson,Texas A&M,2024,5.713637,Round 5 Tier,Mid Day 3
7,4,109,Maason Smith,LSU,2024,3.460729,Round 3 Tier,Late Day 2
8,4,111,Justin Eboigbe,Alabama,2024,4.247271,Round 4 Tier,Early Day 3
9,5,139,Logan Lee,Iowa,2024,6.161548,Round 6 Tier,Late Day 3


2025 drafted DTs


Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1,5,Mason Graham,Michigan,2025,4.385384,Round 4 Tier,Early Day 3
1,1,13,Kenneth Grant,Michigan,2025,3.203304,Round 3 Tier,Late Day 2
2,1,16,Walter Nolen,Ole Miss,2025,4.731168,Round 4 Tier,Early Day 3
3,1,21,Derrick Harmon,Oregon,2025,2.010359,Round 2 Tier,Early Day 2
4,1,28,Tyleik Williams,Ohio State,2025,3.850835,Round 4 Tier,Early Day 3
5,2,41,T.J. Sanders,South Carolina,2025,3.79543,Round 4 Tier,Early Day 3
6,2,43,Alfred Collins,Texas,2025,6.148021,Round 6 Tier,Late Day 3
7,3,62,Shemar Turner,Texas A&M,2025,4.309765,Round 4 Tier,Early Day 3
8,3,63,Omarr Norman-Lott,Tennessee,2025,6.832316,Round 7 / UDFA Tier,Fringe draftable
9,3,65,Darius Alexander,Toledo,2025,4.652975,Round 4 Tier,Early Day 3


In [23]:
# 2026 evaluation
dt_2026 = prepare_dt_df('dt_drafted_2026.csv', 2026)

X_26_raw = dt_2026[FEATURES_WITH_COLLEGE_ALL].copy()
X_26 = imputer.transform(X_26_raw)
X_26_scaled = scaler.transform(X_26)

pred_26 = np.clip(ridge.predict(X_26_scaled), 1, 8)

# Positional mask of the rows that have an actual round. The file may carry no
# Round column at all, or only some filled-in rounds; casting NaN to int raises,
# so metrics are computed on the known rows only.
if 'Round' in dt_2026.columns:
    has_round_26 = dt_2026['Round'].notna().to_numpy()
else:
    has_round_26 = np.zeros(len(dt_2026), dtype=bool)

if has_round_26.any():
    actual_26 = dt_2026.loc[has_round_26, 'Round'].astype(int).values
    print('2026 DTs:')
    eval_metrics(actual_26, pred_26[has_round_26], '2026')
else:
    print(f'2026 DTs (n={len(pred_26)}): Predictions generated (no actual rounds available)')

# Display 2026 predictions.
# reindex (rather than column selection) yields an all-NaN column when one of the
# identifying columns is absent instead of raising KeyError.
dt_2026_display = dt_2026.reindex(columns=['Round', 'Pick', 'Player', 'School', 'Year']).copy()
dt_2026_display['predicted_round'] = pred_26
dt_2026_display['tier_label'] = [pred_round_to_tier(x)[0] for x in pred_26]
dt_2026_display['interpretation'] = [pred_round_to_tier(x)[1] for x in pred_26]
# Only cast to int when every round is known; otherwise keep the NaN gaps as they are
if has_round_26.size and has_round_26.all():
    dt_2026_display['Round'] = dt_2026_display['Round'].astype(int)

print('\n2026 drafted DTs')
display(dt_2026_display)

2026 DTs:
2026 (n=26): MAE=2.0120, RMSE=2.2831, R²=-0.1852, Exact=7.69%, Within-1=30.77%

2026 drafted DTs


Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1,3,Peter Woods,Clemson,2026,3.165489,Round 3 Tier,Late Day 2
1,1,14,Caleb Banks,Florida,2026,2.464156,Round 2 Tier,Early Day 2
2,1,21,Christen Miller,Georgia,2026,5.442383,Round 5 Tier,Mid Day 3
3,2,39,A'Mauri Washington,Oregon,2026,4.709124,Round 4 Tier,Early Day 3
4,2,47,Kayden McDonald,Ohio State,2026,4.470138,Round 4 Tier,Early Day 3
5,2,58,Dontay Corleone,Cincinnati,2026,5.254674,Round 5 Tier,Mid Day 3
6,2,61,Domonique Orange,Iowa State,2026,5.330873,Round 5 Tier,Mid Day 3
7,3,79,C.J. Fite,Arizona State,2026,6.162543,Round 6 Tier,Late Day 3
8,3,82,Darrell Jackson Jr.,Florida State,2026,3.741948,Round 3 Tier,Late Day 2
9,4,114,Skyler Gill-Howard,Texas Tech,2026,7.608773,Round 7 / UDFA Tier,Fringe draftable


In [24]:
# In-sample results for the whole 2017–2023 training pool, best predicted round first
_train_tiers = [pred_round_to_tier(value) for value in y_pred_train]
train_display = (
    df[['Round', 'Pick', 'Player', 'School', 'Year']]
    .assign(
        predicted_round=y_pred_train,
        tier_label=[tier for tier, _ in _train_tiers],
        interpretation=[meaning for _, meaning in _train_tiers],
    )
    .sort_values('predicted_round')
    .reset_index(drop=True)
)
train_display

Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1.0,13.0,Jordan Davis,Georgia,2022,1.000000,Round 1 Tier,True 1st-round grade
1,1.0,12.0,Vita Vea,Washington,2018,1.345115,Round 1 Tier,True 1st-round grade
2,1.0,19.0,Calijah Kancey,Pittsburgh,2023,1.742699,Round 1 Tier,True 1st-round grade
3,3.0,93.0,Montravius Adams,Auburn,2017,1.749000,Round 1 Tier,True 1st-round grade
4,1.0,17.0,Jonathan Allen,Alabama,2017,1.937257,Round 2 Tier,Early Day 2
...,...,...,...,...,...,...,...,...
95,,,Austin Faoliu,Oregon,2021,7.824631,Round 7 / UDFA Tier,Fringe draftable
96,,,George Silvanic,Air Force,2021,7.883564,Round 7 / UDFA Tier,Fringe draftable
97,,,Elijah Ponder,Cincinnati,2021,8.000000,Round 7 / UDFA Tier,Fringe draftable
98,,,Dan Archibong,Temple,2021,8.000000,Round 7 / UDFA Tier,Fringe draftable
