# LB round regression (Ridge)

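Predict draft round 1–8 (8 = undrafted) for linebackers (ILB, LB, OLB) from combine, RAS, and final-season college features, using KNN imputation followed by Ridge regression.
- Train: 2017–2023 (lb_training + lb_testing combined).
- Test: lb_drafted_2024.csv, lb_drafted_2025.csv (drafted only; actual rounds 1–7).
- Predict: lb_drafted_2026.csv (scored against actual rounds only when they are available).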

In [17]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# RAS lookup for LBs from ras.csv, one row per (Name, Year)
ras_df = pd.read_csv('../data/raw/ras.csv')
ras_df = ras_df[ras_df['Pos'].isin(['ILB', 'LB', 'OLB'])].copy()
# Non-numeric RAS / Year entries become NaN instead of raising.
ras_df['RAS'] = pd.to_numeric(ras_df['RAS'], errors='coerce')
ras_df['Year'] = pd.to_numeric(ras_df['Year'], errors='coerce')
# Rows without a usable score or year can never add information to a left merge on
# (Name, Year), so drop them *before* de-duplicating. Otherwise a duplicate whose
# first row has a missing RAS would shadow a later row carrying the real score.
ras_df = ras_df.dropna(subset=['RAS', 'Year'])
ras_df['Year'] = ras_df['Year'].astype(int)
ras_lb = ras_df[['Name', 'Year', 'RAS']].drop_duplicates(subset=['Name', 'Year'])

# Model inputs, in the column order fed to the imputer / scaler / regressor.
FEATURES_WITH_COLLEGE = [
    # Combine measurements
    'Broad Jump',
    'Vertical',
    '40yd',
    'Height',
    'Weight',
    # Derived athletic scores
    'speed_score',
    'explosive_score',
    'RAS',
    # Final-season college production
    'QB_Hurry_final_season',
    'TFL_final_season',
    'Sacks_final_season',
    'PD_final_season',
    'SOLO_final_season',
    'TOT_final_season',
    # Conference indicator
    'p4_conference',
]
# One presence flag per feature, same order: 'Broad Jump' -> 'contains_broad_jump'.
CONTAINS_WITH_COLLEGE = [
    'contains_' + feature.lower().replace(' ', '_')
    for feature in FEATURES_WITH_COLLEGE
]
FEATURES_WITH_COLLEGE_ALL = [*FEATURES_WITH_COLLEGE, *CONTAINS_WITH_COLLEGE]

In [18]:
# Training pool: lb_training + lb_testing stacked, restricted to seasons 2017–2023
# and LB positions (ILB, LB, OLB), with RAS attached by (Player, Year).
train_lb = pd.read_csv('../data/processed/lb_training_data.csv')
test_lb = pd.read_csv('../data/processed/lb_testing_data.csv')
df = (
    pd.concat([train_lb, test_lb], ignore_index=True)
    .loc[lambda pool: pool['Year'].between(2017, 2023)]
    .loc[lambda pool: pool['Pos'].isin(['ILB', 'LB', 'OLB'])]
    # Left merge keeps every player; those with no matching RAS row get NaN.
    .merge(ras_lb, left_on=['Player', 'Year'], right_on=['Name', 'Year'], how='left')
    # 'Name' is only the right-hand join key and duplicates 'Player'.
    .drop(columns=['Name'], errors='ignore')
)
print('Train (2017–2023 LBs):', len(df))

Train (2017–2023 LBs): 249


In [25]:
# Report how many players in the training pool have a RAS value after the merge
total_count = len(df)
ras_count = df['RAS'].notna().sum()
if total_count > 0:
    ras_pct = ras_count / total_count * 100
else:
    # Empty pool: avoid dividing by zero.
    ras_pct = 0
print(f"Players with RAS score: {ras_count} out of {total_count} ({ras_pct:.1f}%)")

Players with RAS score: 190 out of 249 (76.3%)


In [19]:
# Height to inches
def height_inches(h):
    """Convert a height value to inches.

    Accepted inputs:
      - a number (treated as already being in inches),
      - a numeric string such as "74" or "74.0",
      - a feet-inches string such as "6-2" (-> 74).

    Returns a float, or NaN when the value is missing or cannot be parsed.
    Unparseable text never raises, so one bad cell cannot abort a whole
    ``Series.apply`` call.
    """
    if pd.isna(h):
        return np.nan
    # np.integer is listed explicitly because NumPy integers are not `int` subclasses.
    if isinstance(h, (int, float, np.integer, np.floating)):
        return float(h)
    s = str(h).strip()
    try:
        if '-' in s:
            # "feet-inches"; only the first two fields are used.
            parts = s.split('-')
            return float(int(parts[0]) * 12 + int(parts[1]))
        # Plain numeric text is taken to be inches.
        return float(s)
    except ValueError:
        # e.g. "6-Feb", "6-", or free text.
        return np.nan
df['Height'] = df['Height'].apply(height_inches)

# Speed score: Weight * 200 / 40yd**4, defined only for a recorded, positive 40 time
forty_time = df['40yd']
has_valid_forty = forty_time.notna() & (forty_time > 0)
df['speed_score'] = np.where(
    has_valid_forty,
    df['Weight'] * 200 / (forty_time ** 4),
    np.nan
)

# Explosive score: Vertical z-score + Broad Jump z-score, standardised on this pool.
# The four statistics stay module-level because prepare_lb_df reuses them.
mean_v, std_v = df['Vertical'].mean(), df['Vertical'].std()
mean_b, std_b = df['Broad Jump'].mean(), df['Broad Jump'].std()
# A zero or undefined spread would make the division meaningless; fall back to 1.
if std_v == 0 or np.isnan(std_v):
    std_v = 1.0
if std_b == 0 or np.isnan(std_b):
    std_b = 1.0
# A missing jump contributes 0, i.e. it is treated as exactly average.
vertical_z = (df['Vertical'] - mean_v).fillna(0) / std_v
broad_z = (df['Broad Jump'] - mean_b).fillna(0) / std_b
df['explosive_score'] = vertical_z + broad_z

# P4 conference (from defensive_stats)
P4_NO_PAC12 = {'SEC', 'Big Ten', 'Big 12', 'ACC'}
P4_WITH_PAC12 = P4_NO_PAC12 | {'Pac-12'}
# School spellings that is_p4 rewrites before looking them up in the P4 sets.
school_alias = {
    'Ole Miss': 'Mississippi',
    'Miami (FL)': 'Miami',
    'Southern California': 'USC',
    'Central Florida': 'UCF',
    'Brigham Young': 'BYU',
    'Ohio St.': 'Ohio State',
    'Florida St.': 'Florida State',
    'Kansas St.': 'Kansas State',
    'Iowa St.': 'Iowa State',
    'Oklahoma St.': 'Oklahoma State',
    'Penn St.': 'Penn State',
    'San Diego St.': 'San Diego State',
}
_stats = pd.read_csv('../data/processed/defensive_stats_2016_to_2025.csv')
# NOTE(review): no season filter is applied here, so a team is included if it
# appears under a listed conference in any row of the file — confirm intended.
_conference = _stats['Conference']
P4_SCHOOLS = set(_stats.loc[_conference.isin(P4_WITH_PAC12), 'Team'].unique())
P4_SCHOOLS_NO_PAC12 = set(_stats.loc[_conference.isin(P4_NO_PAC12), 'Team'].unique())

def is_p4(row):
    """Return 1 if the row's School is in the P4 school set for its Year, else 0.

    The school name is first passed through ``school_alias``. Rows with Year
    <= 2023 (or no Year key, which defaults to 2023) are checked against
    ``P4_SCHOOLS``; all other rows against ``P4_SCHOOLS_NO_PAC12``.
    A missing or empty School yields 0.
    """
    school = row.get('School')
    if pd.isna(school) or school == '':
        return 0
    canonical = school_alias.get(school, school)
    season = row.get('Year', 2023)
    if season <= 2023:
        eligible = P4_SCHOOLS
    else:
        eligible = P4_SCHOOLS_NO_PAC12
    return int(canonical in eligible)
df['p4_conference'] = df.apply(is_p4, axis=1)

# Contains flags: 1 where the source column holds a value, 0 where it is missing.
# Listed in column-creation order. A source of None means the flag is constant 1:
# explosive_score is built with fillna(0), so it is never missing.
_flag_sources = [
    ('contains_broad_jump', 'Broad Jump'),
    ('contains_vertical', 'Vertical'),
    ('contains_40yd', '40yd'),
    ('contains_height', 'Height'),
    ('contains_weight', 'Weight'),
    ('contains_speed_score', 'speed_score'),
    ('contains_explosive_score', None),
    ('contains_qb_hurry_final_season', 'QB_Hurry_final_season'),
    ('contains_tfl_final_season', 'TFL_final_season'),
    ('contains_sacks_final_season', 'Sacks_final_season'),
    ('contains_pd_final_season', 'PD_final_season'),
    ('contains_solo_final_season', 'SOLO_final_season'),
    ('contains_tot_final_season', 'TOT_final_season'),
    ('contains_ras', 'RAS'),
    # p4_conference is derived from School, so its flag tracks School presence.
    ('contains_p4_conference', 'School'),
]
for _flag, _source in _flag_sources:
    if _source is None:
        df[_flag] = 1
    else:
        df[_flag] = df[_source].notna().astype(int)

In [20]:
# Target: round 1–7 if drafted, 8 if undrafted
# A player counts as drafted only when the Drafted flag is set AND a round is
# recorded. A plain astype(bool) turns a missing flag into True, and filling a
# missing round with 1 would then label that player a first-round pick.
drafted_flag = df['Drafted'].notna() & df['Drafted'].astype(bool)
round_num = pd.to_numeric(df['Round'], errors='coerce')
is_drafted = drafted_flag & round_num.notna()
y = np.where(
    is_drafted,
    # fillna(8) exists only so astype(int) succeeds on rows without a round;
    # those rows always take the undrafted branch below.
    np.clip(round_num.fillna(8).astype(int), 1, 7),
    8
)
X_raw = df[FEATURES_WITH_COLLEGE_ALL].copy()

# KNN imputation + scale
# NOTE(review): imputation runs on the unscaled features, so neighbour distances
# are driven by the largest-magnitude columns — confirm this ordering is intended.
imputer = KNNImputer(n_neighbors=10).fit(X_raw)
X = imputer.transform(X_raw)
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Ridge regression
ridge = Ridge(alpha=1.0, random_state=42).fit(X_scaled, y)

# In-sample error; predictions are clipped to the label range 1–8.
y_pred_train = np.clip(ridge.predict(X_scaled), 1, 8)
train_mae = mean_absolute_error(y, y_pred_train)
print('Train MAE (round 1–8):', round(train_mae, 4))
print('Train samples:', len(y))

Train MAE (round 1–8): 1.5612
Train samples: 249


In [22]:
def prepare_lb_df(csv_path, year):
    """Load one LB CSV and add the same engineered columns as the training pool.

    Parameters
    ----------
    csv_path : path of the CSV to read.
    year : value written to every row's 'Year' column; it also selects the P4
        school set and is the RAS join key.

    Returns
    -------
    DataFrame with Height in inches, speed_score, explosive_score (standardised
    with the module-level mean_v / std_v / mean_b / std_b), p4_conference, RAS
    (left-merged from ras_lb on Player/Year) and every contains_* flag.
    """
    frame = pd.read_csv(csv_path)
    frame['Year'] = year

    # Height: feet-inches text goes through height_inches, anything else is coerced to numeric.
    raw_height = frame['Height']
    if raw_height.dtype == object or raw_height.astype(str).str.contains('-', na=False).any():
        frame['Height'] = raw_height.apply(height_inches)
    else:
        frame['Height'] = pd.to_numeric(raw_height, errors='coerce')

    # Speed score is only defined for a recorded, positive 40 time.
    forty = frame['40yd']
    forty_ok = forty.notna() & (forty > 0)
    frame['speed_score'] = np.where(
        forty_ok,
        frame['Weight'] * 200 / (forty ** 4),
        np.nan
    )

    # A missing jump contributes 0 to the summed z-scores.
    vertical_part = (frame['Vertical'] - mean_v).fillna(0) / std_v
    broad_part = (frame['Broad Jump'] - mean_b).fillna(0) / std_b
    frame['explosive_score'] = vertical_part + broad_part

    frame['p4_conference'] = frame.apply(is_p4, axis=1)

    frame = frame.merge(ras_lb, left_on=['Player', 'Year'], right_on=['Name', 'Year'], how='left')
    frame = frame.drop(columns=['Name'], errors='ignore')

    # Presence flags in column-creation order; None marks a flag that is constant 1.
    flag_sources = (
        ('contains_broad_jump', 'Broad Jump'),
        ('contains_vertical', 'Vertical'),
        ('contains_40yd', '40yd'),
        ('contains_height', 'Height'),
        ('contains_weight', 'Weight'),
        ('contains_speed_score', 'speed_score'),
        ('contains_explosive_score', None),
        ('contains_qb_hurry_final_season', 'QB_Hurry_final_season'),
        ('contains_tfl_final_season', 'TFL_final_season'),
        ('contains_sacks_final_season', 'Sacks_final_season'),
        ('contains_pd_final_season', 'PD_final_season'),
        ('contains_solo_final_season', 'SOLO_final_season'),
        ('contains_tot_final_season', 'TOT_final_season'),
        ('contains_ras', 'RAS'),
        ('contains_p4_conference', 'School'),
    )
    for flag_name, source_col in flag_sources:
        if source_col is None:
            frame[flag_name] = 1
        else:
            frame[flag_name] = frame[source_col].notna().astype(int)
    return frame

# 2024 and 2025 (drafted only; actual round 1–7)
lb_2024 = prepare_lb_df('lb_drafted_2024.csv', 2024)
lb_2025 = prepare_lb_df('lb_drafted_2025.csv', 2025)

# Same path as training: feature columns -> fitted imputer -> fitted scaler -> ridge.
X_24_raw, X_25_raw = (frame[FEATURES_WITH_COLLEGE_ALL].copy() for frame in (lb_2024, lb_2025))
X_24, X_25 = (imputer.transform(raw) for raw in (X_24_raw, X_25_raw))
X_24_scaled, X_25_scaled = (scaler.transform(filled) for filled in (X_24, X_25))

# Predictions are clipped to the label range used in training (1–8).
pred_24, pred_25 = (np.clip(ridge.predict(scaled), 1, 8) for scaled in (X_24_scaled, X_25_scaled))

actual_24, actual_25 = (frame['Round'].astype(int).values for frame in (lb_2024, lb_2025))

def eval_metrics(actual, pred, label):
    """Print one summary line of regression and hit-rate metrics.

    actual : true rounds.
    pred   : continuous predicted rounds, same length as ``actual``.
    label  : prefix for the printed line.

    Reports MAE, RMSE and R² on the raw predictions, plus the share of
    predictions that round to the exact round or to within one round of it.
    Returns None.
    """
    nearest_round = np.round(pred)
    mae = mean_absolute_error(actual, pred)
    rmse = np.sqrt(mean_squared_error(actual, pred))
    r2 = r2_score(actual, pred)
    exact = np.mean(nearest_round == actual)
    within_1 = np.mean(np.abs(nearest_round - actual) <= 1)
    print(
        f'{label} (n={len(actual)}): '
        f'MAE={mae:.4f}, RMSE={rmse:.4f}, R²={r2:.4f}, '
        f'Exact={exact:.2%}, Within-1={within_1:.2%}'
    )

# One header line plus one metrics line per held-out draft class.
for season_label, season_actual, season_pred in (
    ('2024', actual_24, pred_24),
    ('2025', actual_25, pred_25),
):
    print(f'{season_label} LBs:')
    eval_metrics(season_actual, season_pred, season_label)

2024 LBs:
2024 (n=14): MAE=1.5823, RMSE=1.8646, R²=-0.0202, Exact=14.29%, Within-1=42.86%
2025 LBs:
2025 (n=21): MAE=1.4901, RMSE=1.7983, R²=-0.1194, Exact=19.05%, Within-1=47.62%


In [24]:
# Dataframes: players with actual round, model prediction, tier label, and interpretation
def pred_round_to_tier(p):
    """Map a continuous predicted round to a ``(tier_label, interpretation)`` pair.

    Tier k (for k = 1..6) covers predictions strictly below k + 0.75. Anything at
    or above 6.75, and any value that compares False against every bound (such
    as NaN), lands in the Round 7 / UDFA tier.
    """
    tier_bounds = (
        (1.75, ('Round 1 Tier', 'True 1st-round grade')),
        (2.75, ('Round 2 Tier', 'Early Day 2')),
        (3.75, ('Round 3 Tier', 'Late Day 2')),
        (4.75, ('Round 4 Tier', 'Early Day 3')),
        (5.75, ('Round 5 Tier', 'Mid Day 3')),
        (6.75, ('Round 6 Tier', 'Late Day 3')),
    )
    # Bounds are ascending, so the first one that exceeds p decides the tier.
    for upper_bound, tier in tier_bounds:
        if p < upper_bound:
            return tier
    return ('Round 7 / UDFA Tier', 'Fringe draftable')

# Each prediction is mapped to its (tier_label, interpretation) pair once, then split into two columns.
tiers_24 = [pred_round_to_tier(p) for p in pred_24]
lb_2024_display = lb_2024[['Round', 'Pick', 'Player', 'School', 'Year']].assign(
    predicted_round=pred_24,
    tier_label=[tier[0] for tier in tiers_24],
    interpretation=[tier[1] for tier in tiers_24],
)
lb_2024_display['Round'] = lb_2024_display['Round'].astype(int)

tiers_25 = [pred_round_to_tier(p) for p in pred_25]
lb_2025_display = lb_2025[['Round', 'Pick', 'Player', 'School', 'Year']].assign(
    predicted_round=pred_25,
    tier_label=[tier[0] for tier in tiers_25],
    interpretation=[tier[1] for tier in tiers_25],
)
lb_2025_display['Round'] = lb_2025_display['Round'].astype(int)

print('2024 drafted LBs')
display(lb_2024_display)
print('2025 drafted LBs')
display(lb_2025_display)

2024 drafted LBs


Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1,17,Dallas Turner,Alabama,2024,2.598991,Round 2 Tier,Early Day 2
1,2,45,Edgerrin Cooper,Texas A&M,2024,3.982013,Round 4 Tier,Early Day 3
2,2,52,Junior Colson,Michigan,2024,5.226736,Round 5 Tier,Mid Day 3
3,3,72,Trevin Wallace,Kentucky,2024,4.285152,Round 4 Tier,Early Day 3
4,3,98,Payton Wilson,NC State,2024,2.145014,Round 2 Tier,Early Day 2
5,3,87,Marist Liufau,Notre Dame,2024,5.868149,Round 6 Tier,Late Day 3
6,4,114,Jaylan Ford,Texas,2024,6.175444,Round 6 Tier,Late Day 3
7,4,118,Tyrice Knight,UTEP,2024,4.002633,Round 4 Tier,Early Day 3
8,5,149,Edefuan Ulofoshio,Washington,2024,4.246435,Round 4 Tier,Early Day 3
9,5,160,Steele Chambers,Ohio State,2024,7.848669,Round 7 / UDFA Tier,Fringe draftable


2025 drafted LBs


Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1,15,Jalon Walker,Georgia,2025,4.451144,Round 4 Tier,Early Day 3
1,1,31,Jihaad Campbell,Alabama,2025,3.710444,Round 3 Tier,Late Day 2
2,2,33,Carson Schwesinger,UCLA,2025,4.156584,Round 4 Tier,Early Day 3
3,2,49,Demetrius Knight Jr,South Carolina,2025,5.473704,Round 5 Tier,Mid Day 3
4,3,75,Nick Martin,Oklahoma State,2025,5.132753,Round 5 Tier,Mid Day 3
5,4,107,Jack Kiser,Notre Dame,2025,5.939987,Round 6 Tier,Late Day 3
6,4,112,Danny Stutsman,Oklahoma,2025,5.067752,Round 5 Tier,Mid Day 3
7,4,115,Cody Simon,Ohio State,2025,3.69106,Round 3 Tier,Late Day 2
8,4,119,Barrett Carter,Clemson,2025,4.878601,Round 5 Tier,Mid Day 3
9,4,129,Teddye Buchanan,California,2025,3.964919,Round 4 Tier,Early Day 3


In [26]:
# 2026 evaluation
lb_2026 = prepare_lb_df('lb_drafted_2026.csv', 2026)

X_26_raw = lb_2026[FEATURES_WITH_COLLEGE_ALL].copy()
X_26 = imputer.transform(X_26_raw)
X_26_scaled = scaler.transform(X_26)

pred_26 = np.clip(ridge.predict(X_26_scaled), 1, 8)

# Actual rounds may be absent (column missing), empty (pre-draft) or only partly
# filled, so work from a numeric copy in which every unknown round is NaN.
if 'Round' in lb_2026.columns:
    round_26 = pd.to_numeric(lb_2026['Round'], errors='coerce')
else:
    round_26 = pd.Series(np.nan, index=lb_2026.index)
has_round_26 = round_26.notna().to_numpy()

if has_round_26.any():
    # Score only the players whose round is known; casting a NaN round to int would raise.
    actual_26 = round_26[has_round_26].astype(int).values
    print('2026 LBs:')
    eval_metrics(actual_26, pred_26[has_round_26], '2026')
else:
    print(f'2026 LBs (n={len(pred_26)}): Predictions generated (no actual rounds available)')

# Display 2026 predictions
# Select only the identifying columns that exist, so a file without Round/Pick still displays.
display_cols_26 = [c for c in ['Round', 'Pick', 'Player', 'School', 'Year'] if c in lb_2026.columns]
lb_2026_display = lb_2026[display_cols_26].copy()
lb_2026_display['predicted_round'] = pred_26
tiers_26 = [pred_round_to_tier(x) for x in pred_26]
lb_2026_display['tier_label'] = [tier[0] for tier in tiers_26]
lb_2026_display['interpretation'] = [tier[1] for tier in tiers_26]
if has_round_26.any():
    # Nullable integer dtype shows whole-number rounds and tolerates players with no round yet.
    lb_2026_display['Round'] = round_26.astype('Int64')

print('\n2026 drafted LBs')
display(lb_2026_display)

2026 LBs (n=18): Predictions generated (no actual rounds available)

2026 drafted LBs


Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,,,Arvell Reese,Ohio State,2026,2.520062,Round 2 Tier,Early Day 2
1,,,Sonny Styles,Ohio State,2026,2.399693,Round 2 Tier,Early Day 2
2,,,CJ Allen,Georgia,2026,2.829166,Round 3 Tier,Late Day 2
3,,,Anthony Hill Jr,Texas,2026,3.358956,Round 3 Tier,Late Day 2
4,,,Deontae Lawson,Alabama,2026,4.541959,Round 4 Tier,Early Day 3
5,,,Josiah Trotter,Missouri,2026,3.885402,Round 4 Tier,Early Day 3
6,,,Jake Golday,Cincinnati,2026,3.00923,Round 3 Tier,Late Day 2
7,,,Taurean York,Texas A&M,2026,5.38397,Round 5 Tier,Mid Day 3
8,,,Jacob Rodriguez,Texas Tech,2026,5.243691,Round 5 Tier,Mid Day 3
9,,,Harold Perkins Jr,LSU,2026,3.504867,Round 3 Tier,Late Day 2


In [None]:
# Model results on entire training set (2017–2023), ordered by predicted_round
train_tiers = [pred_round_to_tier(p) for p in y_pred_train]
train_display = (
    df[['Round', 'Pick', 'Player', 'School', 'Year']]
    .assign(
        predicted_round=y_pred_train,
        tier_label=[tier[0] for tier in train_tiers],
        interpretation=[tier[1] for tier in train_tiers],
    )
    # Lowest predicted round first, with a fresh 0..n-1 index.
    .sort_values('predicted_round')
    .reset_index(drop=True)
)
train_display

Unnamed: 0,Round,Pick,Player,School,Year,predicted_round,tier_label,interpretation
0,1.0,8.0,Isaiah Simmons,Clemson,2020,1.000000,Round 1 Tier,True 1st-round grade
1,1.0,5.0,Devin White,LSU,2019,1.933558,Round 2 Tier,Early Day 2
2,4.0,120.0,Brandon Smith,Penn St.,2022,2.123539,Round 2 Tier,Early Day 2
3,2.0,58.0,Troy Andersen,Montana St.,2022,2.369001,Round 2 Tier,Early Day 2
4,1.0,26.0,Takkarist McKinley,UCLA,2017,2.393954,Round 2 Tier,Early Day 2
...,...,...,...,...,...,...,...,...
244,,,Tuf Borland,Ohio St.,2021,8.000000,Round 7 / UDFA Tier,Fringe draftable
245,,,Justin Hughes,Kansas St.,2021,8.000000,Round 7 / UDFA Tier,Fringe draftable
246,,,Tavante Beckett,Marshall,2021,8.000000,Round 7 / UDFA Tier,Fringe draftable
247,,,Calvin Bundage,Oklahoma St.,2021,8.000000,Round 7 / UDFA Tier,Fringe draftable
