In [2]:
# ============================================================================
# IC¬≤ PURE: CFP-Era Training Only (2014-2024)
# ============================================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("="*80)
print("IC¬≤ PURE - CFP-ERA TRAINING (2014-2024)")
print("="*80)

# ----------------------------
# 1. LOAD AND CLEAN DATA 
# Add your own file path
# ----------------------------
df = 

bad_rows = (
    ((df['Year'] == 2020) & (df['school_key'] == 'connecticut')) |
    ((df['Year'] == 2020) & (df['school_key'] == 'old-dominion')) |
    ((df['Year'] == 1937) & (df['school_key'] == 'virginia'))
)
df = df[~bad_rows].copy()

print(f"Full dataset: {df.shape[0]:,} team-seasons ({df['Year'].min()}-{df['Year'].max()})")

# ----------------------------
# 2. DEFINE CHAMPION TARGET (on full data for scoring)
# ----------------------------
df['Polls_AP Post'] = pd.to_numeric(df['Polls_AP Post'], errors='coerce')
df['Polls_CFP Final'] = pd.to_numeric(df['Polls_CFP Final'], errors='coerce')

df['champion_target'] = (
    (df['Polls_AP Post'] == 1) &
    (df['Polls_CFP Final'].notna())
).astype(int)

# ----------------------------
# 3. FILTER TO CFP ERA FOR TRAINING
# ----------------------------
df_cfp = df[df['Year'] >= 2014].copy()

print(f"CFP-era training set: {len(df_cfp):,} team-seasons (2014-2024)")
print(f"Champions: {df_cfp['champion_target'].sum()}")
print(f"Non-champions: {(df_cfp['champion_target']==0).sum()}")

champions = df_cfp[df_cfp['champion_target'] == 1]
print(f"\nNational Champions in training data:")
print(champions[['Year', 'school_key']].sort_values('Year', ascending=False).to_string(index=False))

# ----------------------------
# 4. CALCULATE CHAMPION THRESHOLDS (from CFP era only)
# ----------------------------
CHAMPION_THRESHOLDS = {
    'WIN_PCT': champions['Overall_Pct'].min(),
    'SRS': champions['SRS_SRS'].min(),
    'SOS': champions['SRS_SOS'].min()
}

print("\n" + "="*80)
print("HISTORICAL CHAMPION CRITERIA (CFP Era 2014-2024)")
print("="*80)
print(f"1. Win Percentage ‚â• {CHAMPION_THRESHOLDS['WIN_PCT']:.3f}")
print(f"2. SRS Rating ‚â• {CHAMPION_THRESHOLDS['SRS']:.1f}")
print(f"3. Strength of Schedule ‚â• {CHAMPION_THRESHOLDS['SOS']:.1f}")

print(f"\nAverage champion statistics:")
print(f"  ‚Ä¢ Win%: {champions['Overall_Pct'].mean():.3f}")
print(f"  ‚Ä¢ SRS: {champions['SRS_SRS'].mean():.1f}")
print(f"  ‚Ä¢ SOS: {champions['SRS_SOS'].mean():.1f}")

# ----------------------------
# 5. TRAIN MODEL ON CFP ERA ONLY
# ----------------------------
features = ['Overall_Pct', 'SRS_SRS', 'SRS_SOS']

# Handle missing values in CFP training set
for col in features:
    df_cfp[col] = df_cfp[col].fillna(df_cfp[col].mean())

# Prepare training data
X_train = df_cfp[features]
y_train = df_cfp['champion_target']

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train model
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    solver='liblinear',
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Model coefficients
coef_df = pd.DataFrame({
    'feature': features,
    'coefficient': model.coef_[0]
}).sort_values('coefficient', ascending=False)

print("\n" + "="*80)
print("IC¬≤ PURE MODEL COEFFICIENTS (CFP-Era Trained)")
print("="*80)
print(coef_df.to_string(index=False))

# ----------------------------
# 6. SCORE ALL DATA (including 2025)
# ----------------------------
# Handle missing values in full dataset
for col in features:
    df[col] = df[col].fillna(df[col].mean())

X_full = df[features]
X_full_scaled = scaler.transform(X_full)

df['IC2_score'] = model.predict_proba(X_full_scaled)[:, 1] * 100
df['IC2_rank'] = df.groupby('Year')['IC2_score'].rank(method='first', ascending=False)

# ----------------------------
# 7. CREATE CHAMPION PROFILE SCORES (on all data)
# ----------------------------
df['meets_win'] = (df['Overall_Pct'] >= CHAMPION_THRESHOLDS['WIN_PCT']).astype(int)
df['meets_srs'] = (df['SRS_SRS'] >= CHAMPION_THRESHOLDS['SRS']).astype(int)
df['meets_sos'] = (df['SRS_SOS'] >= CHAMPION_THRESHOLDS['SOS']).astype(int)

df['champion_profile'] = df['meets_win'] + df['meets_srs'] + df['meets_sos']

# ----------------------------
# 8. HISTORICAL ACCURACY (2014-2023, excluding 2024 from validation)
# ----------------------------
print("\n" + "="*80)
print("HISTORICAL ACCURACY (2014-2023)")
print("="*80)

actual_champs = df[(df['champion_target'] == 1) & (df['Year'] >= 2014) & (df['Year'] <= 2023)]
predicted_champs = df[(df['IC2_rank'] == 1) & (df['Year'] >= 2014) & (df['Year'] <= 2023)]

merged = actual_champs.merge(
    predicted_champs[['Year', 'school_key']],
    on='Year',
    suffixes=('_actual', '_predicted')
)
merged['correct'] = merged['school_key_actual'] == merged['school_key_predicted']
accuracy = merged['correct'].mean()

print(f"\nAccuracy: {merged['correct'].sum()}/{len(merged)} = {accuracy:.1%}")

print("\n‚úÖ CORRECTLY PREDICTED:")
for _, row in merged[merged['correct']].sort_values('Year').iterrows():
    team_name = row['school_key_actual'].replace('-', ' ').title()
    print(f"  {row['Year']}: {team_name}")

if not merged['correct'].all():
    print("\n‚ùå MISSED:")
    for _, row in merged[~merged['correct']].sort_values('Year').iterrows():
        actual_name = row['school_key_actual'].replace('-', ' ').title()
        predicted_name = row['school_key_predicted'].replace('-', ' ').title()
        print(f"  {row['Year']}: Predicted {predicted_name}, Actual {actual_name}")

# ----------------------------
# 9. 2025 RANKINGS
# ----------------------------
df_2025 = df[df['Year'] == 2025].copy()
df_2025 = df_2025.sort_values('IC2_rank')

display_cols = [
    'school_key', 'Overall_Pct', 'SRS_SRS', 'SRS_SOS',
    'IC2_score', 'IC2_rank', 'champion_profile'
]

top25_2025 = df_2025[display_cols].head(25).copy()
top25_2025['school_key'] = top25_2025['school_key'].str.replace('-', ' ').str.title()
top25_2025.columns = [
    'Team', 'Win%', 'SRS', 'SOS',
    'IC¬≤ Score', 'IC¬≤ Rank', 'Champ Profile'
]

print("\n" + "="*80)
print("IC¬≤ PURE RANKINGS - 2025 TOP 25 (CFP-Era Model)")
print("="*80)
print(top25_2025.to_string(index=False))

# ----------------------------
# 10. TEAM ANALYSIS CARDS (TOP 5)
# ----------------------------
print("\n" + "="*80)
print("TEAM ANALYSIS CARDS - TOP 5")
print("="*80)

def create_team_card(team_data):
    """Generate detailed analysis card for a team"""
    card = []
    card.append(f"\nüèà {team_data['Team'].upper()}")
    card.append("‚îÄ" * 50)
    card.append(f"IC¬≤ Rank: #{int(team_data['IC¬≤ Rank'])}")
    card.append(f"IC¬≤ Score: {team_data['IC¬≤ Score']:.1f}")
    card.append(f"Champion Profile: {int(team_data['Champ Profile'])}/3 criteria")
    card.append("")
    
    # Champion criteria analysis
    card.append("CHAMPION CRITERIA ANALYSIS:")
    
    win_status = "‚úì" if team_data['Win%'] >= CHAMPION_THRESHOLDS['WIN_PCT'] else "‚úó"
    win_detail = f"{team_data['Win%']:.3f}" if win_status == "‚úì" else f"{team_data['Win%']:.3f} < {CHAMPION_THRESHOLDS['WIN_PCT']:.3f}"
    card.append(f"  {win_status} Win Percentage: {win_detail}")
    
    srs_status = "‚úì" if team_data['SRS'] >= CHAMPION_THRESHOLDS['SRS'] else "‚úó"
    srs_detail = f"{team_data['SRS']:.1f}" if srs_status == "‚úì" else f"{team_data['SRS']:.1f} < {CHAMPION_THRESHOLDS['SRS']:.1f}"
    card.append(f"  {srs_status} SRS Rating: {srs_detail}")
    
    sos_status = "‚úì" if team_data['SOS'] >= CHAMPION_THRESHOLDS['SOS'] else "‚úó"
    sos_detail = f"{team_data['SOS']:.1f}" if sos_status == "‚úì" else f"{team_data['SOS']:.1f} < {CHAMPION_THRESHOLDS['SOS']:.1f}"
    card.append(f"  {sos_status} Strength of Schedule: {sos_detail}")
    
    # Overall assessment
    card.append("")
    card.append("ASSESSMENT:")
    
    profile = int(team_data['Champ Profile'])
    if profile == 3:
        card.append("  ‚úÖ PERFECT CHAMPION PROFILE")
    elif profile == 2:
        card.append("  ‚ö†Ô∏è  NEAR-COMPLETE PROFILE - Strong contender")
    elif profile == 1:
        card.append("  üîÑ ONE-DIMENSIONAL - Strong in one area only")
    else:
        card.append("  ‚ùå NOT CHAMPIONSHIP CALIBER")
    
    return "\n".join(card)

for i, (_, team) in enumerate(top25_2025.head(5).iterrows(), 1):
    print(create_team_card(team))
    if i < 5:
        print("\n" + "‚ïê" * 50)

# ----------------------------
# 11. CHAMPION PROFILE ANALYSIS
# ----------------------------
print("\n" + "="*80)
print("CHAMPION PROFILE ANALYSIS - 2025")
print("="*80)

profile_counts = df_2025['champion_profile'].value_counts().sort_index(ascending=False)

print("\nTeams meeting champion criteria:")
for score in range(3, -1, -1):
    count = profile_counts.get(score, 0)
    if score == 3:
        desc = "Perfect profile"
    elif score == 2:
        desc = "Strong contender"
    elif score == 1:
        desc = "One-dimensional"
    else:
        desc = "Not championship caliber"
    print(f"  {score}/3 criteria: {count:3d} teams ({desc})")

contenders = df_2025[df_2025['champion_profile'] >= 2].sort_values('IC2_rank')
if not contenders.empty:
    print(f"\nTrue contenders (2+ criteria):")
    for _, team in contenders.iterrows():
        name = team['school_key'].replace('-', ' ').title()
        print(f"  ‚Ä¢ {name} (#{int(team['IC2_rank'])}) - {int(team['champion_profile'])}/3 criteria")

# ----------------------------
# 12. KEY INSIGHTS
# ----------------------------
print("\n" + "="*80)
print("KEY INSIGHTS - CFP-ERA MODEL")
print("="*80)

print(f"""
TRAINING APPROACH:
- Model trained ONLY on CFP-era data (2014-2024)
- {len(df_cfp):,} team-seasons, {df_cfp['champion_target'].sum()} champions
- More theoretically sound than using all historical data

ACCURACY:
- {accuracy:.1%} on 2014-2023 validation set
- Thresholds derived from actual CFP champions only

2025 CHAMPIONSHIP LANDSCAPE:
- {profile_counts.get(3, 0)} team(s) with perfect 3/3 profile
- {profile_counts.get(2, 0)} teams meet 2/3 criteria (true contenders)
- No team has proven they can win vs elite competition at elite level

COEFFICIENT CHANGES vs All-Years Model:
- Win% importance may shift
- SRS/SOS weights calibrated to modern football only
- Model learns ONLY from playoff-era patterns
""")

if not contenders.empty:
    best = contenders.iloc[0]
    best_name = best['school_key'].replace('-', ' ').title()
    print(f"PREDICTION: {best_name} most likely champion")
    print(f"  (#{int(best['IC2_rank'])}, {int(best['champion_profile'])}/3 criteria, {best['IC2_score']:.1f} IC¬≤ score)")

print("\n" + "="*80)
print("ANALYSIS COMPLETE! üèà")
print("="*80)

IC¬≤ PURE - CFP-ERA TRAINING (2014-2024)
Full dataset: 14,406 team-seasons (1869-2025)
CFP-era training set: 1,566 team-seasons (2014-2024)
Champions: 11
Non-champions: 1555

National Champions in training data:
 Year      school_key
 2024      ohio-state
 2023        michigan
 2022         georgia
 2021         georgia
 2020         alabama
 2019 louisiana-state
 2018         clemson
 2017         alabama
 2016         clemson
 2015         alabama
 2014      ohio-state

HISTORICAL CHAMPION CRITERIA (CFP Era 2014-2024)
1. Win Percentage ‚â• 0.875
2. SRS Rating ‚â• 20.1
3. Strength of Schedule ‚â• 5.2

Average champion statistics:
  ‚Ä¢ Win%: 0.958
  ‚Ä¢ SRS: 24.4
  ‚Ä¢ SOS: 6.6

IC¬≤ PURE MODEL COEFFICIENTS (CFP-Era Trained)
    feature  coefficient
Overall_Pct     2.539316
    SRS_SRS     2.178634
    SRS_SOS     1.102490

HISTORICAL ACCURACY (2014-2023)

Accuracy: 8/10 = 80.0%

‚úÖ CORRECTLY PREDICTED:
  2014: Ohio State
  2015: Alabama
  2018: Clemson
  2019: Louisiana State
  2020