# NHL Playoff Model - Model Training

This notebook creates simple predictive models for NHL playoff series outcomes and saves them in the correct format for the application.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# Install xgboost if needed
try:
    import xgboost as xgb
except ImportError:
    !pip install xgboost
    import xgboost as xgb

# Models are written to a "models" directory under the current working
# directory (a notebook has no __file__ of its own to anchor the path to).
working_dir = os.getcwd()
model_folder = os.path.join(working_dir, "models")

# Create the directory on first run; a no-op when it already exists
os.makedirs(model_folder, exist_ok=True)

print(f"Model folder: {model_folder}")

In [None]:
# Build a synthetic training set of playoff-series matchups.
# Demo-only stand-in for real data: every feature is a "difference" column
# sampled from a zero-centred normal distribution.

# Standard deviation used to simulate each feature column
feature_spread = {
    'points_diff': 10,                      # point differential between teams
    'PP%_rel_diff': 0.05,                   # special teams - power play
    'PK%_rel_diff': 0.05,                   # special teams - penalty kill
    'xGoalsPercentage_diff': 0.06,          # expected goals percentage differential
    'goalDiff/G_diff': 0.5,                 # goal differential per game
    'playoff_performance_score_diff': 1.0,  # prior playoff performance
}

# Coefficient of each feature in the simulated log-odds of winning
logit_weights = {
    'points_diff': 0.08,
    'PP%_rel_diff': 5,
    'PK%_rel_diff': 5,
    'xGoalsPercentage_diff': 10,
    'goalDiff/G_diff': 0.5,
    'playoff_performance_score_diff': 0.3,
}

# Key features, in the same order as the columns of X
features = list(feature_spread)

np.random.seed(42)  # fixed seed so the dataset is reproducible
n_samples = 200

# One batch of n_samples draws per column, sampled in the order listed above
X = pd.DataFrame({
    column: np.random.normal(0, spread, n_samples)
    for column, spread in feature_spread.items()
})

# Win probability is the logistic function of the weighted feature sum
log_odds = sum(weight * X[column] for column, weight in logit_weights.items())
win_probs = 1 / (1 + np.exp(-log_odds))

# One Bernoulli draw per simulated matchup
y = np.random.binomial(1, win_probs)

print(f"Created sample dataset with {len(X)} examples")
X.head()

In [None]:
# Train simple models
print("Training models...")

# Fit on X[features] so the column order the estimators learn is exactly the
# order recorded in each saved package's 'features' list.
X_train = X[features]

# 1. Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y)

# 2. XGBoost
# `use_label_encoder` is deprecated in XGBoost and ignored by current
# releases, so it is no longer passed; y is already encoded as 0/1.
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y)

# Create properly formatted model packages with features.
# Each package gets its own copy of the feature list so that editing one
# package later cannot silently change the other.
lr_package = {
    'model': lr_model,
    'features': list(features)
}

xgb_package = {
    'model': xgb_model,
    'features': list(features)
}

# Also create a combined model package holding both individual packages
playoff_model = {
    'mode': 'ensemble',
    'home_ice_boost': 0.039,
    'models': {
        'lr': lr_package,
        'xgb': xgb_package
    }
}

print("Models trained successfully")

In [None]:
# Smoke-test both fitted models on two hand-written matchups:
# row 0 has every differential positive, row 1 has every one negative.
test_input = pd.DataFrame(
    {
        'points_diff': [10, -5],
        'PP%_rel_diff': [0.02, -0.01],
        'PK%_rel_diff': [0.01, -0.02],
        'xGoalsPercentage_diff': [0.03, -0.02],
        'goalDiff/G_diff': [0.5, -0.3],
        'playoff_performance_score_diff': [1.0, -0.5],
    }
)

# The second predict_proba column is what gets reported as the win probability
print("Logistic Regression predictions:")
lr_class_probs = lr_model.predict_proba(test_input)
lr_probs = lr_class_probs[:, 1]
print(f"Win probabilities: {lr_probs}")

print("\nXGBoost predictions:")
xgb_class_probs = xgb_model.predict_proba(test_input)
xgb_probs = xgb_class_probs[:, 1]
print(f"Win probabilities: {xgb_probs}")

# Confirm that both estimators expose a predict_proba attribute
lr_has_proba = hasattr(lr_model, 'predict_proba')
xgb_has_proba = hasattr(xgb_model, 'predict_proba')
print(f"\nLR has predict_proba: {lr_has_proba}")
print(f"XGB has predict_proba: {xgb_has_proba}")

In [None]:
# Destination files inside the model folder, one per package
lr_path = os.path.join(model_folder, 'logistic_regression_model_final.pkl')
xgb_path = os.path.join(model_folder, 'xgboost_playoff_model_final.pkl')
combined_path = os.path.join(model_folder, 'playoff_model.pkl')

# Write the packages in order: logistic regression, XGBoost, combined
packages_to_save = (
    (lr_package, lr_path),
    (xgb_package, xgb_path),
    (playoff_model, combined_path),
)
for package, destination in packages_to_save:
    joblib.dump(package, destination)

print(f"Models saved to {model_folder}")

In [None]:
# Test loading the models to make sure they work
print("Testing model loading...")

try:
    # Round-trip the combined package through disk
    loaded_model = joblib.load(combined_path)

    print(f"Loaded model type: {type(loaded_model)}")
    print(f"Model mode: {loaded_model.get('mode', 'unknown')}")
    print(f"Home ice boost: {loaded_model.get('home_ice_boost', 'unknown')}")

    # Check if models are present and have predict_proba
    if 'models' in loaded_model:
        for model_name, model_package in loaded_model['models'].items():
            # A missing 'model' entry yields None, which has no predict_proba
            if hasattr(model_package.get('model'), 'predict_proba'):
                print(f"✓ {model_name} model loaded correctly with predict_proba")
                print(f"  Features: {model_package.get('features', [])}")
            else:
                print(f"✗ {model_name} model missing or doesn't have predict_proba")
    else:
        print("No models found in the loaded package")

    # Test prediction with the loaded model.
    # The reloaded estimator is bound to its own name so that the trained
    # in-memory `lr_model` is not overwritten by the deserialized copy.
    loaded_lr = loaded_model.get('models', {}).get('lr', {}).get('model')
    if loaded_lr is not None:
        test_pred = loaded_lr.predict_proba(test_input)[:, 1]
        print(f"\nTest predictions match: {np.allclose(test_pred, lr_probs)}")

except Exception as e:
    # Top-level notebook cell: report the failure instead of raising
    print(f"Error loading model: {str(e)}")