# Game Outcome Model

This notebook will test a number of classifier and logistic regression models to predict game outcome.

Models Include:
- Logistic Regression
- Decision Tree
- Random Forest
- Naive Bayes (Bernoulli)
- XGBoost

## Load Libraries

In [None]:
import os
import sys
import warnings

import IPython
import ipywidgets as widgets
import matplotlib.pyplot as plt
import seaborn as sns
import mglearn
import numpy as np
import pandas as pd
from IPython.display import HTML, display
from ipywidgets import interact, interactive
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from utils import *
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import random

import yellowbrick
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.decomposition import NMF, PCA

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Widen pandas display limits: up to 200 characters per cell, and no cap on
# the number of columns or rows shown.
pd.set_option("display.max_colwidth", 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Set Folder Path

In [None]:
# Move to the project root so the relative data paths used below resolve.
# The root can be overridden with the FANTASY_WIZARD_ROOT environment variable;
# when it is unset, the original author's local checkout path is used.
project_root = os.environ.get(
    "FANTASY_WIZARD_ROOT",
    "/Users/dusty/Desktop/projects/sports_analytics.nosync/fantasy_wizard/",
)
os.chdir(project_root)

# Print the new working directory
print("Working In:", os.getcwd())

## Load Data

In [None]:
# Load the processed game-level modeling data from the current working directory
game_data_path = os.path.join(
    os.getcwd(), "data", "processed", "games", "modeling_game_data.csv"
)
game_data = pd.read_csv(game_data_path)

## Investigate Data

In [None]:
# (rows, columns) of the loaded data
game_data.shape

In [None]:
# Preview the first rows after ordering by game_id
game_data.sort_values(by='game_id').head()

### Numeric Variable Selection

In [None]:
# Running list of numeric columns chosen from the pairplot inspections below
numeric_vars = []

# Column positions for the first pairplot: position 26 plus positions 1-19
# (presumably 26 is the 'result' column used as the hue — TODO confirm)
varset_1 = [26, *range(1, 20)]
#sns.pairplot(game_data.iloc[:,varset_1], hue='result')

In [None]:
# Keep the variables selected from the first pairplot group
numeric_vars = [*numeric_vars, 'season', 'apy_cap_pct', 'years_left']

In [None]:
# Column positions 21-29; the set removes the duplicate 26 that the range already contains
varset_2 = list(set([26, *range(21, 30)]))
#sns.pairplot(game_data.iloc[:,varset_2], hue='result')

In [None]:
# Keep the variables selected from the second pairplot group
numeric_vars = [*numeric_vars, 'favored', 'rolling_win_pct', 'rolling_off_ppg', 'rolling_off_pypg']

In [None]:
# Column positions for the third pairplot: position 26 plus positions 31-39
varset_3 = [26, *range(31, 40)]
#sns.pairplot(game_data.iloc[:,varset_3], hue='result')

In [None]:
# Keep the variables selected from the third pairplot group.
# 'rolling_off_typg' is spelled as in the modeling column list; the earlier
# 'roll_off_typg' spelling matched no other column name in this notebook.
numeric_vars = numeric_vars + ['rolling_off_typg', 'rolling_off_ptdpg', 'rolling_off_rtdpg', 'rolling_off_ttdpg', 'rolling_def_rypg']

In [None]:
# Column positions for the fourth pairplot: position 26 plus positions 41-49
varset_4 = [26, *range(41, 50)]
#sns.pairplot(game_data.iloc[:,varset_4], hue='result')

In [None]:
# Keep the variables selected from the fourth pairplot group
numeric_vars = [
    *numeric_vars,
    'rolling_def_ttdpg',
    'rolling_avg_air_yards_differential',
    'rolling_avg_attempts',
    'rolling_avg_pass_yards',
    'rolling_avg_pass_touchdowns',
    'rolling_avg_interceptions',
]

In [None]:
# Column positions for the fifth pairplot: position 26 plus positions 51-59
varset_5 = [26, *range(51, 60)]
#sns.pairplot(game_data.iloc[:,varset_5], hue='result')

In [None]:
# Keep the variable selected from the fifth pairplot group
numeric_vars = [*numeric_vars, 'prev_season_win_pct']

In [None]:
# Column positions for the sixth pairplot: position 26 plus positions 61-69
varset_6 = [26, *range(61, 70)]
#sns.pairplot(game_data.iloc[:,varset_6], hue='result')

In [None]:
# Keep the previous-season offensive variables selected from the sixth pairplot group
numeric_vars = [
    *numeric_vars,
    'prev_season_off_plays_per_game',
    'prev_season_off_run_pct',
    'prev_season_off_pass_pct',
    'prev_season_off_pypg',
    'prev_season_off_rypg',
    'prev_season_off_typg',
    'prev_season_off_ptdpg',
    'prev_season_off_rtdpg',
    'prev_season_off_ttdpg',
]

In [None]:
# Column positions for the seventh pairplot: position 26 plus positions 71-85
varset_7 = [26, *range(71, 86)]
#sns.pairplot(game_data.iloc[:,varset_7], hue='result')

In [None]:
# Keep the previous-season variables selected from the seventh pairplot group
numeric_vars = [
    *numeric_vars,
    'prev_season_off_spg',
    'prev_season_off_ipg',
    'prev_season_def_ppg',
    'prev_season_def_plays_per_game',
    'prev_season_def_pypg',
    'prev_season_def_typg',
    'prev_season_def_spg',
    'prev_season_def_ipg',
]

In [None]:
# Show the full list accumulated across the selection cells above
print("Numeric Variables that show variation by game outcome: ", numeric_vars)

### Categorical Variable Selection

In [None]:
# Preview the first rows to inspect the available columns
game_data.head()

## Split into Train, Test, Validation Sets
- 80-20 split
- Validation year will be 2023

In [None]:
# Get Target Columns
target_columns = ['result']

# Hold out the 2023 season as the validation set
validation_season = game_data[game_data['season'] == 2023]
validation_season_x = validation_season.drop(columns=target_columns)
validation_season_y = validation_season.result

# Every other season is available for training and testing
modeling_seasons = game_data[game_data['season'] != 2023]

# Separate features from the target
modeling_data_x = modeling_seasons.drop(columns=target_columns)
modeling_data_y = modeling_seasons.result

# 80-20 train/test split. Only test_size is given so the training set is the
# full complement; the previous train_size=0.8 / test_size=0.1 combination
# silently discarded 10% of the modeling rows.
X_train, X_test, y_train, y_test = train_test_split(
    modeling_data_x, modeling_data_y, test_size=0.2, random_state=42
)

modeling_data_span = sorted(set(modeling_data_x.season))
validation_data_span = sorted(set(validation_season_x.season))

print("Modeling Seasons: ", modeling_data_span)
print("Validation Seasons: ", validation_data_span)

In [None]:
# Report the size of each split
print(f"Training Data Dimensions: {X_train.shape}")
print(f"Test Data Dimensions: {X_test.shape}")

# Features and targets must have the same number of rows in each split
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

In [None]:
# Histogram of the training target; the trailing semicolon suppresses the cell's text output
plt.hist(y_train);

## Define Columns for Modeling

In [None]:
# Candidate numeric feature names.
# NOTE(review): this list is not referenced again in the visible notebook —
# numeric_columns below is derived from X_train instead; confirm whether
# modeling_columns is still needed.
modeling_columns = ['apy_cap_pct', 'years_left', 'spread_line',
                   'rolling_win_pct', 'rolling_off_ppg', 'rolling_off_pypg',
                   'rolling_off_rypg', 'rolling_off_typg', 'rolling_off_ptdpg',
                   'rolling_off_rtdpg', 'rolling_off_ttdpg', 'rolling_def_ppg',
                   'rolling_def_pypg', 'rolling_def_rypg', 'rolling_def_typg',
                   'rolling_def_ptdpg', 'rolling_def_rtdpg', 'rolling_def_ttdpg',
                   'rolling_avg_time_to_throw', 'rolling_avg_completed_air_yards',
                   'rolling_avg_intended_air_yards', 'rolling_avg_air_yards_differential',
                   'rolling_avg_attempts', 'rolling_avg_pass_yards',
                   'rolling_avg_pass_touchdowns', 'rolling_avg_interceptions',
                   'rolling_avg_passer_rating', 'rolling_avg_completions',
                   'rolling_avg_completion_percentage',
                   'rolling_avg_expected_completion_percentage',
                   'rolling_avg_completion_percentage_above_expectation',
                   'rolling_avg_avg_air_distance', 'rolling_avg_max_air_distance',
                   'rolling_n_on_report', 'rolling_n_on_practice_report',
                   'prev_season_win_pct', 'prev_season_off_ppg',
                   'prev_season_off_plays_per_game', 'prev_season_off_run_pct',
                   'prev_season_off_pass_pct', 'prev_season_off_pypg',
                   'prev_season_off_rypg', 'prev_season_off_typg', 'prev_season_off_ptdpg',
                   'prev_season_off_rtdpg', 'prev_season_off_ttdpg',
                   'prev_season_off_fdpg', 'prev_season_off_spg', 'prev_season_off_ipg',
                   'prev_season_def_ppg', 'prev_season_def_plays_per_game',
                   'prev_season_def_run_pct', 'prev_season_def_pass_pct',
                   'prev_season_def_pypg', 'prev_season_def_rypg', 'prev_season_def_typg',
                   'prev_season_def_ptdpg', 'prev_season_def_rtdpg',
                   'prev_season_def_ttdpg', 'prev_season_def_fdpg', 'prev_season_def_spg',
                   'prev_season_def_ipg']

# Columns removed by the preprocessor before modeling
# (presumably identifiers and post-game scores that would leak the outcome — TODO confirm)
drop_columns = ['game_id', 'team', 'points_scored', 'points_allowed', 'score']

# Get cat columns
cat_columns = ['season', 'week', 'game_type', 'home_away',
               'stadium_id', 'weekday', 'game_window', 'qb', 'coach', 'opposing_qb',
               'opposing_coach', 'home_rest', 'away_rest',
               'div_game', 'roof', 'temp_conditions', 'wind_conditions', 'favored']

# Get numeric columns: every X_train column that is neither dropped nor categorical
non_numeric_columns = drop_columns + cat_columns
numeric_columns = X_train.drop(columns=non_numeric_columns).columns

# The three groups together must account for every column of X_train
assert(len(numeric_columns) + len(drop_columns) + len(cat_columns) == len(X_train.columns))

In [None]:
# Preprocess data for the exploratory PCA

# Keep only rows with no missing numeric values (rows are dropped, not imputed)
PCA_data = modeling_data_x[numeric_columns].dropna()

# Min-max scale those same complete rows
scaler = MinMaxScaler()
scaler.fit(PCA_data)
scaled_modeling_data = pd.DataFrame(
    scaler.transform(PCA_data),
    columns=numeric_columns,
)
# NOTE(review): scaled_modeling_data is not used again in the visible notebook;
# the PCA in the next cell is fit on the unscaled PCA_data. Confirm which of the
# two was intended.

In [None]:
# Fit a 14-component PCA and plot the cumulative explained variance
n_components_explored = 14
pca = PCA(n_components=n_components_explored, random_state=42)
pca.fit(PCA_data)

component_counts = range(1, n_components_explored + 1)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

plt.figure(figsize=(15, 6))
plt.plot(component_counts, cumulative_variance)
plt.xticks(component_counts)
plt.xlabel("number of components")
plt.ylabel("explained variance")
plt.grid();

In [None]:
# Cumulative share of variance explained by the first k components
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Total share of variance explained by all fitted components
pca.explained_variance_ratio_.sum()

### Summary

*Eight* principal components explain ~96% of the variance in the data, so I will use 8 PCs in this analysis.

In [None]:
# Number of principal components passed to PCA in every modeling pipeline below
n_pc = 8

### Define Pipeline

In [None]:
# Numeric features: fill missing values with the column median, then standardise
median_imputer = SimpleImputer(strategy="median")
numeric_transformer = make_pipeline(median_imputer, StandardScaler())

# Categorical features: fill missing values with the literal "missing", then
# one-hot encode into a dense array; categories unseen during fit are ignored
constant_imputer = SimpleImputer(strategy="constant", fill_value="missing")
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_transformer = make_pipeline(constant_imputer, one_hot_encoder)

# Route each column group to its transformer; drop_columns are discarded
column_specs = [
    ("drop", drop_columns),
    (numeric_transformer, numeric_columns),
    (categorical_transformer, cat_columns),
]
preprocessor = make_column_transformer(*column_specs)

In [None]:
def _pca_pipeline(estimator):
    """Chain the shared preprocessor, an n_pc-component PCA and the given estimator."""
    return make_pipeline(preprocessor, PCA(n_components=n_pc), estimator)


# Default (untuned) pipelines, one per candidate model
pipe_logistic = _pca_pipeline(LogisticRegression())
pipe_decision_tree = _pca_pipeline(DecisionTreeClassifier())
pipe_random_forest = _pca_pipeline(RandomForestClassifier())
pipe_naive_bayes = _pca_pipeline(BernoulliNB())
pipe_xgboost = _pca_pipeline(xgb.XGBClassifier())

# Hyperparameter grids. Keys carry the step-name prefix used by the tuning
# pipelines built in the model sections: 'regressor' for logistic regression,
# 'classifier' for every other model.

# Logistic regression
logistic_param_grid = {
    'regressor__C': [0.001, 0.01, 0.1, 1, 10, 100],  # inverse regularization strength
    'regressor__penalty': ['l1', 'l2'],               # penalty norm
    'regressor__solver': ['liblinear', 'saga'],       # optimization algorithm
}

# Decision tree
decision_tree_param_grid = {
    'classifier__max_depth': [None, 5, 10, 15],   # depth limit (None = unlimited)
    'classifier__min_samples_split': [2, 5, 10],  # samples needed to split a node
    'classifier__min_samples_leaf': [1, 2, 4],    # samples needed at a leaf
    'classifier__max_features': ['sqrt', 'log2'], # features considered per split
}

# Random forest
random_forest_param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # number of trees
    'classifier__max_depth': [None, 5, 10, 15],   # depth limit (None = unlimited)
    'classifier__min_samples_split': [2, 5, 10],  # samples needed to split a node
    'classifier__min_samples_leaf': [1, 2, 4],    # samples needed at a leaf
    'classifier__max_features': ['sqrt', 'log2'], # features considered per split
}

# Bernoulli naive Bayes
naive_bayes_param_grid = {
    'classifier__alpha': [0.1, 0.5, 1.0],          # smoothing parameter
    'classifier__binarize': [0.0, 0.1, 0.5],       # threshold for binarizing features
    'classifier__fit_prior': [True, False],        # learn class priors or not
    'classifier__class_prior': [None, [0.3, 0.7]], # fixed class priors, if any
}

# XGBoost
xgboost_param_grid = {
    'classifier__learning_rate': [0.01, 0.1, 0.2],   # boosting learning rate
    'classifier__max_depth': [3, 5, 7],              # maximum tree depth
    'classifier__n_estimators': [100, 200, 300],     # number of boosting rounds
    'classifier__gamma': [0, 0.1, 0.2],              # minimum loss reduction to split a leaf
    'classifier__subsample': [0.8, 0.9, 1.0],        # row subsample ratio per tree
    'classifier__colsample_bytree': [0.8, 0.9, 1.0], # column subsample ratio per tree
}

## Model 1: Logistic Regression

In [None]:
# Create tmp pipeline; the final step is named 'regressor' so the
# 'regressor__*' keys of logistic_param_grid resolve
opt_pipeline = Pipeline([('preprocessor', preprocessor),
                         ('pca', pipe_logistic['pca']),
                         ('regressor', pipe_logistic['logisticregression'])])

# Create the GridSearchCV object
grid_search_logistic = GridSearchCV(opt_pipeline, logistic_param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the GridSearchCV object to the data
grid_search_logistic.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters: ", grid_search_logistic.best_params_)
print("Best score: ", grid_search_logistic.best_score_)

# Define Winning Model.
# params_logistic is read when the final logistic pipeline is built in the
# Model Comparison section, so it must be assigned here (it was previously
# only present as commented-out code, which caused a NameError downstream).
params_logistic = grid_search_logistic.best_params_
# Previously recorded best parameters, kept for reference:
#params_logistic = {'regressor__C': 0.1, 
#                   'regressor__penalty': 'l1', 
#                   'regressor__solver': 'liblinear'}
#print(params_logistic)

## Model 2: Decision Tree

In [None]:
# Tuning pipeline: the final step is named 'classifier' so the
# 'classifier__*' grid keys resolve
opt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pipe_decision_tree.named_steps['pca']),
    ('classifier', pipe_decision_tree.named_steps['decisiontreeclassifier']),
])

# Exhaustive search over the decision-tree grid with 5 cross-validation folds
grid_search_dt = GridSearchCV(
    estimator=opt_pipeline,
    param_grid=decision_tree_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_dt.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters: ", grid_search_dt.best_params_)
print("Best score: ", grid_search_dt.best_score_)

# Define Winning Model
params_dt = grid_search_dt.best_params_
# Previously recorded best parameters, kept for reference:
#params_dt = {'classifier__max_depth': 5, 
#             'classifier__max_features': 'sqrt', 
#             'classifier__min_samples_leaf': 2, 
#             'classifier__min_samples_split': 2}
#print(params_dt)

## Model 3: Random Forest

In [None]:
# Tuning pipeline: the final step is named 'classifier' so the
# 'classifier__*' grid keys resolve
opt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pipe_random_forest.named_steps['pca']),
    ('classifier', pipe_random_forest.named_steps['randomforestclassifier']),
])

# Exhaustive search over the random-forest grid with 5 cross-validation folds
grid_search_rf = GridSearchCV(
    estimator=opt_pipeline,
    param_grid=random_forest_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters: ", grid_search_rf.best_params_)
print("Best score: ", grid_search_rf.best_score_)

# Define Winning Model
params_rf = grid_search_rf.best_params_
# Previously recorded best parameters, kept for reference:
#params_rf = {'classifier__max_depth': 5, 
#             'classifier__max_features': 'log2', 
#             'classifier__min_samples_leaf': 4, 
#             'classifier__min_samples_split': 5, 
#             'classifier__n_estimators': 100}
#print(params_rf)

## Model 4: Naive Bayes - Bernoulli

In [None]:
# Tuning pipeline: the final step is named 'classifier' so the
# 'classifier__*' grid keys resolve
opt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pipe_naive_bayes.named_steps['pca']),
    ('classifier', pipe_naive_bayes.named_steps['bernoullinb']),
])

# Exhaustive search over the naive-Bayes grid with 5 cross-validation folds
grid_search_nb = GridSearchCV(
    estimator=opt_pipeline,
    param_grid=naive_bayes_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_nb.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters: ", grid_search_nb.best_params_)
print("Best score: ", grid_search_nb.best_score_)

# Define Winning Model
params_nb = grid_search_nb.best_params_
# Previously recorded best parameters, kept for reference:
#params_nb = {'classifier__alpha': 0.1, 
#             'classifier__binarize': 0.0, 
#             'classifier__class_prior': None, 
#             'classifier__fit_prior': False}
#print(params_nb)

## Model 5: XGBoost Classifier

In [None]:
# Tuning pipeline: the final step is named 'classifier' so the
# 'classifier__*' grid keys resolve
opt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pipe_xgboost.named_steps['pca']),
    ('classifier', pipe_xgboost.named_steps['xgbclassifier']),
])

# Exhaustive search over the XGBoost grid with 5 cross-validation folds
grid_search_xgb = GridSearchCV(
    estimator=opt_pipeline,
    param_grid=xgboost_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters: ", grid_search_xgb.best_params_)
print("Best score: ", grid_search_xgb.best_score_)

# Define Winning Model
params_xgb = grid_search_xgb.best_params_
# Previously recorded best parameters, kept for reference:
#params_xgb = {'classifier__colsample_bytree': 0.9, 
#              'classifier__gamma': 0.1, 
#              'classifier__learning_rate': 0.01, 
#              'classifier__max_depth': 3, 
#              'classifier__n_estimators': 100, 
#              'classifier__subsample': 0.9}
#print(params_xgb)

## Model Comparison

In [None]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Extra keyword arguments forwarded unchanged to ``cross_validate``
        (for example ``scoring``, ``return_train_score`` or ``cv``)

    Returns
    ----------
        pandas Series indexed by the keys of the ``cross_validate`` result,
        each value formatted as "mean (+/- std)" with three decimals
    """

    # Forward the caller's options; they were previously accepted but dropped,
    # so custom scoring and train scores never reached cross_validate.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair values by iteration rather than by integer key: the Series are
    # indexed by score name, so mean_scores[i] relied on deprecated
    # positional fallback.
    out_col = [
        f"{mean:0.3f} (+/- {std:0.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [None]:
def mape(true, pred):
    """Return the mean absolute percentage error of pred relative to true, in percent."""
    relative_error = (pred - true) / true
    return 100.0 * np.mean(np.abs(relative_error))


# make a scorer function that we can pass into cross-validation
# NOTE: MAPE is a regression metric and divides by the true value, so it is
# undefined whenever the true label is 0. The scorer is kept so the name stays
# available, but it is no longer part of scoring_metrics.
mape_scorer = make_scorer(mape, greater_is_better=False)

# Classification metrics for the game-outcome target. The previous entries
# (negative RMSE, R^2, MAPE) are regression metrics and do not describe
# classifier quality.
# NOTE(review): "roc_auc" assumes a binary target — confirm 'result' only takes two values.
scoring_metrics = {
    "accuracy": "accuracy",
    "f1_macro": "f1_macro",
    "roc_auc": "roc_auc",
}

In [None]:
# Rebuild each model with the hyperparameters selected by its grid search,
# then wrap it in the shared preprocessing + PCA pipeline.

# Logistic Regression
logistic_model = LogisticRegression(
    C=params_logistic['regressor__C'],
    penalty=params_logistic['regressor__penalty'],
    solver=params_logistic['regressor__solver'],
)
LR_pipe = make_pipeline(preprocessor, PCA(n_components=n_pc), logistic_model)

# Decision Tree
decision_tree_model = DecisionTreeClassifier(
    max_depth=params_dt['classifier__max_depth'],
    max_features=params_dt['classifier__max_features'],
    min_samples_leaf=params_dt['classifier__min_samples_leaf'],
    min_samples_split=params_dt['classifier__min_samples_split'],
)
DT_pipe = make_pipeline(preprocessor, PCA(n_components=n_pc), decision_tree_model)

# Random Forest
random_forest_model = RandomForestClassifier(
    max_depth=params_rf['classifier__max_depth'],
    max_features=params_rf['classifier__max_features'],
    min_samples_leaf=params_rf['classifier__min_samples_leaf'],
    min_samples_split=params_rf['classifier__min_samples_split'],
    n_estimators=params_rf['classifier__n_estimators'],
)
RF_pipe = make_pipeline(preprocessor, PCA(n_components=n_pc), random_forest_model)

# Naive Bayes - Bernoulli
naive_bayes_model = BernoulliNB(
    alpha=params_nb['classifier__alpha'],
    binarize=params_nb['classifier__binarize'],
    class_prior=params_nb['classifier__class_prior'],
    fit_prior=params_nb['classifier__fit_prior'],
)
NB_pipe = make_pipeline(preprocessor, PCA(n_components=n_pc), naive_bayes_model)

# XGBoost Classifier
xgboost_model = xgb.XGBClassifier(
    colsample_bytree=params_xgb['classifier__colsample_bytree'],
    gamma=params_xgb['classifier__gamma'],
    learning_rate=params_xgb['classifier__learning_rate'],
    max_depth=params_xgb['classifier__max_depth'],
    n_estimators=params_xgb['classifier__n_estimators'],
    subsample=params_xgb['classifier__subsample'],
)
XGB_pipe = make_pipeline(preprocessor, PCA(n_components=n_pc), xgboost_model)

# Display name -> tuned pipeline, in the order they are compared
models = {
    "Logistic Regression": LR_pipe,
    "Decision Tree": DT_pipe,
    "Random Forest": RF_pipe,
    "Naive Bayes Bernoulli": NB_pipe,
    "XGBoost": XGB_pipe,
}

In [None]:
# Cross-validate every tuned pipeline on the training data and collect the
# formatted score summaries, one column per model
results = {
    name: mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score=True, scoring=scoring_metrics
    )
    for name, model in models.items()
}

pd.DataFrame(results)

## Winning Model: Logistic Regression

### Fit Winning Model

In [None]:
# Fit the tuned logistic regression pipeline on the training split
LR_fit = LR_pipe.fit(X_train, y_train)

### Generate Predictions

In [None]:
# Predict outcomes for the held-back test split
LR_predictions = LR_fit.predict(X_test)

### Evaluation

In [None]:
def evaluate_model(y_true, y_pred):
    """
    Print and return headline classification metrics for a set of predictions.

    Parameters:
    - y_true (array-like): True labels
    - y_pred (array-like): Predicted labels

    Returns:
    - tuple (accuracy, f1, precision, recall) where
        accuracy (float): fraction of correct predictions
        f1 (float): macro-averaged F1 score
        precision (float): precision read from the classification report entry keyed '1'
        recall (float): recall read from the classification report entry keyed '1'
    """
    # Per-class metrics for the class whose label is reported as '1'
    positive_class = classification_report(y_true, y_pred, output_dict=True)['1']

    metrics = (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
        positive_class['precision'],
        positive_class['recall'],
    )

    # One "Name: value" line per metric, two decimals
    for metric_name, value in zip(('Accuracy', 'F1', 'Precision', 'Recall'), metrics):
        print(f'{metric_name}: {value:.2f}')

    return metrics

In [None]:
# Score the test-set predictions; returns (accuracy, f1, precision, recall)
model_performance = evaluate_model(y_test, LR_predictions)

In [None]:
# Display names for the encoded outcomes; the name at position i is used for code i
labels = ['loss', 'win']


def _label_for(code):
    """Return the display name stored at index `code` of `labels`."""
    return labels[code]


# Element-wise version of the lookup, usable on whole arrays
int2label = np.vectorize(_label_for)

# Translate predictions and true values into display names
LR_predictions_labeled = int2label(LR_predictions)
y_test_labeled = int2label(y_test.apply(int))

In [None]:
# Row-normalised confusion matrix (each row of true labels sums to 1).
# labels= pins the row/column order to the same list used for the tick labels,
# so the axes stay correct even if a class is absent from the test data.
conf_mat = confusion_matrix(y_test_labeled,
                            LR_predictions_labeled,
                            labels=labels,
                            normalize='true')

sns.heatmap(conf_mat,
            annot=True,
            xticklabels=labels,
            yticklabels=labels)
plt.title('Logistic Regression Prediction Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## Predict Validation Season

In [None]:
# Hard class predictions for the held-out validation season
predictions_2023 = LR_fit.predict(validation_season_x)
# Per-class probabilities; column 1 is taken as the win probability
# (presumably class 1 = win, matching the ['loss', 'win'] label order — TODO confirm)
probabilites_2023 = LR_fit.predict_proba(validation_season_x)
win_probabilities_2023 = probabilites_2023[:, 1]

In [None]:
# Attach the predictions with assign(), which returns a new DataFrame.
# Assigning columns directly onto validation_season (a filtered slice of
# game_data) triggers pandas' SettingWithCopyWarning.
validation_season = validation_season.assign(
    predicted_result=predictions_2023,
    prediction_confidence=win_probabilities_2023,  # predicted win probability
)

# Keep only the columns needed to review the predictions
output = validation_season.loc[:, ['game_id', 'season', 'week', 'team', 'home_away', 'spread_line',
                                   'points_scored', 'points_allowed', 'favored', 'result',
                                   'predicted_result', 'prediction_confidence']]

In [None]:
# Show the predictions ordered by season, week and team
output.sort_values(by=['season', 'week', 'team'])