# NHL Game Predictor
### Important Notes
* When I use the year, it refers to the year the season ends.
    * For example, the 2024-2025 season is marked as 2025.
* nhl-XX.csv is the score data for a given season.
    * Links to this data are as follows:
      * [2022 Season](https://shanemcd.org/2021/09/23/2021-22-nhl-schedule-and-results-in-excel-xlsx-and-csv-formats/)
      * [2023 Season](https://shanemcd.org/2022/07/12/2022-23-nhl-schedule-and-results-in-excel-xlsx-and-csv-formats/)
      * [2024 Season](https://shanemcd.org/2023/08/23/2023-24-nhl-schedule-and-results-in-excel-xlsx-and-csv-formats/)
      * [2025 Season](https://shanemcd.org/2024/09/07/2024-25-nhl-schedule-and-results-in-excel-xlsx-and-csv-formats/)
      * All are on shanemcd.org. Note that the 2025 season data has to be re-downloaded (as CSV, of course) to get updated results.
* teamsXX.csv is the team data for a given season.
    * The data comes from [MoneyPuck.com](https://moneypuck.com/data.htm)
    * Downloads are from the Team Level

In [None]:
!pip install tensorflow

In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow.keras as keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
%matplotlib inline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from scikeras.wrappers import KerasClassifier
pd.set_option('display.max_columns', None)

### getTeam(abbreviation)
This returns the team name given the abbreviation.

In [40]:
# Lookup table from team abbreviation to full team name.
# abbreviations.csv must provide an "Abbrev" column (used as the key) and a
# "Team" column (used as the value).
abbrev = {
    code: fields["Team"]
    for code, fields in pd.read_csv("abbreviations.csv")
    .set_index("Abbrev")
    .to_dict("index")
    .items()
}
def getTeam(text):
    """Return the full team name for a team abbreviation.

    The lookup is case-insensitive: ``text`` is upper-cased before it is used
    as a key into the module-level ``abbrev`` dictionary. An abbreviation that
    is not in the dictionary raises ``KeyError``.
    """
    return abbrev[text.upper()]

The cell below imports the game data and keeps only the scores and the winner.

In [None]:
def import_games(file):
    """Load one season's results and reduce them to teams, scores and winner.

    Rows whose ``Status`` is ``"Scheduled"`` are dropped. The ``Visitor``,
    ``Score`` and ``Score.1`` columns are renamed to ``Away``, ``AwayScore``
    and ``HomeScore`` (``Score.1`` is presumably the name pandas gives to the
    second ``Score`` column of the CSV -- confirm against the file).

    Returns a DataFrame with the columns ``Away``, ``AwayScore``, ``Home``,
    ``HomeScore`` and ``HomeWin``, where ``HomeWin`` is 1 when the home score
    is strictly greater than the away score and 0 otherwise.
    """
    raw = pd.read_csv(file)
    played = raw.loc[raw["Status"] != "Scheduled"].rename(
        columns={"Score": "AwayScore", "Score.1": "HomeScore", "Visitor": "Away"}
    )
    played["HomeWin"] = (played["HomeScore"] > played["AwayScore"]).astype(int)
    return played[["Away", "AwayScore", "Home", "HomeScore", "HomeWin"]]
# Load the played games of every season (the year is the one the season ends in).
games22, games23, games24, games25 = map(
    import_games,
    ("nhl-22.csv", "nhl-23.csv", "nhl-24.csv", "nhl-25.csv"),
)
games25

In [None]:
def get_records(data):
    """Aggregate per-team season totals from a table of played games.

    Parameters
    ----------
    data : DataFrame
        One row per game with the columns ``Away``, ``AwayScore``, ``Home``,
        ``HomeScore`` and ``HomeWin`` (1 = home team won, 0 = away team won).

    Returns
    -------
    DataFrame
        Indexed by team name, in order of first appearance (the away team of
        a game before its home team), with the columns ``GS`` (goals scored),
        ``GA`` (goals allowed), ``GD`` (``GS - GA``) and ``Win%`` (wins
        divided by games played). An empty ``data`` yields an empty frame
        with those four columns.
    """
    # Per team: [goals scored, goals allowed, wins, games played]
    totals = {}
    for away, away_score, home, home_score, home_win in zip(
        data["Away"], data["AwayScore"], data["Home"], data["HomeScore"], data["HomeWin"]
    ):
        away_rec = totals.setdefault(away, [0, 0, 0, 0])
        home_rec = totals.setdefault(home, [0, 0, 0, 0])

        away_rec[3] += 1
        home_rec[3] += 1
        if home_win == 0:
            away_rec[2] += 1
        if home_win == 1:
            home_rec[2] += 1

        away_rec[0] += away_score
        away_rec[1] += home_score
        home_rec[0] += home_score
        home_rec[1] += away_score

    if not totals:
        return pd.DataFrame(columns=["GS", "GA", "GD", "Win%"])

    # Build the frame once, after all games are counted, rather than once per
    # game inside the loop.
    df = pd.DataFrame.from_dict(totals, orient="index", columns=["GS", "GA", "Wins", "GP"])
    df["Win%"] = df["Wins"] / df["GP"]
    df["GD"] = df["GS"] - df["GA"]
    return df[["GS", "GA", "GD", "Win%"]]
# Season totals (goals scored/allowed, goal difference, win rate) per team.
records22, records23, records24, records25 = map(
    get_records,
    (games22, games23, games24, games25),
)
records25

In [45]:
def import_teams(file):
    """Load one season of team-level statistics.

    Only rows whose ``situation`` is ``"all"`` are kept. The ``team.1``,
    ``position``, ``situation``, ``season`` and ``name`` columns are removed,
    the ``team`` abbreviations are replaced by what ``getTeam`` returns for
    them, and the result is renumbered with a fresh 0..n-1 index.
    """
    raw = pd.read_csv(file)
    teams = raw.loc[raw["situation"] == "all"].drop(
        columns=["team.1", "position", "situation", "season", "name"]
    )
    teams["team"] = teams["team"].apply(getTeam)
    return teams.reset_index().drop(columns=["index"])

In [None]:
def add_records(teamdata, recorddata):
    """Add the ``GS``, ``GA``, ``GD`` and ``Win%`` columns to ``teamdata``.

    The four columns are inserted in place, directly after the first column
    and in that order, initialised to zero. For every team in the index of
    ``recorddata``, the rows of ``teamdata`` whose ``team`` matches receive
    that team's values; teams without a record keep the zeros.

    ``teamdata`` is modified in place and also returned. Because the columns
    are created with ``DataFrame.insert``, calling this twice on the same
    frame raises ``ValueError``.
    """
    # Each insert goes to position 1, so insert in reverse of the final order.
    for column, initial in (("Win%", 0.0), ("GD", 0), ("GA", 0), ("GS", 0)):
        teamdata.insert(1, column, initial)

    for team_name in recorddata.index:
        record = recorddata.loc[team_name]
        is_team = teamdata["team"] == team_name
        for column in ("GS", "GA", "GD", "Win%"):
            teamdata.loc[is_team, column] = record[column]
    return teamdata

# Load each season's team statistics ...
teamdata22, teamdata23, teamdata24, teamdata25 = map(
    import_teams,
    ("teams22.csv", "teams23.csv", "teams24.csv", "teams25.csv"),
)
# ... and attach the matching season records (add_records works in place).
for season_teams, season_records in (
    (teamdata22, records22),
    (teamdata23, records23),
    (teamdata24, records24),
    (teamdata25, records25),
):
    add_records(season_teams, season_records)
teamdata25

In [None]:
def set_game_stats(games, teams):
    """Attach home-minus-away team statistic differences to every game.

    Parameters
    ----------
    games : DataFrame
        One row per game with at least the ``Away`` and ``Home`` columns.
    teams : DataFrame
        One row per team with a ``team`` column; every other column is
        treated as a statistic.

    Returns
    -------
    DataFrame
        The columns of ``games`` followed by one column per statistic, each
        holding the home team's value minus the away team's value. The index
        of ``games`` is preserved.

    Raises
    ------
    ValueError
        If ``teams`` lists a team more than once.
    KeyError
        If a game references a team that is not present in ``teams``.
    """
    team_stats = teams.set_index("team")
    if not team_stats.index.is_unique:
        raise ValueError("teams must contain exactly one row per team")

    # Look up all games at once rather than filtering ``teams`` and building
    # a one-row frame per game. ``.loc`` with a list of labels raises KeyError
    # on an unknown team name.
    home_values = team_stats.loc[games["Home"].tolist()].to_numpy()
    away_values = team_stats.loc[games["Away"].tolist()].to_numpy()

    diff = pd.DataFrame(
        home_values - away_values,
        columns=team_stats.columns,
        index=games.index,
    )
    return pd.concat([games, diff], axis=1)
# Home-minus-away stat differences for every game of every season.
gameStats22, gameStats23, gameStats24, gameStats25 = map(
    set_game_stats,
    (games22, games23, games24, games25),
    (teamdata22, teamdata23, teamdata24, teamdata25),
)
# Stack the seasons into one table with a fresh 0..n-1 index.
season_frames = [gameStats22, gameStats23, gameStats24, gameStats25]
all_game_stats = pd.concat(season_frames, ignore_index=True)
all_game_stats

In [48]:
def getX(gameStats, filters):
    """Return the ``filters`` columns of ``gameStats``, standardised per row.

    The selected columns are transposed before ``StandardScaler`` is applied
    and transposed back afterwards, so each game (row) is scaled across its
    own feature values rather than each feature across all games. The result
    is a new DataFrame with the columns ``filters`` and a fresh default index.

    NOTE(review): row-wise scaling is what lets single-row inputs be scaled
    without a stored scaler, but it mixes features of different units --
    confirm this is intended.
    """
    selected = gameStats[filters]
    scaled_by_game = StandardScaler().fit_transform(selected.T)
    return pd.DataFrame(scaled_by_game.T, columns=filters)
def getY(gameStats):
    """Return the ``HomeWin`` column of ``gameStats`` as the target Series."""
    return gameStats.loc[:, "HomeWin"]

## DNN Model

In [None]:
# Columns of the per-game difference table that are fed to the network.
filter_dnn = [
    "GS", "GA", "GD", "Win%", "xGoalsPercentage", "corsiPercentage", "fenwickPercentage",
    "shotAttemptsFor", "shotAttemptsAgainst", "xReboundsFor", "xReboundsAgainst",
    "xOnGoalFor", "xOnGoalAgainst", "xGoalsFor", "xGoalsAgainst", "shotsOnGoalFor",
    "shotsOnGoalAgainst", "blockedShotAttemptsFor", "xGoalsFromxReboundsOfShotsFor",
    "unblockedShotAttemptsAgainst", "blockedShotAttemptsAgainst", "takeawaysFor", "takeawaysAgainst",
    "penaltiesFor", "faceOffsWonFor"
]
# NOTE(review): this scaler is not referenced again in the visible cells;
# getX creates its own StandardScaler on every call.
scaler = StandardScaler()
# Feature matrix (scaled by getX) and 0/1 home-win target for all seasons.
x = getX(all_game_stats, filter_dnn)
y = getY(all_game_stats)
# 75/25 train/test split on the raw numpy arrays.
# NOTE(review): no random_state is passed, so the split differs between runs.
x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size=0.25)
x

In [113]:
def create_dnn_model(X_train, y_train, X_test, y_test, hidden=(64, 32, 16), epochs=150, batch_size=64):
    """Build, train and evaluate a dense binary classifier.

    The network is a ``Flatten`` layer followed by one ReLU ``Dense`` layer
    plus 20% ``Dropout`` per entry of ``hidden`` and a single sigmoid output
    unit. It is trained with binary cross-entropy, Adam (learning rate 0.001)
    and balanced class weights computed from ``y_train``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and 0/1 labels.
    X_test, y_test : array-like
        Held-out features and labels. They are used as Keras validation data
        during fitting and for the reported test accuracy.
    hidden : sequence of int
        Width of each hidden layer, in order.
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Mini-batch size.

    Returns
    -------
    tuple
        ``(history, model, test_acc)`` where ``test_acc`` is the test
        accuracy in percent.

    Side effects: prints the model summary and both accuracies, and shows a
    confusion-matrix plot for the test set.

    NOTE(review): the test set doubles as the validation set that drives
    early stopping, best-weight restoration and the learning-rate schedule,
    so the reported test accuracy is optimistic. A separate validation split
    would need a signature change.
    """
    model = Sequential()
    model.add(Flatten())
    for n in hidden:
        model.add(Dense(n, activation='relu'))
        model.add(Dropout(.2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(learning_rate=.001),
                  metrics=['accuracy'])

    # Key each weight by the label it was computed for, not by its position
    # in the weight array.
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}

    early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=100, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.5, patience=25, min_lr=.0001, verbose=1)

    history = model.fit(x=X_train, y=y_train,
                        validation_data=(X_test, y_test),
                        batch_size=batch_size, epochs=epochs,
                        verbose=1, callbacks=[early_stopping, lr_scheduler],
                        class_weight=class_weights)
    model.summary()

    # Threshold at 0.5, the same rule the prediction cells apply, and flatten
    # the (n, 1) output so it matches the 1-D labels.
    train_pred = (model.predict(X_train) >= 0.5).astype(int).ravel()
    train_acc = 100. * accuracy_score(y_train, train_pred)
    print("Accuracy on train set: {:.2f}%".format(train_acc))

    test_pred = (model.predict(X_test) >= 0.5).astype(int).ravel()
    test_acc = 100. * accuracy_score(y_test, test_pred)
    print("Accuracy on test set: {:.2f}%".format(test_acc))

    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, test_pred), display_labels=["Away Win", "Home Win"])
    cm_display.plot()
    plt.show()

    return history, model, test_acc

In [None]:
# Retrain from scratch until one run reaches the target test accuracy.
# The number of attempts is capped so the cell always terminates; if the
# target is never reached, the best run seen is kept in
# ``history`` / ``model`` / ``test_acc``.
# NOTE(review): picking a run by its test accuracy makes that accuracy an
# optimistic estimate of real-world performance.
target_acc = 64.8
max_attempts = 50
test_acc = 0
for attempt in range(1, max_attempts + 1):
    run_history, run_model, run_acc = create_dnn_model(x_train, y_train, x_test, y_test, hidden=[128, 96, 64, 32, 16, 4], epochs=150, batch_size=128)
    if attempt == 1 or run_acc > test_acc:
        history, model, test_acc = run_history, run_model, run_acc
    if not (test_acc < target_acc):
        break
else:
    print("Target accuracy {:.1f}% not reached in {} attempts; keeping best run ({:.2f}%)".format(target_acc, max_attempts, test_acc))


In [122]:
def getTeam(team, stats=None):
    """Return the rows of ``stats`` for ``team``, or resolve an abbreviation.

    Parameters
    ----------
    team : str
        Full team name when ``stats`` is given; a team abbreviation when it
        is not.
    stats : DataFrame, optional
        Team table with a ``team`` column.

    Returns
    -------
    DataFrame or str
        With ``stats``: the rows whose ``team`` equals ``team`` (empty when
        there is no match). Without ``stats``: the full team name from the
        module-level ``abbrev`` dictionary, looked up case-insensitively.

    This definition replaces the earlier one-argument ``getTeam`` that
    ``import_teams`` passes to ``Series.apply``. Keeping the one-argument
    form working means ``import_teams`` can still be re-run after this cell.
    """
    if stats is None:
        return abbrev[team.upper()]
    return stats.loc[stats["team"] == team]

def getGameSet(awayTeam, homeTeam):
    """Return the home-minus-away statistic differences for one matchup.

    Both arguments are team tables with a ``team`` column. That column is
    dropped and the remaining values are subtracted element-wise (home minus
    away). The result uses the home table's remaining column names and a
    fresh default index.
    """
    home_stats = homeTeam.drop(columns=["team"])
    away_stats = awayTeam.drop(columns=["team"])
    difference = home_stats.to_numpy() - away_stats.to_numpy()
    return pd.DataFrame(difference, columns=home_stats.columns)

In [None]:
import pickle

# To save a freshly trained model, run once:
#     with open("dnn.pkl", "wb") as model_file:
#         pickle.dump(model, model_file)

# Load the previously saved model. The ``with`` block closes the file handle
# as soon as loading finishes.
# NOTE(review): unpickling can execute arbitrary code -- only load a dnn.pkl
# that you created yourself.
with open("dnn.pkl", "rb") as model_file:
    test = pickle.load(model_file)

In [None]:
# Predict a single matchup from the 2025 team table.
away = getTeam("Los Angeles Kings", teamdata25)
home = getTeam("New York Rangers", teamdata25)
gameSet = getGameSet(away, home)
prediction = getX(gameSet, filter_dnn)

# Use newly generated model (1 = home win, 0 = away win)
result_new = (model.predict(prediction.values) >= 0.5).astype(int)
print(result_new)

# Use saved model
result_saved = (test.predict(prediction.values) >= 0.5).astype(int)
print(result_saved)


In [None]:
# Matchups to predict: aways[k] plays at homes[k].
aways = ["Ottawa Senators","Chicago Blackhawks","Los Angeles Kings","Philadelphia Flyers","Vegas Golden Knights","Anaheim Ducks","Buffalo Sabres","Montreal Canadiens","Pittsburgh Penguins","Toronto Maple Leafs","St. Louis Blues","Nashville Predators","Boston Bruins","Florida Panthers","Tampa Bay Lightning","Utah Hockey Club"]
homes = ["Carolina Hurricanes","New Jersey Devils","New York Rangers","Minnesota Wild","Edmonton Oilers","Columbus Blue Jackets","Washington Capitals","Winnipeg Jets","Ottawa Senators","Detroit Red Wings","Dallas Stars","Colorado Avalanche","Vancouver Canucks","Calgary Flames","Seattle Kraken","San Jose Sharks"]
# The two lists are paired position by position, so a length mismatch means
# the schedule was entered incorrectly.
if len(aways) != len(homes):
    raise ValueError("aways and homes must contain the same number of teams")

year = teamdata25
cols_to_move = ["Away", "Home", "Prediction"]
p = []
for away_name, home_name in zip(aways, homes):
    a = getTeam(away_name, year)
    h = getTeam(home_name, year)
    gS = getGameSet(a, h)
    t = getX(gS, filter_dnn)
    t["Away"] = away_name
    t["Home"] = home_name
    t["Prediction"] = -1  # placeholder, overwritten with the model output below
    # Put the identifying columns first, followed by the feature columns.
    t = t[cols_to_move + [col for col in t.columns if col not in cols_to_move]]
    p.append(t)

predictions = pd.concat(p, ignore_index=True)
# Score all matchups with the saved model in one call. The raw model output
# is stored as-is (no 0.5 threshold is applied here).
results = test.predict(predictions[filter_dnn].values)
predictions["Prediction"] = results
print(results)
predictions