# Import Dependencies and data 
#### Read the data using pandas 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from sklearn import tree

In [None]:
# Per-season NHL odds CSV files, listed in chronological order
season_csv_paths = [
    "../Data/nhl odds 2014-15.csv",
    "../Data/nhl odds 2015-16.csv",
    "../Data/nhl odds 2016-17.csv",
    "../Data/nhl odds 2017-18.csv",
    "../Data/nhl odds 2018-19.csv",
    "../Data/nhl odds 2019-20.csv",
]

# Read each season into its own DataFrame
data1, data2, data3, data4, data5, data6 = [
    pd.read_csv(csv_path) for csv_path in season_csv_paths
]

# Only the 2019-20 frame has these three headers renamed to the spaced form
# (presumably so it matches the other seasons' column names -- confirm against the files)
data6 = data6.rename(
    columns={
        "PuckLine": "Puck Line",
        "OpenOU": "Open OU",
        "CloseOU": "Close OU",
    }
)

# All seasons, oldest first, ready to be cleaned one at a time
frames = [data1, data2, data3, data4, data5, data6]


# Merge all the different seasons 

In [None]:
def set_win(row):
    """Return 1 when the row's 'Point Dif' is strictly positive, otherwise 0."""
    return 1 if row['Point Dif'] > 0 else 0

def set_win_pl(row):
    """Return 1 when the row counts as a puck-line win, otherwise 0.

    A row wins when its 'Point Dif' is at least 2, or when its 'Puck Line'
    is exactly 1.5 and its 'Point Dif' is at least -1.
    """
    margin = row['Point Dif']
    if margin >= 2:
        return 1

    # Both conditions are evaluated, as in a non-short-circuit `&`
    within_one = margin >= -1
    has_plus_line = row['Puck Line'] == 1.5
    if within_one and has_plus_line:
        return 1
    return 0

def data_clean(df):
    """Normalise one season of odds data and add the derived per-row columns.

    The frame must hold two consecutive rows per game; each row's 'Final'
    score is differenced against the other row of its pair.  The input frame
    itself is left untouched (all work happens on a copy).

    Columns appended, in this order:
      * 'Point Dif' -- this row's 'Final' minus the paired row's 'Final'.
      * 'Line Mvmt' -- 'Open' minus 'Close'.
      * 'Win'       -- 1 when 'Point Dif' > 0, else 0 (same rule as set_win).
      * 'Win_PL'    -- 1 when 'Point Dif' >= 2, or when 'Point Dif' >= -1 and
                       'Puck Line' == 1.5; else 0 (same rule as set_win_pl).
      * 'net_goals' -- sum of the team's 'Point Dif' over its earlier rows in
                       this frame (0 on the team's first row).

    The index is reset twice, so the result starts with a 'level_0' and an
    'index' column.  That layout is kept on purpose: the cell that
    concatenates the seasons drops the first two columns by position.

    Parameters
    ----------
    df : pandas.DataFrame
        One season of data with at least the columns 'Team', 'Final', 'Open',
        'Close' and 'Puck Line'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns described above.

    Raises
    ------
    ValueError
        If the frame has an odd number of rows (a game is missing a row), or
        if 'Point Dif' cannot be converted to integers.
    """
    team_name_fixes = {
        'Arizonas': 'Arizona',
        'NewJersey': 'New Jersey',
        'NYIslanders': 'NY Islanders',
        'NYRangers': 'NY Rangers',
        'SanJose': 'San Jose',
        'St.Louis': 'St. Louis',
        'TampaBay': 'Tampa Bay',
        'LosAngeles': 'Los Angeles',
    }
    df = df.replace({'Team': team_name_fixes})

    n_rows = len(df)
    if n_rows % 2:
        raise ValueError(
            f"data_clean expects two rows per game but got an odd number of rows ({n_rows})"
        )

    # Pair up rows (0, 1), (2, 3), ... and give each row its own score margin
    final = df['Final'].to_numpy()
    first_row, second_row = final[0::2], final[1::2]
    point_dif = np.empty_like(final)
    point_dif[0::2] = first_row - second_row
    point_dif[1::2] = second_row - first_row
    df['Point Dif'] = point_dif
    df = df.reset_index()

    # How far the line moved between open and close
    df['Line Mvmt'] = df['Open'] - df['Close']
    # Second reset on purpose: it produces the 'level_0' column callers drop by position
    df = df.reset_index()

    margin = df['Point Dif']
    plus_line_cover = (margin >= -1) & (df['Puck Line'] == 1.5)
    df = df.assign(
        Win=(margin > 0).astype('int64'),
        Win_PL=((margin >= 2) | plus_line_cover).astype('int64'),
    )

    # Running total of each team's margin *before* the current row:
    # inclusive cumulative sum minus the row's own contribution.
    goal_dif = df['Point Dif'].astype('int64')
    df['net_goals'] = goal_dif.groupby(df['Team']).cumsum() - goal_dif
    return df

In [None]:
# Clean every season on its own, then stack the seasons into one frame
new_frames = [data_clean(season) for season in frames]
df = pd.concat(new_frames)

# data_clean resets the index twice, leaving two index columns at the front;
# remove them by position
cols = [0, 1]
df = df.drop(columns=df.columns[cols])
df

In [None]:
# For every row, record how many wins the team had on the rows above it.
# NOTE(review): the tally is never reset between seasons (df is the
# concatenation of all seasons), so despite the column name this is a running
# total over the whole frame -- confirm that is intended.
wd = {}   # team -> wins counted so far
w = []    # wins-before-this-row, one entry per row of df

for team, won in zip(df['Team'], df['Win']):
    wins_so_far = wd.get(team, 0)
    w.append(wins_so_far)
    wd[team] = wins_so_far + int(won)

df['season_wins'] = w

In [None]:
# Save the merged frame (all columns) to data.csv in the working directory
df.to_csv('data.csv')

# Cleaning Up the Dataframe

In [None]:
# Positions of the columns to discard. They refer to the column layout at this
# point in the notebook, so running this cell a second time would drop a
# different set of columns.
cols = [0, 1, 3, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16]
df = df.drop(columns=df.columns[cols])
df

In [None]:
# Encode 'VH' ('V' -> 0, 'H' -> 1), give the model columns short names, keep
# only those columns in a fixed order, and discard rows with missing values.
df = (
    df.replace({'VH': {'V': 0, 'H': 1}})
      .rename(columns={"VH": "Home", "Close": "ML", "Unnamed: 11": "PL"})
      [["Home", "ML", "PL", "Line Mvmt", "net_goals", "season_wins", "Win", "Win_PL"]]
      .dropna()
)

df

In [None]:
# Save the model-ready frame; the row index is written as the first (unnamed) CSV column
df.to_csv('final_df.csv')

In [2]:
# final_df.csv was written with its row index as the first column. Read that
# column back as the index; otherwise it comes back as an 'Unnamed: 0' data
# column and is silently used as an extra model feature.
df = pd.read_csv('final_df.csv', index_col=0)

# Random Forest Model

In [3]:
# Class labels in the order of the encoded target (0 -> "loss", 1 -> "win")
target_names = ["loss", "win"]

# Column to predict: the straight-up result.
# Use df["Win_PL"] here instead to model the puck-line result.
target = df["Win"]

In [4]:
# Features are every column except the two outcome columns
outcome_columns = ['Win', 'Win_PL']
data = df.drop(columns=outcome_columns)
feature_names = data.columns

In [5]:
from sklearn.model_selection import train_test_split

# Hold out a test set; the fixed random_state keeps the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters to search: forest size and split-quality criterion
param_grid = {
    'n_estimators': [170, 180, 190, 200],
    'criterion': ['gini', 'entropy']
}

# Seed the forest so the search picks the same best parameters on every run;
# without it the bootstrap samples and feature subsets differ between runs.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    verbose=True,
    n_jobs=-1,
)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
# Show the parameter combination the grid search selected
print(grid.best_params_)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a forest with fixed hyper-parameters and report accuracy on the held-out set.
# random_state is fixed so the score and the feature importances are reproducible.
rf = RandomForestClassifier(n_estimators=180, criterion='entropy', random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Pair each importance score with its feature name and list them from largest to smallest
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Deep Network

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the labels to match the two-unit softmax output
# and the categorical_crossentropy loss used below
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Two hidden ReLU layers followed by a two-unit softmax output.
# The input width is taken from the training matrix rather than hard-coded,
# so the network always matches the number of feature columns it is fitted on.
deep_model = Sequential()
deep_model.add(Dense(units=18, activation='relu', input_dim=X_train.shape[1]))
deep_model.add(Dense(units=9, activation='relu'))
deep_model.add(Dense(units=2, activation='softmax'))

In [None]:
# Configure training: Adam optimiser, cross-entropy over the one-hot labels, accuracy as the reported metric
deep_model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

In [None]:
# Train for 50 epochs on the training split, reshuffling the samples each epoch
deep_model.fit(
    X_train,
    y_train_categorical,
    epochs=50,
    shuffle=True,
    verbose=2
)

In [None]:
# Score the trained network on the held-out split and print its loss and accuracy
model_loss, model_accuracy = deep_model.evaluate(
    X_test, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
import pandas as pd
import re

In [None]:
# NOTE(review): this is a pro-football-reference page, not part of the NHL odds
# data used above -- looks like leftover scratch work; confirm whether to keep.
url = 'https://www.pro-football-reference.com/years/2007/games.htm'

In [None]:
# Parse every HTML table found at `url` into a list of DataFrames and display the list
tables = pd.read_html(url)
tables