In [1]:
!pip install tensorflow



In [2]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the match/team CSV from the working directory into the base dataframe

df = pd.read_csv(filepath_or_buffer="match_team_data.csv")

In [4]:
# Create a binary target column: 1 when the home score is strictly greater than the away score, 0 otherwise.
# Ties therefore map to 0 as well; no separate tie code (-1) is produced by this cell.

# Rows with a missing score also receive 0 here, because comparisons against NaN evaluate to False.
# NOTE(review): an earlier note cited XGBoost's need for binary labels; in this notebook the label
# feeds a sigmoid / binary_crossentropy network, which likewise expects 0/1 targets.

df["home_win"] = np.where(df["home_score"] > df["away_score"], 1,0)

In [5]:
# Fraction of missing values per column, showing the ten columns with the most NaNs
(
    df.isna()
    .mean()
    .sort_values(ascending=False)
    .iloc[:10]
)

team_2_post_season_result                1.000000
team_1_post_season_result                1.000000
winning_abbr                             0.079338
winning_name                             0.079338
losing_name                              0.079338
losing_abbr                              0.079338
away_score                               0.075625
home_score                               0.075625
team_2_pass_completions                  0.000000
team_2_offensive_simple_rating_system    0.000000
dtype: float64

In [6]:
# Boolean mask of the rows whose winning_name is missing
df.loc[:, "winning_name"].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
2957     True
2958     True
2959     True
2960     True
2961     True
Name: winning_name, Length: 2962, dtype: bool

In [7]:
# Drop the two post-season result columns, which the NaN check reported as entirely missing
df = df.drop(
    ["team_2_post_season_result", "team_1_post_season_result"],
    axis="columns",
)

In [8]:
# Keep only the rows that have no missing value in any remaining column
df3 = df.dropna(axis=0, how="any")

In [9]:
# Display the frame that remains after dropping rows with missing values
df3

Unnamed: 0,boxscore,away_name,away_abbr,away_score,home_name,home_abbr,home_score,winning_name,winning_abbr,losing_name,...,team_2_rush_yards_per_attempt,team_2_simple_rating_system,team_2_strength_of_schedule,team_2_turnovers,team_2_win_percentage,team_2_wins,team_2_yards,team_2_yards_from_penalties,team_2_yards_per_play,home_win
0,201209050nyg,Dallas Cowboys,dal,24.0,New York Giants,nyg,17.0,Dallas Cowboys,dal,New York Giants,...,4.7,-4.2,-2.2,2,0.667,2,972,188,5.3,0
1,201209090chi,Indianapolis Colts,clt,21.0,Chicago Bears,chi,41.0,Chicago Bears,chi,Indianapolis Colts,...,4.0,-4.8,2.2,6,0.500,1,994,153,4.8,1
2,201209090cle,Philadelphia Eagles,phi,17.0,Cleveland Browns,cle,16.0,Philadelphia Eagles,phi,Cleveland Browns,...,4.4,38.9,26.9,1,1.000,3,1341,173,6.4,0
3,201209090det,St. Louis Rams,ram,23.0,Detroit Lions,det,27.0,Detroit Lions,det,St. Louis Rams,...,3.4,-9.0,-6.0,7,0.667,2,919,126,5.3,1
4,201209090htx,Miami Dolphins,mia,10.0,Houston Texans,htx,30.0,Houston Texans,htx,Miami Dolphins,...,3.3,12.4,6.1,2,1.000,3,1066,91,6.4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2733,202209250crd,Los Angeles Rams,ram,20.0,Arizona Cardinals,crd,12.0,Los Angeles Rams,ram,Arizona Cardinals,...,3.4,-9.0,-6.0,7,0.667,2,919,126,5.3,0
2734,202209250sea,Atlanta Falcons,atl,27.0,Seattle Seahawks,sea,23.0,Atlanta Falcons,atl,Seattle Seahawks,...,4.9,-11.3,-10.9,7,0.333,1,1063,99,5.9,0
2735,202209250tam,Green Bay Packers,gnb,14.0,Tampa Bay Buccaneers,tam,12.0,Green Bay Packers,gnb,Tampa Bay Buccaneers,...,4.7,12.5,11.5,5,0.667,2,1067,100,5.7,0
2736,202209250den,San Francisco 49ers,sfo,10.0,Denver Broncos,den,11.0,Denver Broncos,den,San Francisco 49ers,...,4.5,-4.8,-8.2,5,0.333,1,971,165,5.1,1


In [10]:
# Class balance of the target: number of home wins (1) versus non-wins (0)
df3.loc[:, "home_win"].value_counts()


1    1520
0    1207
Name: home_win, dtype: int64

In [11]:
# List the last 15 columns after sorting by dtype, to spot non-numeric (object) columns

sorted_dtypes = df3.dtypes.sort_values()
sorted_dtypes.iloc[-15:]

team_1_points_contributed_by_offense    float64
away_name                                object
away_abbr                                object
home_name                                object
home_abbr                                object
team_2_abbreviation                      object
losing_name                              object
winning_abbr                             object
losing_abbr                              object
teams                                    object
team_1_abbreviation                      object
team_2_name                              object
team_1_name                              object
winning_name                             object
boxscore                                 object
dtype: object

In [12]:
# Remove the object-dtype (text) columns reported by the dtype check, leaving only numeric columns.
# df3 is re-bound to the result instead of using inplace=True: df3 was derived from df via
# dropna(), and mutating it in place is what raised pandas' SettingWithCopyWarning.

text_columns = ["away_name","away_abbr","home_name","home_abbr","team_2_abbreviation","losing_name","winning_abbr","losing_abbr","teams","team_1_abbreviation","team_2_name","team_1_name","winning_name","boxscore"]
df3 = df3.drop(columns=text_columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [13]:
# Remove columns that must not be used as features.
# home_score and away_score are the inputs home_win was computed from, so keeping them would
# leak the label into the features.
# NOTE(review): year and week are presumably dropped as non-predictive calendar fields - confirm.
# df3 is re-bound rather than mutated with inplace=True, so no view/copy warning can be raised.

df3 = df3.drop(columns=["year","week","home_score","away_score"])

In [14]:
# Separate the feature matrix (every column except the label) from the label vector

X = df3.drop(columns=["home_win"])
y = df3.loc[:, "home_win"]

In [15]:
# Split into train and test sets using the default split size; random_state fixes the shuffle

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [16]:
# Standardising scaler (centres each feature and scales it to unit variance)
scaler = StandardScaler(with_mean=True, with_std=True)

In [17]:
# Learn the scaling statistics from the training split only, then apply them to both splits
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [18]:
# Create an empty Sequential model; its layers are appended one at a time in the cells below
nn = Sequential()

In [19]:
# Width of the input layer = number of feature columns.
# Read from the frame's shape instead of materialising the first row, so this also works
# when the training frame has no rows.
number_input_features = X_train.shape[1]

In [20]:
# A single output unit: the network predicts one value for the binary home_win label
number_output_neurons = 1

In [21]:
# First hidden layer width: floor of the midpoint between the input width and the output width
hidden_nodes_layer1 =  (number_input_features + number_output_neurons)//2

In [22]:
# Second hidden layer width: floor of the midpoint between the first hidden width and the output width
hidden_nodes_layer2 =  (hidden_nodes_layer1 + number_output_neurons)//2

In [23]:
# Add the first hidden layer (ReLU); input_dim declares the number of input features
nn.add(Dense(units= hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

In [24]:
# Add the second hidden layer (ReLU)
nn.add(Dense(units= hidden_nodes_layer2, activation="relu"))

In [25]:
# Add the output layer: sigmoid activation, pairing with the binary_crossentropy loss used at compile time
nn.add(Dense(units=number_output_neurons, activation="sigmoid"))

In [26]:
# Print the layer-by-layer structure and parameter counts of the assembled model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 37)                2775      
                                                                 
 dense_1 (Dense)             (None, 19)                722       
                                                                 
 dense_2 (Dense)             (None, 1)                 20        
                                                                 
Total params: 3,517
Trainable params: 3,517
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Configure training: binary cross-entropy loss, Nadam optimizer, accuracy reported as the metric
nn.compile(
    optimizer="Nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [28]:
# Train on the scaled training split for 50 epochs and keep the returned training history
nn_fitted = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
# Score the trained model on the held-out scaled test split; verbose=2 prints one summary line
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)

22/22 - 0s - loss: 0.8391 - accuracy: 0.5660 - 170ms/epoch - 8ms/step


In [30]:
# Report the test-set loss and accuracy returned by nn.evaluate
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Loss: 0.8391186594963074, Accuracy: 0.5659824013710022
