# MatchPredictor

### A neural network which predicts the outcomes of Premier League football matches

#### Importing Libraries & Packages

In [19]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.utils.data 

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

import pandas as pd
import matplotlib.pyplot as plt


#### Hyperparameters

In [20]:
past = 5    # How many past games are taken into account for team form calculations
rseed = 1 
batch_size = 32
num_epochs = 50
# fc1_size = 32
# fc2_size = 32
# fc3_size = 20
# out_size = 3
input_size = 12
h_sizes  = [12,32,32,20,3]
act="relu"
            #add num hidden layers, optimizer loss fcn
lr = 0.01 

#### Importing Match Data

In [21]:
# Imported CSV becomes a pandas dataframe object

data_20 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/MI/FINAL PROJECT/england-premier-league-matches-2019-to-2020-stats.csv")
data_19 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/MI/FINAL PROJECT/england-premier-league-matches-2018-to-2019-stats.csv")
data_18 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/MI/FINAL PROJECT/england-premier-league-matches-2017-to-2018-stats.csv")
data_17 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/MI/FINAL PROJECT/england-premier-league-matches-2016-to-2017-stats.csv")
data_16 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/MI/FINAL PROJECT/england-premier-league-matches-2015-to-2016-stats.csv")
data_15 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/MI/FINAL PROJECT/england-premier-league-matches-2014-to-2015-stats.csv")
data_14 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/MI/FINAL PROJECT/england-premier-league-matches-2013-to-2014-stats.csv")
data_13 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/MI/FINAL PROJECT/england-premier-league-matches-2012-to-2013-stats.csv")


data = [data_20, data_19, data_18, data_17, data_16, data_15, data_14, data_13]

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### DATA PREPROCSESSING  
  
Here we create the inputs for our model from the raw .csv files we collected from *footystats.org*.  
  
- Each season will be represented by a matrix.  
- Each row of this matrix will consist of the inputs that describe a single match to the neural net.  
  
In each row, the entries are as follows:  
*Note: **past** is an integer hyperparameter*  
  
**Index 0**: Home team average goals scored per game over last **past** games.  
**Index 1**:  Home team average goals conceded per game over last **past** games.  
**Index 2**:  Home team pre-match PPG.  
**Index 3**:  Home team ppg from last game (so current game isn’t included).  
**Index 4**:  Home team average number of shots on target over last **past** games.  
**Index 5**:  Home team average number of corners over last **past** games.  
**Index 6**:  Away team average goals scored per game over last **past** games.  
**Index 7**:  Away team average goals conceded per game over last **past** games.  
**Index 8**:  Away team pre-match PPG.  
**Index 9**:  Away team ppg from last game (so current game isn’t included).  
**Index 10**:  Away team average number of shots on target over last **past** games.  
**Index 11**:  Away team average number of corners over last **past** games.  
**Index 12 (LABEL)**:  0 if Home Team won, 1 if Away Team won, 2 if Draw. 
  
  
*NOTE: The first "past" weeks from each season cannot be used in training/testing as they have no previous matches to get data from.*

In [23]:
# Set up empty matrices to be filled with INPUTS.
# Once filled with values, each row will represent the inputs that describe a match to the NN.
    
in_20 = np.zeros((380-(past*10),13))
in_19 = np.zeros((380-(past*10),13))
in_18 = np.zeros((380-(past*10),13))
in_17 = np.zeros((380-(past*10),13))
in_16 = np.zeros((380-(past*10),13))
in_15 = np.zeros((380-(past*10),13))
in_14 = np.zeros((380-(past*10),13))
in_13 = np.zeros((380-(past*10),13))


input_seasons = [in_20, in_19, in_18, in_17, in_16, in_15, in_14, in_13]


season_idx = 0
for season in data:
    
    week = past + 1
    input_season = input_seasons[season_idx]
    
    
    while week < 39:
        
        row_idx = (week-1)*10             # index of the first match of new week
        match_count = 0                   # counting the 10 matches played in a given week

        
        while match_count < 10:
            
            match = season.iloc[[row_idx]]                     # saving current match
            
             # getting match LABEL (match result)
            if (match.iat[0,12] > match.iat[0,13]):            
                    result = 0   # home team win
            elif (match.iat[0,12] < match.iat[0,13]):
                    result = 1   # away team win
            else:
                    result = 2   # draw
                    
            
            home_team = season.at[row_idx,'home_team_name']    # saving home team name
            away_team = season.at[row_idx,'away_team_name']    # saving away team name
            
            total_home_goals = 0                               # counts total goals scored by home team over "past" matches
            total_away_goals = 0                               # counts total goals scored by away team over "past" matches
            
            total_home_conceded = 0                            # counts total goals scored against home team over "past" matches 
            total_away_conceded = 0                            # counts total goals scored against away team over "past" matches
            
            total_home_shotson = 0                             # counts total shots on target taken by home team over...
            total_away_shotson = 0                             # counts total shots on target taken by away team over...
            
            total_home_corners = 0                             # counts total corners taken by home team over...
            total_away_corners = 0                             # counts total corners taken by away team over...
            
            home_PPG_pre = match.iat[0,8]                      # home team pre-match points per game (PPG)
            away_PPG_pre = match.iat[0,9]                      # away team pre-match points per game (PPG)
            
            
            previous = 1                                       # counts up to "past"
            
            
            while previous <= past:

                home_match_prev = season.loc[(season['home_team_name'] == home_team) & (season['Game Week'] == week-previous)]   # picking out home team's previous match
                h_sc_idx = 12       # home team score index
                h_shon_idx = 32     # home shots on target index
                h_corn_idx = 20     # home corners index
                h_ppg_idx = 10
                
                away_match_prev = season.loc[(season['away_team_name'] == away_team) & (season['Game Week'] == week-previous)]   # picking out away team's previous match
                a_sc_idx = 13       # away team score index
                a_shon_idx = 33     # away team shots on target index
                a_corn_idx = 21     # away corners index
                a_ppg_idx = 11
                
                # if home team name was not found in 'home_team_name' column...
                if (home_match_prev.size == 0):
                    home_match_prev = season.loc[(season['away_team_name'] == home_team) & (season['Game Week'] == week-previous)]   # picking out home team's previous match
                    h_sc_idx = 13      # home team score index
                    h_shon_idx = 33    # home shots on target index
                    h_corn_idx = 21    # home corners index
                    h_ppg_idx = 11
                    
                # if away team name was not found in 'away_team_name' column...
                if (away_match_prev.size == 0):
                    away_match_prev = season.loc[(season['home_team_name'] == away_team) & (season['Game Week'] == week-previous)]   # picking out away team's previous match
                    a_sc_idx = 12      # away team score index
                    a_shon_idx = 32    # away team shots on target index
                    a_corn_idx = 20    # away corners index
                    a_ppg_idx = 10
                    
                # if loop is 1 match in the past...   
                if previous == 1:
                    home_PPG = home_match_prev.iat[0,h_ppg_idx]
                    away_PPG = away_match_prev.iat[0,a_ppg_idx]
                    
                #print(home_team, 'goals scored in week', week-previous, '= ', home_match_prev.iat[0,h_sc_idx])
                total_home_goals += home_match_prev.iat[0,h_sc_idx]
                total_away_goals += away_match_prev.iat[0,a_sc_idx]
                
                #print(home_team, 'goals conceded in week', week-previous, '=', home_match_prev.iat[0,a_sc_idx])
                total_home_conceded += home_match_prev.iat[0,a_sc_idx]
                total_away_conceded += away_match_prev.iat[0,h_sc_idx]
                
                total_home_shotson += home_match_prev.iat[0,h_shon_idx]
                total_away_shotson += away_match_prev.iat[0,a_shon_idx]
                
                total_home_corners += home_match_prev.iat[0,h_corn_idx]
                total_away_corners += away_match_prev.iat[0,a_corn_idx]
                
                
                previous += 1

            in_idx = row_idx - (past*10)
            input_season[in_idx][0] = total_home_goals/past          # input INDEX 0 (home team avg. goals over "past")
            input_season[in_idx][1] = total_home_conceded/past       # input INDEX 1 (home team avg. conceded goals over "past")
            input_season[in_idx][2] = home_PPG_pre                   # input INDEX 2 (home team pre-match PPG: PPG in current season)
            input_season[in_idx][3] = home_PPG                       # input INDEX 3 (home team PPG including past seasons)
            input_season[in_idx][4] = total_home_shotson/past        # input INDEX 4 (home team avg. shots on target over "past")
            input_season[in_idx][5] = total_home_corners/past        # input INDEX 5 (home team avg. corner kicks over "past")
            input_season[in_idx][6] = total_away_goals/past          # input INDEX 6 (away team avg. goals over "past")
            input_season[in_idx][7] = total_away_conceded/past       # input INDEX 7 (away team avg. conceded goals over "past")
            input_season[in_idx][8] = away_PPG_pre                   # input INDEX 8 (away team pre-match PPG: PPG in current season)
            input_season[in_idx][9] = away_PPG                       # input INDEX 9 (away team PPG including past seasons)
            input_season[in_idx][10] = total_away_shotson/past       # input INDEX 10 (away team avg. shots on target over "past")
            input_season[in_idx][11] = total_away_corners/past       # input INDEX 11 (away team avg. corner kicks over "past")
            input_season[in_idx][12] = result                        # label INDEX 12 (match result)
            
            #print(input_season[in_idx])
            
            row_idx += 1
            match_count += 1

        week += 1
        
    season_idx += 1
      
        
print('19/20 season input matrix: \n', in_20, '\n')
print('18/19 season input matrix: \n', in_19, '\n')
print('17/18 season input matrix: \n', in_18, '\n')
print('16/17 season input matrix: \n', in_17, '\n')
print('15/16 season input matrix: \n', in_16, '\n')
print('14/15 season input matrix: \n', in_15, '\n')
print('13/14 season input matrix: \n', in_14, '\n')
print('12/13 season input matrix: \n', in_13, '\n')

19/20 season input matrix: 
 [[1.   1.2  0.5  ... 5.2  4.8  1.  ]
 [1.2  0.8  2.   ... 6.4  6.4  0.  ]
 [1.2  1.4  1.5  ... 6.   3.   0.  ]
 ...
 [0.8  1.4  1.44 ... 8.   9.4  1.  ]
 [1.4  0.8  1.   ... 3.8  6.2  0.  ]
 [2.   0.8  1.17 ... 5.   5.2  2.  ]] 

18/19 season input matrix: 
 [[ 1.4   2.    1.5  ...  5.8   5.    2.  ]
 [ 0.6   2.    0.   ...  5.    5.4   0.  ]
 [ 0.6   1.2   0.5  ... 10.2   8.4   1.  ]
 ...
 [ 1.6   2.2   1.22 ...  3.4   3.8   2.  ]
 [ 1.    0.8   2.06 ...  5.8   8.    2.  ]
 [ 1.4   1.6   1.5  ...  5.6   6.    1.  ]] 

17/18 season input matrix: 
 [[0.8  1.2  3.   ... 7.8  9.   1.  ]
 [1.2  1.   1.5  ... 5.2  6.4  2.  ]
 [0.4  2.   1.5  ... 3.6  4.2  0.  ]
 ...
 [0.4  1.8  1.17 ... 4.4  2.6  1.  ]
 [1.2  1.2  2.22 ... 3.2  7.2  0.  ]
 [1.2  2.   1.33 ... 3.6  3.2  0.  ]] 

16/17 season input matrix: 
 [[1.6  2.   1.5  ... 5.2  4.6  0.  ]
 [0.6  1.2  1.5  ... 7.8  5.8  0.  ]
 [2.2  1.2  3.   ... 3.4  4.   0.  ]
 ...
 [0.8  0.6  1.33 ... 5.   5.2  1.  ]
 [1.2

In [41]:
# Joining all season input data into one dataset
data_full_np = np.concatenate((in_20, in_19, in_18, in_17, in_16, in_15, in_14, in_13), axis=0)

# Turn numpy array back into pd.DataFrame
columns = ['h_goals','h_conceded','h_prePPG','h_avgPPG','h_shotsOn','h_corners',
           'a_goals','a_conceded','a_prePPG','a_avgPPG','a_shotsOn','a_corners',
           'outcome']

data_full = pd.DataFrame(data_full_np, columns=columns)

#Print 'outcome' column sums to determine dataset balance
print('The numbers of match outcomes (Home Team Wins, Away Team Wins, Draws):', data_full["outcome"].value_counts())

#============================ BALANCE THE DATASET ==========================#
#since away wins and ties are roughly the same->want to remove some samples from the over represented class
#home wins count =away win count

data_full_home  = data_full[data_full["outcome"]==0.0]
data_full_away = data_full[data_full["outcome"]==1.0]
data_full_tie = data_full[data_full["outcome"]==2.0]
# print(data_full_home["outcome"].value_counts(),'\n')
# print(data_full_away["outcome"].value_counts(),'\n')
# print(data_full_tie["outcome"].value_counts(),'\n')

data_full_home = data_full_home.sample(len(data_full_away), random_state=0)
# print(data_full_home["outcome"].value_counts(),'\n')
# print(data_full_away["outcome"].value_counts(),'\n')
# print(data_full_tie["outcome"].value_counts(),'\n')

subsets = [data_full_home,data_full_away,data_full_tie]
data_full = pd.concat(subsets)
print(data_full["outcome"].value_counts(),'\n')

# Get a better idea of what our data looks like BEFORE NORMALIZATION
def verbose_print(data):     # helper function
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(data.head())
       
# print("\n \n Data characteristics:")
# verbose_print(data_full.describe())

# Turn 'outcome' column values into one hot-encoded form
lb = LabelBinarizer()
y = lb.fit_transform(data_full_np[:,12])   # y is vector of 1-hot encoded labels

#X = data_full_np[:,:12]                    # x is matrix of inputs (need to be normalized still)

# Normalize continuous inputs
X = data_full.drop(columns=['outcome'])

for feature in X:
  mean = X[feature].mean()
  std = X[feature].std()
  X[feature] = X[feature] - mean
  X[feature] = X[feature]/std

X = X.values
X = train_test_split(X,test_size=0.2,random_state=1)
y = train_test_split(y,test_size=0.2,random_state=1)

X_train = X[0]  # extracting training inputs
y_train = y[0]  # extracting training labels
X_infer = train_test_split(X[1],test_size = 0.33,random_state=1) # splitting inputs into validation and test sets
y_infer = train_test_split(y[1],test_size = 0.33,random_state=1) # splitting labels into validation and test sets

The numbers of match outcomes (Home Team Wins, Away Team Wins, Draws): 0.0    1228
1.0     788
2.0     624
Name: outcome, dtype: int64
1.0    788
0.0    788
2.0    624
Name: outcome, dtype: int64 



## Multi-Layer Perceptron Model

In [42]:
class MultiLayerPerceptron(nn.Module):

    def __init__(self, input_size,h_sizes,act):
      #current parameters are:
        #input size = matrix size
        #h_sizes = list of each layer size ->ex: [12,32,32,20,3]
        #MAKE SURE LAST HID_LAYER OUTPUTS TO 3
        #act_fcn = ?
        super(MultiLayerPerceptron, self).__init__()
        # self.hidden = nn.ModuleList()
        # for k in range(len(h_sizes)-2):
        #     self.hidden.append(nn.Linear(h_sizes[k], h_sizes[k+1]))

        if act =='relu':
          self.act = nn.ReLU()
        elif act == 'sigmoid':  
          self.act = nn.sigmoid()
        self.out = nn.Softmax()


        self.fc1 = nn.Linear(input_size,32)
        self.fc2 = nn.Linear(32,32)
        self.fc3 = nn.Linear(32,20)
        self.fc4 = nn.Linear(20,3)   #output is 3 classes for home win, away win, tie

    def forward(self, x):
      # for i in self.hidden:
      #   x = self.act(self.hidden[i])
      # x =  self.out(x)

      x = F.relu(self.fc1(x))
      x = F.relu(self.fc2(x))
      x = F.relu(self.fc3(x))
      x = torch.sigmoid(self.fc4(x))
      return x

### Helper Functions

#### Dataloader creation


In [26]:
# MatchDataset turns matrix-style datasets into map-style datasets
class MatchDataset(torch.utils.data.Dataset):

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index],self.y[index]
    

def load_data(batch_size):

    train_dataset = MatchDataset(X_train,y_train)
    valid_dataset = MatchDataset(X_infer[0],y_infer[0])
    test_dataset = MatchDataset(X_infer[1],y_infer[1])

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True) 

    return train_loader, valid_loader, test_loader

train_loader, valid_loader, test_loader = load_data(batch_size)





#### Get prediction from output

In [27]:
def get_preds(z):
    
  out = np.zeros(z.shape)
  #inputs: output of model
  #outputs: corresponding output to prediction
  max_idxs = torch.max(z,1)[1]
  for entry,idx in enumerate(max_idxs,0):
      for val in range(3):
        if val == idx:
          out[entry][val] = 1

  return out

#### Accuracy function

In [28]:
def accuracy(preds, labels):
    #inputs: preds: array, labels:array 
    #output: overall accuracy
    
    correct = 0
    for batch in range(len(preds)):
        for i in range(3):
            if (preds[batch][i] == 1) and (labels[batch][i] == 1):
                correct += 1     
    #for i in range(len(preds)):
     #   if preds[i] == labels[i].numpy():
        #  correct +=1
    return (correct/len(preds))

#### Validate 

In [29]:
def validate(model,val_loader,loss_fcn):
  val_acc=[]
  val_loss=[]
  for epoch in range(0,num_epochs):
    for i,data in enumerate(val_loader,0): #iterate over val_loader, start idx=0
      inputs,labels = data

      z = model(inputs.float())  #z is size=3 

      preds = get_preds(z) #preds used for accuracy 

      loss = loss_fcn(input=z, target=labels.float())

      val_acc.append(accuracy(preds,labels))
      val_loss.append(loss.item())

  ValLoss = sum(val_loss)/len(val_loss)
  ValAcc = sum(val_acc) / len(val_acc)
  
  return ValAcc, ValLoss

#### plotting function

In [30]:
def plot(epoch, train_var, val_var,xlabel,ylabel):
    plt.plot(epoch, train_var, label = 'train')
    plt.plot(epoch, val_var, label = 'validation')
    plt.title(str(xlabel) + ' vs. '+str(ylabel))
    plt.xlabel(str(xlabel))
    plt.ylabel(str(ylabel))
    plt.legend()
    plt.show()

#### MLP loader

In [31]:
def load_MLP(lr):
  #add optimizer,loss functions as a hyperparameters
  model = MultiLayerPerceptron(input_size,h_sizes,act)
  optimizer = torch.optim.SGD(model.parameters(),lr=lr)
  loss_fcn = nn.MSELoss()
  return model,optimizer,loss_fcn

##Training Loop

In [None]:
def train(rseed,lr,num_epochs):

    torch.manual_seed(rseed)

    model,optimizer,loss_fcn = load_MLP(lr)   #initialize model

    #records for plotting 
    TrainAccRec = []
    TrainLossRec = []
    ValAccRec = []  
    ValLossRec = []
 
# ========================================TRAINING LOOP =========================================# 
    for epoch in range(0,num_epochs):
        batch_count = 0
        train_acc_sum = 0
        train_loss_sum = 0
        for i,data in enumerate(train_loader,0): #iterate over train_loader, start idx=0
            inputs,labels = data

            optimizer.zero_grad()  #initialize the gradients to zero  

            z = model(inputs.float())  #z is size=3 

            preds = get_preds(z) #preds used for accuracy 
            print(preds)

            loss = loss_fcn(input=z, target=labels.float())

            loss.backward() #get gradients 

            optimizer.step() #update parameters

            train_acc = accuracy(preds,labels)
            train_loss = loss.item()

            #add to overall records
            train_acc_sum += train_acc
            train_loss_sum += train_loss
            
            batch_count += 1
    

        TrainAccRec.append(train_acc_sum/batch_count)
        TrainLossRec.append(train_loss/batch_count)
        val_acc, val_loss = validate(model, valid_loader,loss_fcn)
        ValAccRec.append(val_acc)
        ValLossRec.append(val_loss)

        print("Epoch:",epoch+1)
        print("train acc:",train_acc)
        print("val acc:",val_acc)

    

    #plottting
    e = np.arange(0,num_epochs)
    plot(e,TrainAccRec,ValAccRec,'Epochs','Accuracy')
    plot(e,TrainLossRec,ValLossRec,'Epochs','Losses')
    print("Max training accuracy",max(TrainAccRec))
    print("Max Validation accuracy",max(ValAccRec))
    print("Min training loss",min(TrainLossRec))
    print("Min Validation loss",min(ValLossRec))


train(rseed,lr,num_epochs)  