# MatchPredictor

### A neural network which predicts the outcomes of Premier League football matches

#### Importing Libraries & Packages

In [2]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd

#### Hyperparameters

In [None]:
# Global hyperparameters shared by the cells below.
# TODO: also expose the number of hidden layers, the optimizer and the loss function here.
past = 5         # number of previous games considered when computing team-form averages
rseed = 1        # random seed (train() forwards its rseed argument to torch.manual_seed)
num_epoch = 100  # epoch count; NOTE(review): train() names its parameter num_epochs
lr = 0.01        # learning rate (load_MLP forwards its lr argument to the SGD optimizer)

#### Importing Match Data

In [None]:
# Each imported CSV becomes a pandas DataFrame object (one per season).
# Forward slashes are used as path separators: they work on Windows as well as
# POSIX systems, and they avoid backslash sequences such as "\e" being parsed
# as (invalid) string escapes.
data_20 = pd.read_csv("Prem_data_19-20/england-premier-league-matches-2019-to-2020-stats.csv")
data_19 = pd.read_csv("Prem_data_18-19/england-premier-league-matches-2018-to-2019-stats.csv")
data_18 = pd.read_csv("Prem_data_17-18/england-premier-league-matches-2017-to-2018-stats.csv")
data_17 = pd.read_csv("Prem_data_16-17/england-premier-league-matches-2016-to-2017-stats.csv")
data_16 = pd.read_csv("Prem_data_15-16/england-premier-league-matches-2015-to-2016-stats.csv")

# Seasons ordered from most recent (19/20) to oldest (15/16).
data = [data_20, data_19, data_18, data_17, data_16]

#### Preprocessing Data

In [None]:
# NOTE: the triple-quoted string below is an earlier prototype of the
# preprocessing loop that only handles the 19/20 season (season = data_20).
# Because it is wrapped in a string literal it is never executed; the live
# version, which iterates over every season, is in the preprocessing cell
# further down in this file.
'''
# Set up empty matrices to be filled with INPUTS

# Each row represents the inputs that describe a match
# The first "past" weeks from each season cannot be used in training/testing as they have no previous matches to get data from
# Note: "past" is an hyperparameter (integer)

in_20 = np.zeros((380-(past*10),12))
in_19 = np.zeros((380-(past*10),12))
in_18 = np.zeros((380-(past*10),12))
in_17 = np.zeros((380-(past*10),12))
in_16 = np.zeros((380-(past*10),12))


# Computing average goals scored per game over last "past" games
#for season in data:

season = data_20
week = past + 1

while week < 39:
    row_idx = (week-1)*10             # index of the first match of new week
    match_count = 0                   # counting the 10 matches played in a given week

    while match_count < 10:
       # match = season.iloc[[row_idx]]
        home_team = season.at[row_idx,'home_team_name']    # saving home team name
        total_home_goals = 0                               # counting the total goals scored by the home team over "past" matches
        previous = 1                                       # index used to keep track of how many past matches have been looked at

        while previous <= past:
    
            home_match_prev = season.loc[(season['home_team_name'] == home_team) & (season['Game Week'] == week-previous)]   # picking out home team's previous match
            hsc_idx = 12
    
            if (home_match_prev.size == 0):
                home_match_prev = season.loc[(season['away_team_name'] == home_team) & (season['Game Week'] == week-previous)]   # picking out home team's previous match
                hsc_idx = 13
            
            print(home_team, 'goals scored in week', week-previous, '= ', home_match_prev.iat[0,hsc_idx])
            total_home_goals += home_match_prev.iat[0,hsc_idx]
            previous += 1
        
        in_idx = row_idx - (past*10)
        in_20[in_idx][0] = total_home_goals/past
        row_idx += 1
        match_count += 1

    week += 1
        
        
print(in_20) 
''' 

'\n# Set up empty matrices to be filled with INPUTS\n\n# Each row represents the inputs that describe a match\n# The first "past" weeks from each season cannot be used in training/testing as they have no previous matches to get data from\n# Note: "past" is an hyperparameter (integer)\n\nin_20 = np.zeros((380-(past*10),12))\nin_19 = np.zeros((380-(past*10),12))\nin_18 = np.zeros((380-(past*10),12))\nin_17 = np.zeros((380-(past*10),12))\nin_16 = np.zeros((380-(past*10),12))\n\n\n# Computing average goals scored per game over last "past" games\n#for season in data:\n\nseason = data_20\nweek = past + 1\n\nwhile week < 39:\n    row_idx = (week-1)*10             # index of the first match of new week\n    match_count = 0                   # counting the 10 matches played in a given week\n\n    while match_count < 10:\n       # match = season.iloc[[row_idx]]\n        home_team = season.at[row_idx,\'home_team_name\']    # saving home team name\n        total_home_goals = 0                     

#### DATA PREPROCESSING  
  
Here we create the inputs for our model from the raw .csv files we collected from *footystats.org*.  
  
- Each season will be represented by a matrix.  
- Each row of this matrix will consist of the inputs that describe a single match to the neural net.  
  
In each row, the entries are as follows:  
*Note: **past** is an integer hyperparameter*  
  
**Index 0**: Home team average goals scored per game over last **past** games.  
**Index 1**:  Home team average goals conceded per game over last **past** games.  
**Index 2**:  Home team pre-match PPG.  
**Index 3**:  Home team ppg from last game (so current game isn’t included).  
**Index 4**:  Home team average number of shots on target over last **past** games.  
**Index 5**:  Home team average number of corners over last **past** games.  
**Index 6**:  Away team average goals scored per game over last **past** games.  
**Index 7**:  Away team average goals conceded per game over last **past** games.  
**Index 8**:  Away team pre-match PPG.  
**Index 9**:  Away team ppg from last game (so current game isn’t included).  
**Index 10**:  Away team average number of shots on target over last **past** games.  
**Index 11**:  Away team average number of corners over last **past** games.  
**Index 12 (LABEL)**:  0 if Home Team won, 1 if Away Team won, 2 if the match was a tie.  
  
  
*NOTE: The first "past" weeks from each season cannot be used in training/testing as they have no previous matches to get data from.*

In [None]:
# Set up empty matrices to be filled with INPUTS.
# Once filled with values, each row will represent the inputs that describe a match to the NN.
# The first "past" game weeks of every season are left out because they do not
# have "past" earlier matches to average over.

NUM_WEEKS = 38          # game weeks per season
MATCHES_PER_WEEK = 10   # matches played in each game week
NUM_INPUTS = 12         # input features per match

# Positional indices of the goal-count columns in the raw CSVs.
# NOTE(review): assumed to be "goals by the home side" (12) and "goals by the
# away side" (13) -- confirm against the CSV header.
HOME_GOALS_COL = 12
AWAY_GOALS_COL = 13

rows_per_season = (NUM_WEEKS - past) * MATCHES_PER_WEEK

in_20 = np.zeros((rows_per_season, NUM_INPUTS))
in_19 = np.zeros((rows_per_season, NUM_INPUTS))
in_18 = np.zeros((rows_per_season, NUM_INPUTS))
in_17 = np.zeros((rows_per_season, NUM_INPUTS))
in_16 = np.zeros((rows_per_season, NUM_INPUTS))

input_seasons = [in_20, in_19, in_18, in_17, in_16]

# Fill column 0 (home team's average goals scored over the last "past" games).
# This relies on each season's rows being ordered by game week with exactly
# MATCHES_PER_WEEK rows per week, because rows are addressed as (week-1)*10 + k.
for season, input_season in zip(data, input_seasons):
    # These column lookups do not change inside the loops, so fetch them once.
    home_names = season['home_team_name']
    away_names = season['away_team_name']
    game_week = season['Game Week']

    for week in range(past + 1, NUM_WEEKS + 1):
        first_row = (week - 1) * MATCHES_PER_WEEK   # index of the first match of this week

        for row_idx in range(first_row, first_row + MATCHES_PER_WEEK):
            home_team = season.at[row_idx, 'home_team_name']
            total_home_goals = 0   # goals scored by the home team over the last "past" matches

            for previous in range(1, past + 1):
                in_prev_week = game_week == week - previous

                # Look for the team's earlier match as the home side first ...
                home_match_prev = season.loc[(home_names == home_team) & in_prev_week]
                hsc_idx = HOME_GOALS_COL

                # ... and fall back to the match where it was the away side.
                if home_match_prev.size == 0:
                    home_match_prev = season.loc[(away_names == home_team) & in_prev_week]
                    hsc_idx = AWAY_GOALS_COL

                if home_match_prev.size == 0:
                    # Without this check .iat[0, ...] fails with an opaque IndexError.
                    raise ValueError(
                        f"No match found for {home_team!r} in game week {week - previous}"
                    )

                total_home_goals += home_match_prev.iat[0, hsc_idx]

            # Input rows start at week past+1, so shift the CSV row index accordingly.
            input_season[row_idx - past * MATCHES_PER_WEEK][0] = total_home_goals / past

print('19/20 season input matrix: \n', in_20, '\n')
print('18/19 season input matrix: \n', in_19, '\n')
print('17/18 season input matrix: \n', in_18, '\n')
print('16/17 season input matrix: \n', in_17, '\n')
print('15/16 season input matrix: \n', in_16, '\n')

19/20 season input matrix (row = match): 
 [[1.  0.  0.  ... 0.  0.  0. ]
 [1.2 0.  0.  ... 0.  0.  0. ]
 [1.2 0.  0.  ... 0.  0.  0. ]
 ...
 [0.8 0.  0.  ... 0.  0.  0. ]
 [1.4 0.  0.  ... 0.  0.  0. ]
 [2.  0.  0.  ... 0.  0.  0. ]] 

18/19 season input matrix (row = match): 
 [[1.4 0.  0.  ... 0.  0.  0. ]
 [0.6 0.  0.  ... 0.  0.  0. ]
 [0.6 0.  0.  ... 0.  0.  0. ]
 ...
 [1.6 0.  0.  ... 0.  0.  0. ]
 [1.  0.  0.  ... 0.  0.  0. ]
 [1.4 0.  0.  ... 0.  0.  0. ]] 

17/18 season input matrix (row = match): 
 [[0.8 0.  0.  ... 0.  0.  0. ]
 [1.2 0.  0.  ... 0.  0.  0. ]
 [0.4 0.  0.  ... 0.  0.  0. ]
 ...
 [0.4 0.  0.  ... 0.  0.  0. ]
 [1.2 0.  0.  ... 0.  0.  0. ]
 [1.2 0.  0.  ... 0.  0.  0. ]] 

16/17 season input matrix (row = match): 
 [[1.6 0.  0.  ... 0.  0.  0. ]
 [0.6 0.  0.  ... 0.  0.  0. ]
 [2.2 0.  0.  ... 0.  0.  0. ]
 ...
 [0.8 0.  0.  ... 0.  0.  0. ]
 [1.2 0.  0.  ... 0.  0.  0. ]
 [0.2 0.  0.  ... 0.  0.  0. ]] 

15/16 season input matrix (row = match): 
 [[1.4 0. 

## Multi-Layer Perceptron Model

In [11]:
import torch.nn as nn
import torch.nn.functional as F

class MultiLayerPerceptron(nn.Module):
    """Fully connected network with three 64-unit hidden layers and 3 outputs.

    The three outputs stand for the classes home win, away win and tie.
    Each output is passed through a sigmoid, so values lie in (0, 1).
    """

    def __init__(self, input_size):
        """Create the layers.

        Args:
            input_size: number of input features per match.
        """
        super().__init__()

        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 3)   # output is 3 classes for home win, away win, tie

    def forward(self, features):
        """Run a forward pass.

        Args:
            features: tensor whose last dimension has size ``input_size``.

        Returns:
            Tensor with last dimension 3 holding sigmoid-activated class scores.
        """
        hidden = F.relu(self.fc1(features))
        hidden = F.relu(self.fc2(hidden))
        hidden = F.relu(self.fc3(hidden))

        # No ReLU on the output layer: clamping the logits at 0 before the
        # sigmoid would restrict every output to [0.5, 1), so the network could
        # never approach a target of 0.
        return torch.sigmoid(self.fc4(hidden))

### Helper Functions

#### Get prediction from output

In [25]:
def get_preds(z):
    """Turn model outputs into one-hot predictions.

    Args:
        z: 2-D torch tensor; each row holds the per-class scores of one sample.

    Returns:
        NumPy array with the same shape as ``z`` containing a 1 at the index of
        each row's largest score and 0 everywhere else.
    """
    dims = z.size()
    out = np.zeros(dims)

    # Index of the largest score in every row.
    _, max_idxs = torch.max(z, 1)

    # Set the winning column of every row in one indexed assignment instead of
    # scanning all columns in Python.
    out[np.arange(dims[0]), max_idxs.cpu().numpy()] = 1
    return out

tensor([[0.4863, 0.2926, 0.3121, 0.6335, 0.7843, 0.7847],
        [0.5819, 0.0760, 0.6694, 0.9646, 0.5324, 0.1178],
        [0.9359, 0.9813, 0.5585, 0.9650, 0.0881, 0.8878],
        [0.7985, 0.7015, 0.7754, 0.6497, 0.5587, 0.9334],
        [0.7968, 0.7619, 0.1397, 0.7798, 0.8115, 0.7050]])
tensor([5, 3, 1, 5, 4])
[[0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]]


#### accuracy function

In [None]:
def accuracy(preds, labels):
    """Return the fraction of predictions that match their labels.

    Entries may be scalars (class indices) or vectors (e.g. one-hot rows); a
    vector entry counts as correct only when all of its elements match.

    Args:
        preds: sequence, NumPy array or torch tensor of predictions.
        labels: sequence, NumPy array or torch tensor of the same length.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when ``preds`` is empty.

    Raises:
        ValueError: if ``preds`` and ``labels`` differ in length.
    """
    # Bring tensors to NumPy so that every entry can be compared the same way.
    if torch.is_tensor(preds):
        preds = preds.detach().cpu().numpy()
    if torch.is_tensor(labels):
        labels = labels.detach().cpu().numpy()

    if len(preds) != len(labels):
        raise ValueError(
            f"preds and labels differ in length: {len(preds)} != {len(labels)}"
        )
    if len(preds) == 0:
        return 0.0   # avoid dividing by zero

    # np.array_equal handles scalars and vectors alike, so a one-hot row is
    # reduced to a single True/False instead of an element-wise array.
    correct = sum(
        1
        for pred, label in zip(preds, labels)
        if np.array_equal(np.asarray(pred), np.asarray(label))
    )
    return correct / len(preds)

#### validate

In [None]:
def validate(model, val_loader, loss_fcn):
    """Evaluate ``model`` once over ``val_loader``.

    Args:
        model: network to evaluate; called as ``model(inputs)``.
        val_loader: iterable yielding ``(inputs, labels)`` batches, where labels
            are integer class indices (0: home, 1: away, 2: tie).
        loss_fcn: loss called as ``loss_fcn(input=..., target=...)`` with the
            model output and the one-hot encoded labels as floats.

    Returns:
        Tuple ``(ValAcc, ValLoss)``: per-batch accuracy and loss, each averaged
        over all batches.

    Raises:
        ValueError: if ``val_loader`` yields no batches.
    """
    val_acc = []
    val_loss = []

    # Evaluate in eval mode and restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for evaluation and no parameters are updated.
        with torch.no_grad():
            for inputs, labels in val_loader:
                z = model(inputs)

                # Fix the one-hot width to the number of model outputs; otherwise
                # a batch that lacks the highest class yields a narrower target.
                targets = F.one_hot(labels, num_classes=z.size(1)).float()
                loss = loss_fcn(input=z, target=targets)

                # Compare class indices so each sample gives one correct/incorrect.
                preds = get_preds(z)
                val_acc.append(accuracy(preds.argmax(axis=1).tolist(), labels.tolist()))
                val_loss.append(loss.item())
    finally:
        model.train(was_training)

    if not val_loss:
        raise ValueError("val_loader yielded no batches")

    ValLoss = sum(val_loss) / len(val_loss)
    ValAcc = sum(val_acc) / len(val_acc)

    return ValAcc, ValLoss

#### MLP loader

In [None]:
def load_MLP(lr):
    """Build a fresh MLP together with its optimizer and loss function.

    The network width is taken from the module-level ``input_size``.

    Args:
        lr: learning rate handed to the SGD optimizer.

    Returns:
        Tuple ``(model, optimizer, loss_fcn)``.
    """
    # TODO: add optimizer and loss function as hyperparameters
    net = MultiLayerPerceptron(input_size)
    criterion = nn.MSELoss()
    sgd = torch.optim.SGD(net.parameters(), lr=lr)
    return net, sgd, criterion

## Training Loop

In [None]:
def train(rseed, lr, num_epochs):
    """Train a fresh MLP and plot/print per-epoch accuracy and loss.

    Reads the module-level ``train_loader`` and ``val_iter`` (iterables of
    ``(inputs, labels)`` batches with integer class labels) and calls the
    module-level ``plot`` helper; none of them are defined in this function.

    Args:
        rseed: seed passed to ``torch.manual_seed``.
        lr: learning rate passed to ``load_MLP``.
        num_epochs: number of passes over ``train_loader``.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    torch.manual_seed(rseed)

    model, optimizer, loss_fcn = load_MLP(lr)   # initialize model

    # Records for plotting: exactly one entry per epoch, so that they line up
    # with the epoch axis used below.
    TrainAccRec = []
    TrainLossRec = []
    ValAccRec = []
    ValLossRec = []

    # ======================================== TRAINING LOOP ========================================= #
    for epoch in range(num_epochs):
        batch_accs = []
        batch_losses = []

        for inputs, labels in train_loader:
            optimizer.zero_grad()   # reset the gradients to zero

            z = model(inputs)       # one vector of class scores per sample

            # Convert labels 0:home 1:away 2:tie to vectors, e.g. 0->[1,0,0].
            # The width is fixed to the number of model outputs so that a batch
            # lacking the highest class still yields a matching target shape.
            targets = F.one_hot(labels, num_classes=z.size(1)).float()

            loss = loss_fcn(input=z, target=targets)
            loss.backward()         # get gradients
            optimizer.step()        # update parameters

            # Compare class indices so each sample gives one correct/incorrect.
            preds = get_preds(z.detach())
            batch_accs.append(accuracy(preds.argmax(axis=1).tolist(), labels.tolist()))
            batch_losses.append(loss.item())

        if not batch_losses:
            raise ValueError("train_loader yielded no batches")

        # Validate once per epoch rather than after every minibatch.
        val_acc, val_loss = validate(model, val_iter, loss_fcn)

        # Add the epoch averages to the overall records.
        TrainAccRec.append(sum(batch_accs) / len(batch_accs))
        TrainLossRec.append(sum(batch_losses) / len(batch_losses))
        ValAccRec.append(val_acc)
        ValLossRec.append(val_loss)

    # Plotting
    e = np.arange(0, num_epochs)
    plot(e, TrainAccRec, ValAccRec, 'Epochs', 'Accuracy')
    plot(e, TrainLossRec, ValLossRec, 'Epochs', 'Losses')
    print("Max training accuracy", max(TrainAccRec))
    print("Max Validation accuracy", max(ValAccRec))
    print("Min training loss", min(TrainLossRec))
    print("Min Validation loss", min(ValLossRec))