In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [40]:
# Load the match results: one row per match with Season, HomeTeam, AwayTeam
# and the goals for each side (FTHG for the home team, FTAG for the away team).
data = pd.read_csv(filepath_or_buffer='results.csv')

In [41]:
# Preview the first five matches.
data.head(n=5)

Unnamed: 0,Season,HomeTeam,AwayTeam,FTHG,FTAG
0,1993-94,Arsenal,Coventry,0,3
1,1993-94,Aston Villa,QPR,4,1
2,1993-94,Chelsea,Blackburn,1,2
3,1993-94,Liverpool,Sheffield Weds,2,0
4,1993-94,Man City,Leeds,1,1


# Data Cleaning

In [42]:
# Remove leading/trailing whitespace from both team-name columns in one pass.
data[['HomeTeam', 'AwayTeam']] = data[['HomeTeam', 'AwayTeam']].apply(
    lambda names: names.str.strip()
)

In [43]:
# Check the column dtypes: the goal columns must be numeric before they are summed below.
data.dtypes

Season      object
HomeTeam    object
AwayTeam    object
FTHG         int64
FTAG         int64
dtype: object

## Creating a new column: Total Goals 

In [44]:
# Total goals in a match = home goals + away goals.
data['TotalGoals'] = data['FTHG'].add(data['FTAG'])

In [45]:
# Display the frame to check the new TotalGoals column (pandas truncates long frames in the output).
data

Unnamed: 0,Season,HomeTeam,AwayTeam,FTHG,FTAG,TotalGoals
0,1993-94,Arsenal,Coventry,0,3,3
1,1993-94,Aston Villa,QPR,4,1,5
2,1993-94,Chelsea,Blackburn,1,2,3
3,1993-94,Liverpool,Sheffield Weds,2,0,2
4,1993-94,Man City,Leeds,1,1,2
...,...,...,...,...,...,...
11108,2021-22,Aston Villa,Tottenham,0,4,4
11109,2021-22,Brentford,West Ham,2,0,2
11110,2021-22,Leicester,Crystal Palace,2,1,3
11111,2021-22,Norwich,Burnley,2,0,2


# Building the Model

In [46]:
from scipy.stats import poisson

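## Calculate Team Strength

Each team's strength is its average number of goals scored and conceded per match, pooled over its home and away fixtures.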

In [47]:
# Two views of every match: one keyed by the home team, one keyed by the away team.
# Both keep the home (FTHG) and away (FTAG) goal columns.
data_home = data.loc[:, ['HomeTeam', 'FTHG', 'FTAG']]
data_away = data.loc[:, ['AwayTeam', 'FTHG', 'FTAG']]

In [48]:
# Re-label the goal columns from each team's own point of view:
# for the home side FTHG is what it scored, for the away side FTHG is what it conceded.
data_home = data_home.rename(
    {'HomeTeam': 'Team', 'FTHG': 'GoalsScored', 'FTAG': 'GoalsConceded'},
    axis='columns',
)
data_away = data_away.rename(
    {'AwayTeam': 'Team', 'FTHG': 'GoalsConceded', 'FTAG': 'GoalsScored'},
    axis='columns',
)

In [49]:
# Stack the home and away rows, then average per team: the result holds each
# team's mean goals scored and mean goals conceded per match.
data_team_strength = (
    pd.concat(objs=[data_home, data_away], ignore_index=True)
    .groupby(by='Team')
    .mean()
)
data_team_strength

Unnamed: 0_level_0,GoalsScored,GoalsConceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Arsenal,1.779492,0.99637
Aston Villa,1.211325,1.319515
Barnsley,0.973684,2.157895
Birmingham,1.026316,1.353383
Blackburn,1.313456,1.316514
Blackpool,1.447368,2.052632
Bolton,1.163968,1.508097
Bournemouth,1.268421,1.736842
Bradford,0.894737,1.815789
Brentford,1.21875,1.5


## Function predict_winner

In [50]:
def _outcome_probabilities(lambda_home, lambda_away, max_goals=10):
    """Return (home win, away win, draw) probabilities for two Poisson scoring rates.

    Each side's goal count is modelled as an independent Poisson variable.
    Only scorelines with 0..max_goals goals per side are considered, so the
    three probabilities sum to slightly less than 1.
    """
    goals = np.arange(max_goals + 1)
    # score_grid[x, y] = P(home scores x) * P(away scores y)
    score_grid = np.outer(poisson.pmf(goals, lambda_home), poisson.pmf(goals, lambda_away))
    prob_home = np.tril(score_grid, k=-1).sum()  # x > y: strictly below the diagonal
    prob_away = np.triu(score_grid, k=1).sum()   # x < y: strictly above the diagonal
    prob_draw = np.trace(score_grid)             # x == y: the diagonal
    return prob_home, prob_away, prob_draw


def predict_winner(home, away, max_goals=10):
    """Print the most likely outcome of a match between ``home`` and ``away``.

    The expected goals of each side are taken from ``data_team_strength``:
    a side's mean goals scored multiplied by its opponent's mean goals
    conceded. Those two rates feed independent Poisson distributions, and the
    probabilities of every scoreline up to ``max_goals`` goals per side are
    summed into home-win, away-win and draw probabilities.

    Parameters
    ----------
    home, away : str
        Team names; both must be present in ``data_team_strength.index``.
    max_goals : int, default 10
        Highest number of goals per side included in the scoreline grid.

    Returns
    -------
    None
        When both teams are known; the verdict is printed.
    tuple
        ``(0, 0)`` when either team is unknown; a message naming the missing
        team(s) is printed first.
    """
    # NOTE(review): the rates are raw products of per-match averages and are not
    # divided by a league-wide average — confirm this scaling is intended.
    unknown = [team for team in (home, away) if team not in data_team_strength.index]
    if unknown:
        print('No strength data for: {}'.format(', '.join(map(str, unknown))))
        return (0, 0)

    lambda_home = data_team_strength.at[home, 'GoalsScored'] * data_team_strength.at[away, 'GoalsConceded']
    lambda_away = data_team_strength.at[away, 'GoalsScored'] * data_team_strength.at[home, 'GoalsConceded']
    prob_home, prob_away, prob_draw = _outcome_probabilities(lambda_home, lambda_away, max_goals)

    # Exact ties are resolved in the order home, away, draw.
    most_likely = max(prob_home, prob_away, prob_draw)
    if most_likely == prob_home:
        print('{} has a better chance of winning this match.'.format(home))
    elif most_likely == prob_away:
        print('{} has a better chance of winning this match.'.format(away))
    else:
        print('Draw')
        

### Testing function

In [51]:
# Smoke-test the predictor on a single fixture.
predict_winner(home='Arsenal', away='Brighton')

Arsenal has a better chance of winning this match.
