# Predict Football Match Result

In [1]:
import sqlite3
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, accuracy_score

In [2]:
# Database connection
database = sqlite3.connect("database.sqlite")

In [3]:
# I chose English Premiere League
matches = pd.read_sql_query("SELECT season, date, home_team_api_id,away_team_api_id,home_team_goal,away_team_goal FROM Match WHERE league_id is 1729", database)

In [4]:
matches.head()

Unnamed: 0,season,date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal
0,2008/2009,2008-08-17 00:00:00,10260,10261,1,1
1,2008/2009,2008-08-16 00:00:00,9825,8659,1,0
2,2008/2009,2008-08-16 00:00:00,8472,8650,0,1
3,2008/2009,2008-08-16 00:00:00,8654,8528,2,1
4,2008/2009,2008-08-17 00:00:00,10252,8456,4,2


In [5]:
matches.shape

(3040, 6)

## Taking Care of Missing Data

In [6]:
# There is no missing data
matches.isnull().sum()

season              0
date                0
home_team_api_id    0
away_team_api_id    0
home_team_goal      0
away_team_goal      0
dtype: int64

## Statistical Summary 

In [7]:
print("Average of home team goal: {}".format(matches["home_team_goal"].mean()))
print("Average of away team goal: {}".format(matches["away_team_goal"].mean()))
print("Maximum goals scored by the home team: {}".format(matches["home_team_goal"].max()))
print("Maximum goals scored by the away team: {}".format(matches["away_team_goal"].max()))

Average of home team goal: 1.550986842105263
Average of away team goal: 1.1595394736842106
Maximum goals scored by the home team: 9
Maximum goals scored by the away team: 6


## Data Preprocessing

In [8]:
def label_match(row):
    if row["home_team_goal"] > row["away_team_goal"]:
        return "W"
    elif row["home_team_goal"] < row["away_team_goal"]:
        return "L"
    else:
        return "D"

In [9]:
matches['final_result'] = matches.apply (lambda row: label_match(row), axis=1)

In [10]:
#Final result column added based on the score of the match.
matches

Unnamed: 0,season,date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,final_result
0,2008/2009,2008-08-17 00:00:00,10260,10261,1,1,D
1,2008/2009,2008-08-16 00:00:00,9825,8659,1,0,W
2,2008/2009,2008-08-16 00:00:00,8472,8650,0,1,L
3,2008/2009,2008-08-16 00:00:00,8654,8528,2,1,W
4,2008/2009,2008-08-17 00:00:00,10252,8456,4,2,W
...,...,...,...,...,...,...,...
3035,2015/2016,2015-10-17 00:00:00,8466,8197,2,2,D
3036,2015/2016,2015-10-19 00:00:00,10003,10194,0,1,L
3037,2015/2016,2015-10-17 00:00:00,8586,8650,0,0,D
3038,2015/2016,2015-10-17 00:00:00,9817,9825,0,3,L


In [11]:
#Some teams have played less match than others.
matches["home_team_api_id"].value_counts()

10260    152
8455     152
9825     152
8586     152
8650     152
10194    152
8456     152
8668     152
10252    152
8472     152
10261    133
8654     133
8659     133
9879     114
8528      95
10003     95
8559      76
8466      76
8655      76
8667      76
9850      76
10172     57
9826      57
8602      57
8658      38
8191      38
8462      38
8197      38
8483      19
9798      19
8549      19
8344      19
8678      19
9817      19
Name: home_team_api_id, dtype: int64

In [12]:
# What is the win rate of home team?
number_of_matches = matches.shape[0]
number_of_home_wins = (matches.final_result == "W").sum()

win_rate = (float(number_of_home_wins) / number_of_matches) * 100
                          

In [13]:
print("Total number of matches: {}".format(number_of_matches))
print("Number of matches won by home team: {}".format(number_of_home_wins))
print("Win rate: {}%".format(win_rate))

Total number of matches: 3040
Number of matches won by home team: 1390
Win rate: 45.723684210526315%


In [14]:
# Type of fields should be integer or float. Therefore I changed the type of result from object to int
matches.dtypes

season              object
date                object
home_team_api_id     int64
away_team_api_id     int64
home_team_goal       int64
away_team_goal       int64
final_result        object
dtype: object

In [15]:
del matches["season"]

In [16]:
matches

Unnamed: 0,date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,final_result
0,2008-08-17 00:00:00,10260,10261,1,1,D
1,2008-08-16 00:00:00,9825,8659,1,0,W
2,2008-08-16 00:00:00,8472,8650,0,1,L
3,2008-08-16 00:00:00,8654,8528,2,1,W
4,2008-08-17 00:00:00,10252,8456,4,2,W
...,...,...,...,...,...,...
3035,2015-10-17 00:00:00,8466,8197,2,2,D
3036,2015-10-19 00:00:00,10003,10194,0,1,L
3037,2015-10-17 00:00:00,8586,8650,0,0,D
3038,2015-10-17 00:00:00,9817,9825,0,3,L


In [17]:
matches["target"] = matches["final_result"].astype("category").cat.codes

In [18]:
matches

Unnamed: 0,date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,final_result,target
0,2008-08-17 00:00:00,10260,10261,1,1,D,0
1,2008-08-16 00:00:00,9825,8659,1,0,W,2
2,2008-08-16 00:00:00,8472,8650,0,1,L,1
3,2008-08-16 00:00:00,8654,8528,2,1,W,2
4,2008-08-17 00:00:00,10252,8456,4,2,W,2
...,...,...,...,...,...,...,...
3035,2015-10-17 00:00:00,8466,8197,2,2,D,0
3036,2015-10-19 00:00:00,10003,10194,0,1,L,1
3037,2015-10-17 00:00:00,8586,8650,0,0,D,0
3038,2015-10-17 00:00:00,9817,9825,0,3,L,1


In [19]:
# I choose Arsenal for training my models.
home_matches = matches[matches["home_team_api_id"] == 9825]
away_matches = matches[matches["away_team_api_id"] == 9825]
arsenal_matches = pd.concat([home_matches, away_matches])
arsenal_matches

Unnamed: 0,date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,final_result,target
1,2008-08-16 00:00:00,9825,8659,1,0,W,2
11,2008-10-29 00:00:00,9825,8586,4,4,D,0
30,2008-11-08 00:00:00,9825,10260,2,1,W,2
41,2008-11-15 00:00:00,9825,10252,0,2,L,1
71,2008-12-06 00:00:00,9825,8528,1,0,W,2
...,...,...,...,...,...,...,...
2965,2016-05-08 00:00:00,8456,9825,2,2,D,0
2985,2015-08-29 00:00:00,10261,9825,0,1,L,1
3002,2015-09-19 00:00:00,8455,9825,2,0,W,2
3010,2015-09-26 00:00:00,8197,9825,2,5,L,1


In [20]:
n_matches_arsenal = arsenal_matches.shape[0]
n_wins = (arsenal_matches.final_result == "W").sum()
win_rate_arsenal = (float(n_wins) / n_matches_arsenal) * 100

In [21]:
print("Total number of matches that Arsenal played: {}".format(n_matches_arsenal))
print("Number of matches won by Arsenal: {}".format(n_wins))
print("Win rate: {}%".format(win_rate_arsenal))

Total number of matches that Arsenal played: 304
Number of matches won by Arsenal: 137
Win rate: 45.06578947368421%


In [22]:
print("The home goal average of Arsenal scored: {}".format(arsenal_matches["home_team_goal"].mean()))
print("The away goal average of Arsenal scored: {}".format(arsenal_matches["away_team_goal"].mean()))
print("The maximum home goals scored by Arsenal: {}".format(arsenal_matches["home_team_goal"].max()))
print("The maximum away goals scored by Arsenal: {}".format(arsenal_matches["away_team_goal"].max()))

The home goal average of Arsenal scored: 1.6578947368421053
The away goal average of Arsenal scored: 1.2796052631578947
The maximum home goals scored by Arsenal: 8
The maximum away goals scored by Arsenal: 6


In [23]:
train, test = train_test_split(arsenal_matches, test_size=0.2, random_state=25)

In [24]:
print(f"Number of training examples: {train.shape[0]}")
print(f"Number of testing examples: {test.shape[0]}")

Number of training examples: 243
Number of testing examples: 61


## Choosing The Models

## Random Forrest Classifier

In [25]:
# I trained the all models based on Arsenal's past home and away mathes.
predictors = ["home_team_api_id", "away_team_api_id"]

In [26]:
rf = RandomForestClassifier(n_estimators=100, min_samples_split=10, random_state=1)
rf.fit(train[predictors], train["target"])
rf_prediction = rf.predict(test[predictors])
rf_acc = accuracy_score(test["target"], rf_prediction)
rf_score = precision_score(test["target"], rf_prediction, average="micro")

## Gradient Boosting Classifier

In [27]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gb.fit(train[predictors], train["target"])
gb_prediction = gb.predict(test[predictors])
gb_acc = accuracy_score(test["target"], gb_prediction)
gb_score = precision_score(test["target"], gb_prediction, average="micro")

## K-nearest Classifier

In [28]:
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(train[predictors], train["target"])
neigh_prediction = neigh.predict(test[predictors])
neigh_acc = accuracy_score(test["target"], neigh_prediction)
neigh_score = precision_score(test["target"], neigh_prediction, average="micro")

## Accuracy Results

In [29]:
print("Random Forrest Classifier Accuracy Score: {}%".format(rf_acc*100))
print("Gradient Boosting Classifier Accuracy Score: {}%".format(gb_acc*100))
print("K-nearest Classifier Accuracy Score: {}%".format(neigh_acc*100))

Random Forrest Classifier Accuracy Score: 52.459016393442624%
Gradient Boosting Classifier Accuracy Score: 54.09836065573771%
K-nearest Classifier Accuracy Score: 47.540983606557376%


## Precision Scores

In [30]:
print("Random Forrest Classifier Precision score: {}%".format(rf_score*100))
print("Gradient Boosting Classifier Precision score: {}%".format(gb_score*100))
print("K-nearest Classifier Precision score: {}%".format(neigh_score*100))

Random Forrest Classifier Precision score: 52.459016393442624%
Gradient Boosting Classifier Precision score: 54.09836065573771%
K-nearest Classifier Precision score: 47.540983606557376%


## Confusion Matrixes

In [31]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=rf_prediction))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,5,6
1,3,9,5
2,3,7,21


In [32]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=gb_prediction))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,7,5
1,1,11,5
2,3,7,21


In [33]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=neigh_prediction))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3,7,3
1,6,8,3
2,6,7,18
