In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pd.set_option("display.min_rows", 200)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# Premier League - England
premier_league_2016_2017 = pd.read_csv("https://www.football-data.co.uk/mmz4281/1617/E0.csv")
premier_league_2017_2018 = pd.read_csv("https://www.football-data.co.uk/mmz4281/1718/E0.csv")
premier_league_2018_2019 = pd.read_csv("https://www.football-data.co.uk/mmz4281/1819/E0.csv")
premier_league_2019_2020 = pd.read_csv("https://www.football-data.co.uk/mmz4281/1920/E0.csv")
premier_league_2020_2021 = pd.read_csv("https://www.football-data.co.uk/mmz4281/2021/E0.csv")
premier_league_2021_2022 = pd.read_csv("https://www.football-data.co.uk/mmz4281/2122/E0.csv")
premier_league_2022_2023 = pd.read_csv("https://www.football-data.co.uk/mmz4281/2223/E0.csv")

df = pd.concat([premier_league_2016_2017, premier_league_2017_2018, premier_league_2018_2019, premier_league_2019_2020, premier_league_2020_2021, premier_league_2021_2022, premier_league_2022_2023], axis=0, ignore_index=True)
#df = pd.concat([premier_league_2021_2022, premier_league_2022_2023], axis=0, ignore_index=True)
df["League"] = "Premier League"
df = df[["League", "Date", "HomeTeam", "AwayTeam", "B365H", "B365D", "B365A", "FTHG", "FTAG"]]
df.columns = ["league", "date", "home", "away", "home_odds", "draw_odds", "away_odds", "home_goals_ft", "away_goals_ft"]
df = df.astype({"date":"datetime64"})
df.dropna(inplace=True)
df = df.convert_dtypes(infer_objects=True)
df.reset_index(inplace=True, drop=True)
df.tail()

Unnamed: 0,league,date,home,away,home_odds,draw_odds,away_odds,home_goals_ft,away_goals_ft
2516,Premier League,2023-02-25,Bournemouth,Man City,10.0,5.75,1.29,1,4
2517,Premier League,2023-02-25,Crystal Palace,Liverpool,4.75,3.8,1.73,0,0
2518,Premier League,2023-02-26,Tottenham,Chelsea,2.55,3.25,2.9,2,0
2519,Premier League,2023-01-03,Arsenal,Everton,1.36,4.75,10.0,4,0
2520,Premier League,2023-01-03,Liverpool,Wolves,1.5,4.75,6.0,2,0


<h5>Feature Engineering and Preprocessing</h5>
<h6>When the quantity of the score of the Home Goals Full Time and Away Goals Full Time is bigger then 0 then assign 1 to the field "btts".When the quantity of the score of the visitor team is different of the home team(principal) then assign 0 to field "btts".</h6>

In [13]:
df["btts"] = np.where((df["home_goals_ft"] > 0) & (df["away_goals_ft"] > 0), 1, 0)
df["sum_odds"] = df["home_odds"] + df["draw_odds"] + df["away_odds"]
df.head()

Unnamed: 0,league,date,home,away,home_odds,draw_odds,away_odds,home_goals_ft,away_goals_ft,btts,sum_odds
0,Premier League,2016-08-13,Burnley,Swansea,2.4,3.3,3.25,0,1,0,8.95
1,Premier League,2016-08-13,Crystal Palace,West Brom,2.0,3.3,4.5,0,1,0,9.8
2,Premier League,2016-08-13,Everton,Tottenham,3.2,3.4,2.4,1,1,1,9.0
3,Premier League,2016-08-13,Hull,Leicester,4.5,3.6,1.91,2,1,1,10.01
4,Premier League,2016-08-13,Man City,Sunderland,1.25,6.5,15.0,2,1,1,22.75


<h5>Defining X and Y</h5>

In [14]:
X = df.loc[:, ["home_odds", "draw_odds", "away_odds", "sum_odds"]]
y = df.loc[:, "btts"]

<h5>Split Train and Test</h5>

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(X)

(1890, 4) (631, 4) (1890,) (631,)
      home_odds  draw_odds  away_odds  sum_odds
0          2.40       3.30       3.25      8.95
1          2.00       3.30       4.50      9.80
2          3.20       3.40       2.40      9.00
3          4.50       3.60       1.91     10.01
4          1.25       6.50      15.00     22.75
5          2.38       3.20       3.40      8.98
6          1.80       3.75       5.00     10.55
7          2.40       3.50       3.10      9.00
8          4.75       3.60       1.85     10.20
9          1.65       4.00       6.00     11.65
10         1.53       4.20       7.50     13.23
11         6.50       4.20       1.57     12.27
12         2.88       3.60       2.50      8.98
13         5.50       3.80       1.73     11.03
14         1.80       3.60       5.25     10.65
15         1.45       4.75       8.00     14.20
16         5.50       3.80       1.73     11.03
17         3.00       3.25       2.60      8.85
18         2.55       3.20       3.10      8.85
19    

<h5>Train the Model</h5>

In [17]:
lr = LogisticRegression(random_state=0)
lr.fit(X, y)

100%|██████████| 29/29 [00:02<00:00, 11.29it/s]


<h5>Evaluate the Model</h5>

In [19]:
models

NameError: name 'models' is not defined

<h5>Informations about model</h5>

In [None]:
print(f"Games that was predicted as been 0 and was 0: {confusion_matrix(y_test, y_predictions)[0][0]}")
print(f"Games that was predicted as been 0 and was 1: {confusion_matrix(y_test, y_predictions)[1][0]}")
print(f"Games that was predicted as been 1 and was 0: {confusion_matrix(y_test, y_predictions)[0][1]}")
print(f"Games that was predicted as been 1 and was 1: {confusion_matrix(y_test, y_predictions)[1][1]}")

test_total_matches = confusion_matrix(y_test, y_predictions)[0][0] + \
                     confusion_matrix(y_test, y_predictions)[0][1] + \
                     confusion_matrix(y_test, y_predictions)[1][0] + \
                     confusion_matrix(y_test, y_predictions)[1][1]

entry_games = confusion_matrix(y_test, y_predictions)[0][1] + confusion_matrix(y_test, y_predictions)[1][1]
entry_tax = (entry_games / test_total_matches) * 100
hit_rate = (confusion_matrix(y_test, y_predictions)[1][1] / entry_games) * 100

print(f"Total Matches: {test_total_matches}")
print(f"Entry Games: {entry_games}")
print(f"Entry Tax: {entry_tax.round(2)}%")
print(f"Hit Rate: {hit_rate.round(2)}%")

<h5>Prevision / Probabilities of classification</h5>

In [None]:
lr = LogisticRegression(random_state=0)
lr.fit(X, y)
predict_probability = lr.predict_proba(X)
df["predictions"] = y_predictions = lr.predict(X)
df["probability_0"] = predict_probability[:, 0].round(2)
df["probability_1"] = predict_probability[:, 1].round(2)
df.tail()

<h5>Backtest</h5>

In [None]:
stake = 1
win_back = stake * (df["home_odds"] - 1)
lose_back = -stake

df.loc[(df["predictions"] == 1) & (df["win_home"] == 1), "profit"] = win_back
df.loc[(df["predictions"] == 1) & (df["win_home"] == 0), "profit"] = lose_back
df.loc[(df["predictions"] == 0) & (df["win_home"] == 1), "profit"] = 0
df.loc[(df["predictions"] == 0) & (df["win_home"] == 1), "profit"] = 0

df = df[df["predictions"] == 1]
df["profit_accumulated"] = df["profit"].cumsum()
df.tail(20)

<h5>Plot Backtest</h5>