In [2]:
# All imports

import sys 
import os
import os.path
import time
import numpy as np
import csv
import datetime
import pandas as pd
import pickle
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# For the scrapper

import selenium
from selenium import webdriver
import io
import requests
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.support.ui import WebDriverWait
from datetime import date, timedelta
from selenium.webdriver.common.action_chains import ActionChains
import pickle

# Import data_cleaning notebooks 

import import_ipynb
import data_cleaning_sofa_L1
from data_cleaning_sofa_L1 import dfL1_largedim
from data_cleaning_sofa_PL import dfPL_smalldim, dfPL_largedim

# For Excel

import openpyxl

<h2 style='color:yellow'>Creating the compiled dataset</h2>
<p style='font-style:italic'>In order to predict the result of the game or the score difference (we will do both), we first need to have a dataset with both the stats of the home and away team.<p>

In [3]:
# We create a dictionnary with all the values home/away on a single row - Each row correspond to a football game

features = [
'Ball possession', 'Total shots', 'Shots on target', 'Shots off target', 'Blocked shots', 'Corner kicks', 'Offsides', 'Fouls', 'Yellow cards',
'Shots inside box', 'Shots outside box', 'Goalkeeper saves', 'Passes', 'Acc. passes', 'Duels won', 'Aerials won',
'Hit woodwork', 'Red cards', 'Big chances', 'Big chances missed', 'Long balls', 'Crosses', 'Dribbles', 'Tackles',
'Interceptions', 'Clearances', 'Acc. passes prop' ,'Crosses prop', 'Long balls prop','Dribbles prop', 
'Score difference', 'Result'
]

dic_largeX = {}

# Iterate on all rows
for index, row in dfL1_largedim.iterrows():
    if index < len(dfL1_largedim) - 1:
        # Each time the date is equal and the team equal opponent
        if (dfL1_largedim.loc[index, 'Team'] == dfL1_largedim.loc[index + 1, 'Opponent']) & (dfL1_largedim.loc[index, 'Date'] == dfL1_largedim.loc[index + 1, 'Date']):
            # Boucler sur toutes les valeurs communes pour les ajouter avec même nom '_opp'
            dic_largeX[index] = {}
            dic_largeX[index]['Team'] = dfL1_largedim.loc[index, 'Team']
            dic_largeX[index]['Opponent'] = dfL1_largedim.loc[index, 'Opponent']
            dic_largeX[index]['Date'] = dfL1_largedim.loc[index, 'Date']
            dic_largeX[index]['Home'] = dfL1_largedim.loc[index, 'Home']
            dic_largeX[index]['Result'] = dfL1_largedim.loc[index, 'Result']
            dic_largeX[index]['Score difference'] = dfL1_largedim.loc[index, 'Score difference']
            # The values which are different for home/away
            for feature in features:
                dic_largeX[index][f"{feature} ext"] = dfL1_largedim.loc[index + 1, feature]
                dic_largeX[index][f'{feature} home'] = dfL1_largedim.loc[index, feature]


# Create a data frame from the dictionnary
df = pd.DataFrame.from_dict(dic_largeX, orient='index')
df = df.reset_index().drop(columns=['index'])
df

Unnamed: 0,Team,Opponent,Date,Home,Result,Score difference,Ball possession ext,Ball possession home,Total shots ext,Total shots home,...,Crosses prop ext,Crosses prop home,Long balls prop ext,Long balls prop home,Dribbles prop ext,Dribbles prop home,Score difference ext,Score difference home,Result ext,Result home
0,Olympique de Marseille,Toulouse,2018-08-10,1,1,4.0,0.40,0.60,5.0,23.0,...,0.25,0.20,0.48,0.58,0.42,0.46,-4.0,4.0,-1,1
1,FC Nantes,AS Monaco,2018-08-11,1,-1,-2.0,0.34,0.66,11.0,16.0,...,0.29,0.26,0.49,0.58,0.75,0.35,2.0,-2.0,1,-1
2,Angers,Nîmes Olympique,2018-08-11,1,-1,-1.0,0.44,0.56,14.0,20.0,...,0.38,0.19,0.37,0.31,0.44,0.59,1.0,-1.0,1,-1
3,Lille OSC,Stade Rennais,2018-08-11,1,1,2.0,0.44,0.56,8.0,15.0,...,0.16,0.40,0.42,0.52,0.50,0.77,-2.0,2.0,-1,1
4,Montpellier,Dijon,2018-08-11,1,-1,-1.0,0.46,0.54,10.0,17.0,...,0.20,0.27,0.45,0.53,0.64,1.00,1.0,-1.0,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1107,Metz,Lorient,2021-12-12,1,1,3.0,0.67,0.33,18.0,4.0,...,0.15,0.33,0.59,0.28,0.64,0.20,-3.0,3.0,-1,1
1108,Stade Rennais,OGC Nice,2021-12-12,1,-1,-1.0,0.33,0.67,8.0,21.0,...,0.25,0.31,0.46,0.81,0.78,0.44,1.0,-1.0,1,-1
1109,Troyes,Bordeaux,2021-12-12,1,-1,-1.0,0.35,0.65,7.0,20.0,...,0.14,0.13,0.44,0.51,0.56,0.58,1.0,-1.0,1,-1
1110,Strasbourg,Olympique de Marseille,2021-12-12,1,-1,-2.0,0.51,0.49,8.0,11.0,...,0.43,0.17,0.49,0.61,0.63,0.62,2.0,-2.0,1,-1


<h2 style='color:yellow'> Preprocessing</h2>

In [4]:
# Distinguish numerical, categorical, date and other features for the prprocessing
numerical_features = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home', 'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]
categorical_features = ['Team', 'Opponent']
date_features = ['Day']
passthrough_features = ['Home']

In [5]:
# Encoding dates method

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, 'Year'] = X['Date'].dt.year
    X.loc[:, 'Month'] = X['Date'].dt.month
    X.loc[:, 'Day'] = X['Date'].dt.day
    X.loc[:, 'Weekday'] = X['Date'].dt.weekday

    # Finally we can drop the original columns from the dataframe
    return X

# Encode the X
df = _encode_dates(df)

In [6]:
 # Create the preprocessor

preprocessor = ColumnTransformer(
    [
        ("date", OrdinalEncoder(), date_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('numeric', StandardScaler(), numerical_features),
        ('period', 'passthrough', passthrough_features)
    ]
)

<h2 style='color:yellow'>Model - Result</h2>

In [131]:
# Train test split
X = df.drop(columns=['Result', 'Score difference', 'Result home', 'Score difference home', 'Result ext', 'Score difference ext'])
y = df['Result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [132]:
params = {
    'colsample_bytree': 0.9,
    'learning_rate': 0.01, 
    'max_depth': -1, 
    'min_child_samples': 111, 
    'min_child_weight': 1, 
    'n_estimators': 500, 
    'num_leaves': 32, 
    'reg_alpha': 0, 
    'reg_lambda': 20, 
    'subsample': 0.2
}

regressor = lgb.LGBMClassifier(**params, random_state=21)

pipe = make_pipeline(preprocessor, regressor)
result = pipe.fit(X_train, y_train)

print(f'Train set, Accuracy_score={accuracy_score(y_train, result.predict(X_train)):.2f}')
print(f'Test set, Accuracy_score={accuracy_score(y_test, result.predict(X_test)):.2f}')

Train set, Accuracy_score=0.79
Test set, Accuracy_score=0.68


<h2 style='color:yellow'>Model - Score difference</h2>

In [17]:
# Train test split
df = df[df['Score difference'].notna()]
X_sd = df.drop(columns=['Result', 'Score difference', 'Result home', 'Score difference home', 'Result ext', 'Score difference ext'])
y_sd = df['Score difference']

X_train_sd, X_test_sd, y_train_sd, y_test_sd = train_test_split(X_sd, y_sd, test_size=0.2, random_state=42)


In [18]:
params_sd = {
    'colsample_bytree': 0.9,
    'learning_rate': 0.01, 
    'max_depth': -1, 
    'min_child_samples': 111, 
    'min_child_weight': 1, 
    'n_estimators': 500, 
    'num_leaves': 32, 
    'reg_alpha': 0, 
    'reg_lambda': 20, 
    'subsample': 0.2
}

regressor_sd = lgb.LGBMClassifier(**params_sd, random_state=21)

pipe_sd = make_pipeline(preprocessor, regressor_sd)
result_sd = pipe_sd.fit(X_train_sd, y_train_sd)

print(f'Train set, Accuracy_score={accuracy_score(y_train_sd, result_sd.predict(X_train_sd)):.2f}')
print(f'Test set, Accuracy_score={accuracy_score(y_test_sd, result_sd.predict(X_test_sd)):.2f}')

Train set, Accuracy_score=0.58
Test set, Accuracy_score=0.37


<h2 style='color:yellow'>Get the predictions</h2>

In [30]:
# Variables pour le scrapper : Choisir la ligue

# Ligue : à changer selon besoin
league_str = "a[href='/tournament/football/france/ligue-1/34']"
# Les matchs (format : a>div>div.Cell-sc ...)
match_str = 'a>div>div.Cell-sc-t6h3ns-0'
# La date (format : div>div.classe)
date_str = 'div>div.ixemiF'
# Le nom des équipes (format : div.classe > div.Content-...)
team_names_str = 'div.bGpZoa>div.Content-sc-1morvta-0'
# Le score
score_str = 'div.jUEsho>div.Content-sc-1morvta-0'
# Le button stat(format : .classe)
buttons_str = ".iCdnqS"
# Les stats (format : .styles__StatisticsItemContent...)
stats_str = '.styles__StatisticsItemContent-sc-1imujgi-0'
# Croix (format : path[d:'XX X XXX ...'])
cross_str = "path[d='M4 4 L20 20 M4 20 L20 4']"

<h2 style='color:yellow'>Scraper for match you want to predict</h2>
<p style='font-style:italic'>Usually you should seek for the next day of the league<p>

In [31]:
## Peut on aussi entrainer le DF avec 5 derniers résultats ??? Pour pouvoir le mettre dans la filenmaeion ???


# Retake a scrapper
driver = webdriver.Chrome(ChromeDriverManager().install())

#Specify Search URL
search_url='https://www.sofascore.com/'


#Variables des sélecteurs 
filehandler = open ('predict.pickle', 'w') 

# Scrapper
def daterange(start_date, end_date,filehandler):
    with open('teams_stats_sofascore_L1.csv','wb') as f:
        dic_matchs = {}
        i = 0
        for n in range(int((end_date - start_date).days)):
            date = start_date + timedelta(n)
            search_url = f"https://www.sofascore.com/{date}"
            driver.get(search_url)
            ligue_tags = driver.find_elements_by_css_selector(league_str)
            if len(ligue_tags) < 2:
                driver.execute_script("window.scrollBy(0, arguments[0]);", 1000)
                ligue_tags = driver.find_elements_by_css_selector(league_str)
            if len(ligue_tags) >= 2:
                ligue_tag = ligue_tags[1].find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..')
                test_tag = ligue_tags[1].find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..')
                ligue_tag_style = ligue_tag.get_attribute("style")
                test_tag_style = ligue_tag.get_attribute("style")
                matchs=[]
                matchs_childs=[]
                while test_tag_style[:12] == ligue_tag_style[:12]:
                    matchs.append(test_tag)
                    #Si jamais le tag de Ligue 1 est trop bas
                    if test_tag.location['y'] > 1000 :
                        desired_y1 = (test_tag.size['height'] / 2) + test_tag.location['y']
                        current_y1 = (driver.execute_script('return window.innerHeight') / 2) + driver.execute_script('return window.pageYOffset')
                        scroll_y1_by = desired_y1 - current_y1
                        driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y1_by)
                    #time.sleep(2)
                    test_tag_style = test_tag.find_element_by_xpath("following-sibling::div").get_attribute('style')
                    test_tag = test_tag.find_element_by_xpath("following-sibling::div")
                del matchs[0]
                for match in matchs:
                    if match:
                        matchs_childs.append(match.find_element_by_css_selector(f"{match_str}"))
                for game in matchs_childs:
                    print
                    dic_matchs[i] = {}
                    dic_matchs[i+1] = {}
                    date = game.find_element_by_css_selector(f"{date_str}")
                    team_names = game.find_elements_by_css_selector(f"{team_names_str}")
                    score = game.find_elements_by_css_selector(f"{score_str}")
                    dic_matchs[i]['Date'] = date.get_attribute('innerHTML')
                    dic_matchs[i+1]['Date'] = date.get_attribute('innerHTML')
                    dic_matchs[i]['Home'] = 1
                    dic_matchs[i+1]['Home'] = 0
                    dic_matchs[i]['Team'] = team_names[0].text
                    dic_matchs[i]['Opponent'] = team_names[1].text
                    dic_matchs[i+1]['Team'] = team_names[1].text
                    dic_matchs[i+1]['Opponent'] = team_names[0].text
                    desired_y = (game.size['height'] / 2) + game.location['y']
                    current_y = (driver.execute_script('return window.innerHeight') / 2) + driver.execute_script('return window.pageYOffset')
                    scroll_y_by = desired_y - current_y
                    driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by)
                    game.click()
                    i = i + 2
                    with open('predict.pickle', 'wb') as handle:
                        pickle.dump(dic_matchs, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return dic_matchs


start_date = datetime.date(2021,12,21)
end_date = datetime.date(2021,12,23)

global_dic = {}
ligue_date = datetime.date(2021,12,22)

# Gestion des erreurs 
dic_matchs = daterange(start_date, end_date,filehandler)
with open('predict.pickle', 'rb') as handle:
    dic_match = pickle.load(handle)
    i = 0
    for key, value in dic_match.items():
        global_dic[len(global_dic) + i] = value
        i = i + 1
    last_date_raw = dic_match[len(dic_match)-1]['Date']
    last_date = last_date_raw[:2] + '-' + last_date_raw[3:5]+ '-' + last_date_raw[6:]
    last_date_2 = datetime.datetime.strptime(last_date, '%d-%m-%y').date()
    start_date = last_date_2




Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/charlesproye/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())
  ligue_tags = driver.find_elements_by_css_selector(league_str)
  ligue_tags = driver.find_elements_by_css_selector(league_str)


In [32]:
# Create prediction DataFrame
prediction = pd.DataFrame.from_dict(dic_matchs, orient='index')
# Only home game (to be discussed)
prediction = prediction[prediction['Home'] == 1]
# Encode the date
prediction['Date'] = prediction['Date'].apply(lambda x: datetime.datetime.strptime(x, "%d/%m/%y"))
prediction = _encode_dates(prediction)

# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 5

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average

# Donner plus d'importance au match récent et au match ayant le même statut (domicile extérieur)

def create_weigted_mean_columns_home(team, dataset, n, feature, bool):
    
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0] < n:
        n = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0]
    data_team = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].iloc[-n:,:]
    
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average


for feature in numerical_features_home:
    bool = True
    prediction[feature] = prediction['Team'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))

for feature in numerical_features_ext:
    bool = False
    prediction[feature] = prediction['Opponent'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))



# Même ordre que le training
prediction = prediction[X.columns]

NameError: name 'X' is not defined

In [135]:
predictions = result.predict(prediction)
prediction = prediction.reset_index().drop(columns=['index'])
pred_df = pd.DataFrame(prediction['Date'])
pred_df = pred_df.join(prediction['Team'])
pred_df = pred_df.join(prediction['Opponent'])
pred_df = pred_df.join(pd.DataFrame(predictions, columns=['Result encoded']))
pred_df

Unnamed: 0,Date,Team,Opponent,Result encoded
0,2021-12-22,AS Monaco,Stade Rennais,-1
1,2021-12-22,Bordeaux,Lille OSC,1
2,2021-12-22,Clermont Foot 63,Strasbourg,0
3,2021-12-22,Lorient,Paris Saint-Germain,-1
4,2021-12-22,Montpellier,Angers,-1
5,2021-12-22,OGC Nice,RC Lens,-1
6,2021-12-22,Olympique Lyonnais,Metz,1
7,2021-12-22,Olympique de Marseille,Stade de Reims,1
8,2021-12-22,Saint-Étienne,FC Nantes,1
9,2021-12-22,Troyes,Stade Brestois 29,-1


<h2 style='color:yellow'>Get it on Excel</h2>

In [125]:
league_name = 'Ligue1'
day_number = 19
n = n_last_games

with pd.ExcelWriter('bet_excel.xlsx', mode='a', engine='openpyxl') as writer:
    pred_df.to_excel(writer, sheet_name=f'{league_name} - Journée {day_number} (n = {n})')

### Idées et choses à faires:
- Pourrait on comparer la côte à la sécurité de la classification pour essayer de réaliser des arbitrages ? (récupérer cette valeur)
- Chercher à trouver le meilleur modèle et à le tuner
- Parier quand le modèle prévoit des grosses côtes

<h2 style='color:yellow'>Check the algo on past days to choose the best N</h2>

*Try to get the best n and to check if the model is working*

In [138]:
df_past = dfL1_largedim.iloc[:,:4]

In [139]:
df_past

Unnamed: 0,Date,Home,Team,Opponent
0,2018-08-10,1,Olympique de Marseille,Toulouse
1,2018-08-10,0,Toulouse,Olympique de Marseille
2,2018-08-11,1,FC Nantes,AS Monaco
3,2018-08-11,0,AS Monaco,FC Nantes
4,2018-08-11,1,Angers,Nîmes Olympique
...,...,...,...,...
2344,2021-12-12,0,Bordeaux,Troyes
2345,2021-12-12,1,Strasbourg,Olympique de Marseille
2346,2021-12-12,0,Olympique de Marseille,Strasbourg
2347,2021-12-12,1,Paris Saint-Germain,AS Monaco


<h2 style='color:yellow'>N = 5</h2>

In [142]:
# Only home game (to be discussed)
df_past = df_past[df_past['Home'] == 1]
# Encode the date
df_past = _encode_dates(df_past)

# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 5

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average

# Donner plus d'importance au match récent et au match ayant le même statut (domicile extérieur)

def create_weigted_mean_columns_home(team, dataset, n, feature, bool):
    
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0] < n:
        n = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0]
    data_team = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].iloc[-n:,:]
    
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average


for feature in numerical_features_home:
    bool = True
    df_past[feature] = df_past['Team'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))

for feature in numerical_features_ext:
    bool = False
    df_past[feature] = df_past['Opponent'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))



# Même ordre que le training
df_past = df_past[X.columns]
df_pasts = result.predict(df_past)
df_past = df_past.reset_index().drop(columns=['index'])
pred_df_past = pd.DataFrame(df_past['Date'])
pred_df_past = pred_df_past.join(df_past['Team'])
pred_df_past = pred_df_past.join(df_past['Opponent'])
pred_df_past = pred_df_past.join(pd.DataFrame(df_pasts, columns=['Result encoded']))
pred_df_past

Unnamed: 0,Date,Team,Opponent,Result encoded
0,2018-08-10,Olympique de Marseille,Toulouse,1
1,2018-08-11,FC Nantes,AS Monaco,1
2,2018-08-11,Angers,Nîmes Olympique,-1
3,2018-08-11,Lille OSC,Stade Rennais,-1
4,2018-08-11,Montpellier,Dijon,1
...,...,...,...,...
1168,2021-12-12,Metz,Lorient,1
1169,2021-12-12,Stade Rennais,OGC Nice,-1
1170,2021-12-12,Troyes,Bordeaux,-1
1171,2021-12-12,Strasbourg,Olympique de Marseille,-1


In [175]:
# Check the % of right
dfL1_test = dfL1_largedim
dfL1_test = dfL1_test[dfL1_test['Home'] == 1]
pred_df_past['Result encoded']
dfL1_test = dfL1_test.reset_index().drop(columns=['index'])
print(f"The % of match well predicted with the weighted mean of old match when n = 5 is {(np.sum(dfL1_test['Result'] == pred_df_past['Result encoded']) / len(dfL1_test['Result'])):.2f}")

The % of match well predicted with the weighted mean of old match when n = 5 is 0.36


<h2 style='color:yellow'>N = 10</h2>

In [173]:
df_past = dfL1_largedim.iloc[:,:4]
# Only home game (to be discussed)
df_past = df_past[df_past['Home'] == 1]
# Encode the date
df_past = _encode_dates(df_past)

# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 10

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average

# Donner plus d'importance au match récent et au match ayant le même statut (domicile extérieur)

def create_weigted_mean_columns_home(team, dataset, n, feature, bool):
    
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0] < n:
        n = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0]
    data_team = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].iloc[-n:,:]
    
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average


for feature in numerical_features_home:
    bool = True
    df_past[feature] = df_past['Team'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))

for feature in numerical_features_ext:
    bool = False
    df_past[feature] = df_past['Opponent'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))



# Même ordre que le training
df_past = df_past[X.columns]
df_pasts = result.predict(df_past)
df_past = df_past.reset_index().drop(columns=['index'])
pred_df_past_10 = pd.DataFrame(df_past['Date'])
pred_df_past_10 = pred_df_past_10.join(df_past['Team'])
pred_df_past_10 = pred_df_past_10.join(df_past['Opponent'])
pred_df_past_10 = pred_df_past_10.join(pd.DataFrame(df_pasts, columns=['Result encoded']))
pred_df_past_10

Unnamed: 0,Date,Team,Opponent,Result encoded
0,2018-08-10,Olympique de Marseille,Toulouse,0
1,2018-08-11,FC Nantes,AS Monaco,-1
2,2018-08-11,Angers,Nîmes Olympique,-1
3,2018-08-11,Lille OSC,Stade Rennais,0
4,2018-08-11,Montpellier,Dijon,1
...,...,...,...,...
1168,2021-12-12,Metz,Lorient,1
1169,2021-12-12,Stade Rennais,OGC Nice,-1
1170,2021-12-12,Troyes,Bordeaux,0
1171,2021-12-12,Strasbourg,Olympique de Marseille,1


In [176]:
# Check the % of right
dfL1_test = dfL1_largedim
dfL1_test = dfL1_test[dfL1_test['Home'] == 1]
dfL1_test = dfL1_test.reset_index().drop(columns=['index'])
print(f"The % of match well predicted with the weighted mean of old match when n = {n_last_games} is {(np.sum(dfL1_test['Result'] == pred_df_past_10['Result encoded']) / len(dfL1_test['Result'])):.2f}")

The % of match well predicted with the weighted mean of old match when n = 10 is 0.37


<h2 style='color:yellow'>N = 20</h2>

In [177]:
df_past = dfL1_largedim.iloc[:,:4]
# Only home game (to be discussed)
df_past = df_past[df_past['Home'] == 1]
# Encode the date
df_past = _encode_dates(df_past)

# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 20

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average

# Donner plus d'importance au match récent et au match ayant le même statut (domicile extérieur)

def create_weigted_mean_columns_home(team, dataset, n, feature, bool):
    
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0] < n:
        n = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0]
    data_team = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].iloc[-n:,:]
    
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average


for feature in numerical_features_home:
    bool = True
    df_past[feature] = df_past['Team'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))

for feature in numerical_features_ext:
    bool = False
    df_past[feature] = df_past['Opponent'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))



# Même ordre que le training
df_past = df_past[X.columns]
df_pasts = result.predict(df_past)
df_past = df_past.reset_index().drop(columns=['index'])
pred_df_past_20 = pd.DataFrame(df_past['Date'])
pred_df_past_20 = pred_df_past_20.join(df_past['Team'])
pred_df_past_20 = pred_df_past_20.join(df_past['Opponent'])
pred_df_past_20 = pred_df_past_20.join(pd.DataFrame(df_pasts, columns=['Result encoded']))
pred_df_past_20

Unnamed: 0,Date,Team,Opponent,Result encoded
0,2018-08-10,Olympique de Marseille,Toulouse,-1
1,2018-08-11,FC Nantes,AS Monaco,1
2,2018-08-11,Angers,Nîmes Olympique,-1
3,2018-08-11,Lille OSC,Stade Rennais,1
4,2018-08-11,Montpellier,Dijon,1
...,...,...,...,...
1168,2021-12-12,Metz,Lorient,1
1169,2021-12-12,Stade Rennais,OGC Nice,1
1170,2021-12-12,Troyes,Bordeaux,0
1171,2021-12-12,Strasbourg,Olympique de Marseille,1


In [178]:
# Check the % of right
dfL1_test = dfL1_largedim
dfL1_test = dfL1_test[dfL1_test['Home'] == 1]
dfL1_test = dfL1_test.reset_index().drop(columns=['index'])
print(f"The % of match well predicted with the weighted mean of old match when n = {n_last_games} is {(np.sum(dfL1_test['Result'] == pred_df_past_20['Result encoded']) / len(dfL1_test['Result'])):.2f}")

The % of match well predicted with the weighted mean of old match when n = 20 is 0.41


<h2 style='color:yellow'>N = 30</h2>

In [180]:
df_past = dfL1_largedim.iloc[:,:4]
# Only home game (to be discussed)
df_past = df_past[df_past['Home'] == 1]
# Encode the date
df_past = _encode_dates(df_past)

# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 30

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average

# Donner plus d'importance au match récent et au match ayant le même statut (domicile extérieur)

def create_weigted_mean_columns_home(team, dataset, n, feature, bool):
    
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0] < n:
        n = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0]
    data_team = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].iloc[-n:,:]
    
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average


for feature in numerical_features_home:
    bool = True
    df_past[feature] = df_past['Team'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))

for feature in numerical_features_ext:
    bool = False
    df_past[feature] = df_past['Opponent'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))



# Même ordre que le training
df_past = df_past[X.columns]
df_pasts = result.predict(df_past)
df_past = df_past.reset_index().drop(columns=['index'])
pred_df_past_20 = pd.DataFrame(df_past['Date'])
pred_df_past_20 = pred_df_past_20.join(df_past['Team'])
pred_df_past_20 = pred_df_past_20.join(df_past['Opponent'])
pred_df_past_20 = pred_df_past_20.join(pd.DataFrame(df_pasts, columns=['Result encoded']))
pred_df_past_20
# Check the % of right
dfL1_test = dfL1_largedim
dfL1_test = dfL1_test[dfL1_test['Home'] == 1]
dfL1_test = dfL1_test.reset_index().drop(columns=['index'])
print(f"The % of match well predicted with the weighted mean of old match when n = {n_last_games} is {(np.sum(dfL1_test['Result'] == pred_df_past_20['Result encoded']) / len(dfL1_test['Result'])):.2f}")

The % of match well predicted with the weighted mean of old match when n = 30 is 0.39


<h2 style='color:yellow'>Trying a neural network</h2>

In [7]:
# multi-class classification with Keras
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [8]:

# load dataset
X_nn = df.drop(columns=['Result', 'Score difference', 'Result home', 'Score difference home', 'Result ext', 'Score difference ext'])
X_nn = _encode_dates(X_nn)
y_nn = df['Result']
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_nn)
encoded_Y = encoder.transform(y_nn)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
 
# define baseline model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(32, activation='relu')) # 32 RELU
	model.add(Dense(32, activation='relu')) # 32 RELU
	model.add(Dense(3, activation='softmax')) # 32 RELU
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model
 
estimator = KerasClassifier(build_fn=baseline_model, epochs=30, batch_size=5, verbose=1)
pipe = make_pipeline(preprocessor, estimator)
kfold = KFold(n_splits=5, shuffle=True)
results = cross_val_score(pipe, X_nn, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

  estimator = KerasClassifier(build_fn=baseline_model, epochs=30, batch_size=5, verbose=1)
2021-12-24 13:27:43.217691: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 2

In [239]:
pipe.fit(X_nn, y_nn)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('date', OrdinalEncoder(),
                                                  ['Day']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Team', 'Opponent']),
                                                 ('numeric', StandardScaler(),
                                                  ['Ball possession home',
                                                   'Total shots home',
                                                   'Shots on target home',
                                                   'Shots off target home',
                                                   'Blocked shots home',
                                                   'Corner kicks home',
                                                   'Offsides home',
                 

<h2 style='color:yellow'>N = 20 (for the NN model)</h2>

In [45]:

df_past = dfL1_largedim.iloc[:,:4]
# Only home game (to be discussed)
df_past = df_past[df_past['Home'] == 1]
# Encode the date
df_past = _encode_dates(df_past)

# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 20

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average

# Donner plus d'importance au match récent et au match ayant le même statut (domicile extérieur)

def create_weigted_mean_columns_home(team, dataset, n, feature, bool):
    
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0] < n:
        n = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0]
    data_team = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].iloc[-n:,:]
    print(data_team)
    
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average


for feature in numerical_features_home:
    bool = True
    df_past[feature] = df_past['Team'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))

for feature in numerical_features_ext:
    bool = False
    df_past[feature] = df_past['Opponent'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))



# Renormalize possession
df_past['Ball possession ext'] = 1 / ( 1 + df_past['Ball possession home'] / df_past['Ball possession ext'])
df_past['Ball possession home'] =  df_past['Ball possession ext'] * (df_past['Ball possession home'] / df_past['Ball possession ext'])
# Même ordre que le training
df_past = df_past[X.columns]
df_pasts = pipe.predict(df_past)
df_past = df_past.reset_index().drop(columns=['index'])
pred_df_past_20 = pd.DataFrame(df_past['Date'])
pred_df_past_20 = pred_df_past_20.join(df_past['Team'])
pred_df_past_20 = pred_df_past_20.join(df_past['Opponent'])
pred_df_past_20 = pred_df_past_20.join(pd.DataFrame(df_pasts, columns=['Result encoded']))
pred_df_past_20
# Check the % of right
dfL1_test = dfL1_largedim
dfL1_test = dfL1_test[dfL1_test['Home'] == 1]
dfL1_test = dfL1_test.reset_index().drop(columns=['index'])
print(f"The % of match well predicted with the weighted mean of old match when n = {n_last_games} is {(np.sum(dfL1_test['Result'] == pred_df_past_20['Result encoded']) / len(dfL1_test['Result'])):.2f}")

                        Team                Opponent       Date  Home  Result  \
922   Olympique de Marseille                  Angers 2021-05-16     1       1   
928                     Metz  Olympique de Marseille 2021-05-23     1       0   
937                AS Monaco  Olympique de Marseille 2021-09-11     1      -1   
957              Montpellier  Olympique de Marseille 2021-08-08     1      -1   
967   Olympique de Marseille                Bordeaux 2021-08-15     1       0   
977                 OGC Nice  Olympique de Marseille 2021-08-22     1      -1   
980   Olympique de Marseille           Saint-Étienne 2021-08-28     1       1   
996   Olympique de Marseille           Stade Rennais 2021-09-19     1       1   
1003                  Angers  Olympique de Marseille 2021-09-22     1       0   
1016  Olympique de Marseille                 RC Lens 2021-09-26     1      -1   
1025               Lille OSC  Olympique de Marseille 2021-10-03     1       1   
1034  Olympique de Marseille

KeyboardInterrupt: 

<h2 style='color:yellow'>Training the model on weighted average</h2>

In [9]:
df_past = df.iloc[:,:4]
# Encode the date
df_past = _encode_dates(df_past)

# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 20

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average

# Donner plus d'importance au match récent et au match ayant le même statut (domicile extérieur)

def create_weigted_mean_columns_home(team, date, dataset, n, feature, bool):
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[((dataset['Team'] == team) | (dataset['Opponent'] == team)) & (dataset['Date'] <= date)].shape[0] < n:
        n = dataset[((dataset['Team'] == team) | (dataset['Opponent'] == team)) & (dataset['Date'] <= date)].shape[0] + 1
    data_team = dataset[((dataset['Team'] == team) | (dataset['Opponent'] == team)) & (dataset['Date'] <= date)].iloc[-n:,:]
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average


for feature in numerical_features_home:
    bool = True
    df_past[feature] = df_past.apply(lambda x: create_weigted_mean_columns_home(x[0], x[2], df, n_last_games, feature, bool), axis=1)

for feature in numerical_features_ext:
    bool = False
    df_past[feature] = df_past.apply(lambda x: create_weigted_mean_columns_home(x[1], x[2], df, n_last_games, feature, bool), axis=1)



# Même ordre que le training
df_past = df_past[X_nn.columns]
#df_pasts = result.predict(df_past)
#df_past = df_past.reset_index().drop(columns=['index'])
#pred_df_past_20 = pd.DataFrame(df_past['Date'])
#pred_df_past_20 = pred_df_past_20.join(df_past['Team'])
#pred_df_past_20 = pred_df_past_20.join(df_past['Opponent'])
#pred_df_past_20 = pred_df_past_20.join(pd.DataFrame(df_pasts, columns=['Result encoded']))
#pred_df_past_20
# Check the % of right
#dfL1_test = dfL1_largedim
#dfL1_test = dfL1_test[dfL1_test['Home'] == 1]
#dfL1_test = dfL1_test.reset_index().drop(columns=['index'])
#print(f"The % of match well predicted with the weighted mean of old match when n = {n_last_games} is {(np.sum(dfL1_test['Result'] == pred_df_past_20['Result encoded']) / len(dfL1_test['Result'])):.2f}")

In [11]:
df_past

Unnamed: 0,Team,Opponent,Date,Home,Ball possession ext,Ball possession home,Total shots ext,Total shots home,Shots on target ext,Shots on target home,...,Crosses prop ext,Crosses prop home,Long balls prop ext,Long balls prop home,Dribbles prop ext,Dribbles prop home,Year,Month,Day,Weekday
0,Olympique de Marseille,Toulouse,2018-08-10,1,0.400000,0.600000,5.000000,23.000000,1.000000,10.000000,...,0.250000,0.200000,0.480000,0.580000,0.420000,0.460000,2018,8,10,4
1,FC Nantes,AS Monaco,2018-08-11,1,0.340000,0.660000,11.000000,16.000000,6.000000,4.000000,...,0.290000,0.260000,0.490000,0.580000,0.750000,0.350000,2018,8,11,5
2,Angers,Nîmes Olympique,2018-08-11,1,0.440000,0.560000,14.000000,20.000000,4.000000,7.000000,...,0.380000,0.190000,0.370000,0.310000,0.440000,0.590000,2018,8,11,5
3,Lille OSC,Stade Rennais,2018-08-11,1,0.440000,0.560000,8.000000,15.000000,2.000000,6.000000,...,0.160000,0.400000,0.420000,0.520000,0.500000,0.770000,2018,8,11,5
4,Montpellier,Dijon,2018-08-11,1,0.460000,0.540000,10.000000,17.000000,4.000000,7.000000,...,0.200000,0.270000,0.450000,0.530000,0.640000,1.000000,2018,8,11,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1107,Metz,Lorient,2021-12-12,1,0.520151,0.519754,10.497673,12.808905,3.595859,4.413497,...,0.241764,0.227920,0.530635,0.496765,0.496191,0.500647,2021,12,12,6
1108,Stade Rennais,OGC Nice,2021-12-12,1,0.446106,0.524521,9.234694,15.295365,3.206401,4.627533,...,0.236721,0.255823,0.504105,0.584444,0.563124,0.514980,2021,12,12,6
1109,Troyes,Bordeaux,2021-12-12,1,0.482318,0.520940,12.704720,14.021027,4.778191,4.582457,...,0.255052,0.256783,0.534778,0.533563,0.563385,0.602499,2021,12,12,6
1110,Strasbourg,Olympique de Marseille,2021-12-12,1,0.470659,0.559036,10.282266,13.875379,3.681523,4.384812,...,0.265279,0.263834,0.536288,0.545234,0.505362,0.586289,2021,12,12,6


In [12]:
# load dataset
X_nn_wa = df_past
X_nn_wa = _encode_dates(X_nn_wa)
y_nn_wa = df['Result']
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_nn_wa)
encoded_Y_wa = encoder.transform(y_nn_wa)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_wa = np_utils.to_categorical(encoded_Y_wa)

In [13]:
X_train_wa, X_test_wa, y_train_wa, y_test_wa = train_test_split(X_nn_wa, y_nn_wa, test_size=0.2, random_state=42)
result_wa = pipe.fit(X_train_wa, y_train_wa)

print(f'Train set, Accuracy_score={accuracy_score(y_train_wa, result_wa.predict(X_train_wa)):.2f}')
print(f'Test set, Accuracy_score={accuracy_score(y_test_wa, result_wa.predict(X_test_wa)):.2f}')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train set, Accuracy_score=0.99
Test set, Accuracy_score=0.44


In [25]:
# Try with adding the minus to capture the away/home feature
numerical_features = [
    'Ball possession', 'Total shots', 'Shots on target', 'Shots off target', 'Blocked shots', 'Corner kicks', 'Offsides', 'Fouls', 'Yellow cards',
    'Shots inside box', 'Shots outside box', 'Goalkeeper saves', 'Passes', 'Acc. passes', 'Duels won', 'Aerials won',
    'Hit woodwork', 'Red cards', 'Big chances', 'Big chances missed', 'Long balls', 'Crosses', 'Dribbles', 'Tackles',
    'Interceptions', 'Clearances', 'Acc. passes prop','Crosses prop', 'Long balls prop','Dribbles prop'
]

df_past_copy = df_past.copy()
for feature in numerical_features:
    df_past_copy[feature] = df_past_copy[f'{feature} home'] - df_past_copy[f'{feature} ext']
    df_past_copy.drop(columns=[f'{feature} home', f'{feature} ext'], inplace = True)

df_past_copy

Unnamed: 0,Team,Opponent,Date,Home,Year,Month,Day,Weekday,Ball possession,Total shots,...,Long balls,Crosses,Dribbles,Tackles,Interceptions,Clearances,Acc. passes prop,Crosses prop,Long balls prop,Dribbles prop
0,Olympique de Marseille,Toulouse,2018-08-10,1,2018,8,10,4,0.200000,18.000000,...,10.000000,2.000000,-5.000000,10.000000,-8.000000,-7.000000,0.060000,-0.050000,0.100000,0.040000
1,FC Nantes,AS Monaco,2018-08-11,1,2018,8,11,5,0.320000,5.000000,...,15.000000,3.000000,0.000000,-6.000000,8.000000,-17.000000,0.170000,-0.030000,0.090000,-0.400000
2,Angers,Nîmes Olympique,2018-08-11,1,2018,8,11,5,0.120000,6.000000,...,-14.000000,-2.000000,5.000000,-2.000000,-12.000000,-8.000000,0.080000,-0.190000,-0.060000,0.150000
3,Lille OSC,Stade Rennais,2018-08-11,1,2018,8,11,5,0.120000,7.000000,...,7.000000,1.000000,5.000000,2.000000,8.000000,6.000000,0.060000,0.240000,0.100000,0.270000
4,Montpellier,Dijon,2018-08-11,1,2018,8,11,5,0.080000,7.000000,...,5.000000,5.000000,1.000000,-1.000000,-5.000000,-17.000000,0.030000,0.070000,0.080000,0.360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1107,Metz,Lorient,2021-12-12,1,2021,12,12,6,-0.000397,2.311232,...,-7.030050,0.584242,-0.731959,-0.927139,0.774160,-3.451997,-0.001011,-0.013844,-0.033870,0.004456
1108,Stade Rennais,OGC Nice,2021-12-12,1,2021,12,12,6,0.078415,6.060671,...,5.380808,3.412326,-1.027312,-3.252104,-4.023224,-7.732101,0.019839,0.019103,0.080340,-0.048144
1109,Troyes,Bordeaux,2021-12-12,1,2021,12,12,6,0.038622,1.316308,...,-4.378177,1.937458,-1.710317,1.114323,-0.549212,-6.152034,0.018180,0.001731,-0.001215,0.039114
1110,Strasbourg,Olympique de Marseille,2021-12-12,1,2021,12,12,6,0.088376,3.593113,...,3.645897,2.364795,0.690347,-0.313492,-0.389337,-2.455896,-0.005936,-0.001444,0.008946,0.080927


In [27]:
preprocessor2 = ColumnTransformer(
    [
        ("date", OrdinalEncoder(), date_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('numeric', StandardScaler(), numerical_features),
        ('period', 'passthrough', passthrough_features)
    ]
)

In [28]:
# load dataset
X_minus = df_past_copy
X_minus = _encode_dates(X_minus)
y_minus = df['Result']
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_minus)
encoded_Y_minus = encoder.transform(y_minus)

pipe_minus = make_pipeline(preprocessor2, estimator)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_minus = np_utils.to_categorical(encoded_Y_minus)
X_train_minus, X_test_minus, y_train_minus, y_test_minus = train_test_split(X_minus, y_minus, test_size=0.2, random_state=42)
result_minus = pipe_minus.fit(X_train_minus, y_train_minus)

print(f'Train set, Accuracy_score={accuracy_score(y_train_minus, result_minus.predict(X_train_minus)):.2f}')
print(f'Test set, Accuracy_score={accuracy_score(y_test_minus, result_minus.predict(X_test_minus)):.2f}')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train set, Accuracy_score=0.91
Test set, Accuracy_score=0.47


In [35]:
for feature in numerical_features:
    prediction[feature] = prediction[f'{feature} home'] - prediction[f'{feature} ext']
    prediction.drop(columns=[f'{feature} home', f'{feature} ext'], inplace = True)

In [36]:
predictions = result_minus.predict(prediction)
prediction = prediction.reset_index().drop(columns=['index'])
pred_df = pd.DataFrame(prediction['Date'])
pred_df = pred_df.join(prediction['Team'])
pred_df = pred_df.join(prediction['Opponent'])
pred_df = pred_df.join(pd.DataFrame(predictions, columns=['Result encoded']))
pred_df

Unnamed: 0,Date,Team,Opponent,Result encoded
0,2021-12-22,AS Monaco,Stade Rennais,-1
1,2021-12-22,Bordeaux,Lille OSC,-1
2,2021-12-22,Clermont Foot 63,Strasbourg,-1
3,2021-12-22,Lorient,Paris Saint-Germain,0
4,2021-12-22,Montpellier,Angers,-1
5,2021-12-22,OGC Nice,RC Lens,-1
6,2021-12-22,Olympique Lyonnais,Metz,0
7,2021-12-22,Olympique de Marseille,Stade de Reims,1
8,2021-12-22,Saint-Étienne,FC Nantes,1
9,2021-12-22,Troyes,Stade Brestois 29,-1
