In [2]:
# All imports

import sys 
import os
import os.path
import time
import numpy as np
import csv
import datetime
import pandas as pd
import pickle
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# For the scrapper

import selenium
from selenium import webdriver
import io
import requests
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.support.ui import WebDriverWait
from datetime import date, timedelta
from selenium.webdriver.common.action_chains import ActionChains
import pickle

# Import data_cleaning notebooks 

import import_ipynb
import data_cleaning_sofa_L1
from data_cleaning_sofa_L1 import dfL1_largedim
from data_cleaning_sofa_PL import dfPL_smalldim, dfPL_largedim

# For Excel

import openpyxl

# Multi-class classification with Keras
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

<h2 style='color:yellow'>Creating the compiled dataset</h2>
<p style='font-style:italic'>In order to predict the result, we first need to have a dataset with both the stats of the home and away team.<p>

In [4]:
# Encoding dates method

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, 'Year'] = X['Date'].dt.year
    X.loc[:, 'Month'] = X['Date'].dt.month
    X.loc[:, 'Day'] = X['Date'].dt.day
    X.loc[:, 'Weekday'] = X['Date'].dt.weekday

    # Finally we can drop the original columns from the dataframe
    return X

In [5]:
# We create a dictionnary with all the values home/away on a single row - Each row correspond to a football game

features = [
'Ball possession', 'Total shots', 'Shots on target', 'Shots off target', 'Blocked shots', 'Corner kicks', 'Offsides', 'Fouls', 'Yellow cards',
'Shots inside box', 'Shots outside box', 'Goalkeeper saves', 'Passes', 'Acc. passes', 'Duels won', 'Aerials won',
'Hit woodwork', 'Red cards', 'Big chances', 'Big chances missed', 'Long balls', 'Crosses', 'Dribbles', 'Tackles',
'Interceptions', 'Clearances', 'Acc. passes prop' ,'Crosses prop', 'Long balls prop','Dribbles prop', 
'Score difference', 'Result'
]

dic_largeX = {}

# Iterate on all rows
for index, row in dfL1_largedim.iterrows():
    if index < len(dfL1_largedim) - 1:
        # Each time the date is equal and the team equal opponent
        if (dfL1_largedim.loc[index, 'Team'] == dfL1_largedim.loc[index + 1, 'Opponent']) & (dfL1_largedim.loc[index, 'Date'] == dfL1_largedim.loc[index + 1, 'Date']):
            # Boucler sur toutes les valeurs communes pour les ajouter avec même nom '_opp'
            dic_largeX[index] = {}
            dic_largeX[index]['Team'] = dfL1_largedim.loc[index, 'Team']
            dic_largeX[index]['Opponent'] = dfL1_largedim.loc[index, 'Opponent']
            dic_largeX[index]['Date'] = dfL1_largedim.loc[index, 'Date']
            dic_largeX[index]['Home'] = dfL1_largedim.loc[index, 'Home']
            dic_largeX[index]['Result'] = dfL1_largedim.loc[index, 'Result']
            dic_largeX[index]['Score difference'] = dfL1_largedim.loc[index, 'Score difference']
            # The values which are different for home/away
            for feature in features:
                dic_largeX[index][f"{feature} ext"] = dfL1_largedim.loc[index + 1, feature]
                dic_largeX[index][f'{feature} home'] = dfL1_largedim.loc[index, feature]


# Create a data frame from the dictionnary
df = pd.DataFrame.from_dict(dic_largeX, orient='index')
df = df.reset_index().drop(columns=['index'])

df_past = df.iloc[:,:4]
# Encode the date
df_past = _encode_dates(df_past)

# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 20

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average

# Donner plus d'importance au match récent et au match ayant le même statut (domicile extérieur)

def create_weigted_mean_columns_home(team, date, dataset, n, feature, bool):
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[((dataset['Team'] == team) | (dataset['Opponent'] == team)) & (dataset['Date'] <= date)].shape[0] < n:
        n = dataset[((dataset['Team'] == team) | (dataset['Opponent'] == team)) & (dataset['Date'] <= date)].shape[0] + 1
    data_team = dataset[((dataset['Team'] == team) | (dataset['Opponent'] == team)) & (dataset['Date'] <= date)].iloc[-n:,:]
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average
def create_weigted_mean_columns_last3(team, date, dataset, feature, bool):
    n = 3
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[((dataset['Team'] == team) | (dataset['Opponent'] == team)) & (dataset['Date'] <= date)].shape[0] < 3:
        n = dataset[((dataset['Team'] == team) | (dataset['Opponent'] == team)) & (dataset['Date'] <= date)].shape[0] + 1
    data_team = dataset[((dataset['Team'] == team) | (dataset['Opponent'] == team)) & (dataset['Date'] <= date)].iloc[-n:,:]
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average

for feature in numerical_features_home:
    bool = True
    df_past[feature] = df_past.apply(lambda x: create_weigted_mean_columns_home(x[0], x[2], df, n_last_games, feature, bool), axis=1)
    df_past[f'{feature} last3'] = df_past.apply(lambda x: create_weigted_mean_columns_last3(x[0], x[2], df, feature, bool), axis=1)

for feature in numerical_features_ext:
    bool = False
    df_past[feature] = df_past.apply(lambda x: create_weigted_mean_columns_home(x[1], x[2], df, n_last_games, feature, bool), axis=1)
    df_past[f'{feature} last3'] = df_past.apply(lambda x: create_weigted_mean_columns_last3(x[1], x[2], df, feature, bool), axis=1)



# Même ordre que le training
order = df.drop(columns=['Result', 'Score difference', 'Result home', 'Score difference home', 'Result ext', 'Score difference ext'])
order = _encode_dates(order)
#df_past = df_past[order.columns]


# Adding the minus to capture the away/home feature
numerical_features = [
    'Ball possession', 'Total shots', 'Shots on target', 'Shots off target', 'Blocked shots', 'Corner kicks', 'Offsides', 'Fouls', 'Yellow cards',
    'Shots inside box', 'Shots outside box', 'Goalkeeper saves', 'Passes', 'Acc. passes', 'Duels won', 'Aerials won',
    'Hit woodwork', 'Red cards', 'Big chances', 'Big chances missed', 'Long balls', 'Crosses', 'Dribbles', 'Tackles',
    'Interceptions', 'Clearances', 'Acc. passes prop','Crosses prop', 'Long balls prop','Dribbles prop'
]

for feature in numerical_features:
    df_past[feature] = df_past[f'{feature} home'] - df_past[f'{feature} ext']
    df_past[f'{feature} last3'] = df_past[f'{feature} home last3'] - df_past[f'{feature} ext last3']
    df_past.drop(columns=[f'{feature} home', f'{feature} ext', f'{feature} home last3', f'{feature} ext last3'], inplace = True)

df_past

Unnamed: 0,Team,Opponent,Date,Home,Year,Month,Day,Weekday,Ball possession,Ball possession last3,...,Clearances,Clearances last3,Acc. passes prop,Acc. passes prop last3,Crosses prop,Crosses prop last3,Long balls prop,Long balls prop last3,Dribbles prop,Dribbles prop last3
0,Olympique de Marseille,Toulouse,2018-08-10,1,2018,8,10,4,0.200000,0.200000,...,-7.000000,-7.000000,0.060000,0.060000,-0.050000,-0.050000,0.100000,0.100000,0.040000,0.040000
1,FC Nantes,AS Monaco,2018-08-11,1,2018,8,11,5,0.320000,0.320000,...,-17.000000,-17.000000,0.170000,0.170000,-0.030000,-0.030000,0.090000,0.090000,-0.400000,-0.400000
2,Angers,Nîmes Olympique,2018-08-11,1,2018,8,11,5,0.120000,0.120000,...,-8.000000,-8.000000,0.080000,0.080000,-0.190000,-0.190000,-0.060000,-0.060000,0.150000,0.150000
3,Lille OSC,Stade Rennais,2018-08-11,1,2018,8,11,5,0.120000,0.120000,...,6.000000,6.000000,0.060000,0.060000,0.240000,0.240000,0.100000,0.100000,0.270000,0.270000
4,Montpellier,Dijon,2018-08-11,1,2018,8,11,5,0.080000,0.080000,...,-17.000000,-17.000000,0.030000,0.030000,0.070000,0.070000,0.080000,0.080000,0.360000,0.360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,OGC Nice,RC Lens,2021-12-22,1,2021,12,22,2,0.064522,0.068132,...,-5.185804,-5.331579,0.012943,0.007947,0.043131,0.053342,0.053873,0.190079,-0.101870,-0.150579
1117,Olympique Lyonnais,Metz,2021-12-22,1,2021,12,22,2,0.056555,0.148643,...,-5.508380,-7.514286,0.045122,0.075714,0.062926,0.100238,0.035241,0.053214,-0.051857,-0.095976
1118,Olympique de Marseille,Stade de Reims,2021-12-22,1,2021,12,22,2,0.081663,0.232167,...,-9.463676,-16.369048,0.076685,0.087167,0.082377,0.102524,0.075018,0.181571,0.089140,0.120167
1119,Saint-Étienne,FC Nantes,2021-12-22,1,2021,12,22,2,-0.006833,-0.031000,...,-1.347155,2.650000,0.010291,-0.007500,-0.016222,0.046500,0.042506,-0.008500,0.136215,0.063000


<h2 style='color:yellow'>Preprocessing</h2>

In [6]:
# Preprocessor
numerical_features = [
    'Ball possession', 'Total shots', 'Shots on target', 'Shots off target', 'Blocked shots', 'Corner kicks', 'Offsides', 'Fouls', 'Yellow cards',
    'Shots inside box', 'Shots outside box', 'Goalkeeper saves', 'Passes', 'Acc. passes', 'Duels won', 'Aerials won',
    'Hit woodwork', 'Red cards', 'Big chances', 'Big chances missed', 'Long balls', 'Crosses', 'Dribbles', 'Tackles',
    'Interceptions', 'Clearances', 'Acc. passes prop','Crosses prop', 'Long balls prop','Dribbles prop',
    'Ball possession last3', 'Total shots last3', 'Shots on target last3', 'Shots off target last3', 'Blocked shots last3', 'Corner kicks last3', 'Offsides last3', 'Fouls last3', 'Yellow cards last3',
    'Shots inside box last3', 'Shots outside box last3', 'Goalkeeper saves last3', 'Passes last3', 'Acc. passes last3', 'Duels won last3', 'Aerials won last3',
    'Hit woodwork last3', 'Red cards last3', 'Big chances last3', 'Big chances missed last3', 'Long balls last3', 'Crosses last3', 'Dribbles last3', 'Tackles last3',
    'Interceptions last3', 'Clearances last3', 'Acc. passes prop last3','Crosses prop last3', 'Long balls prop last3','Dribbles prop last3'
]
categorical_features = ['Team', 'Opponent']
#date_features = ['Day']

preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('numeric', StandardScaler(), numerical_features),
    ]
)

<h2 style='color:yellow'>Neural network - Multi Class classifier</h2>

In [7]:
# Model Neural network

# load dataset
X_minus = df_past
X_minus = _encode_dates(X_minus)
y_minus = df['Result']
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_minus)
encoded_Y_minus = encoder.transform(y_minus)

# define baseline model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(64, activation='relu')) 
	model.add(Dense(64, activation='relu')) 
	model.add(Dense(64, activation='relu')) 
	model.add(Dense(3, activation='softmax')) 
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model
 
estimator = KerasClassifier(build_fn=baseline_model, epochs=40, batch_size=5, verbose=0)

pipe_minus = make_pipeline(preprocessor, estimator)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_minus = np_utils.to_categorical(encoded_Y_minus)
X_train_minus, X_test_minus, y_train_minus, y_test_minus = train_test_split(X_minus, y_minus, test_size=0.2, random_state=42)
result_minus = pipe_minus.fit(X_train_minus, y_train_minus)

print(f'Train set, Accuracy_score={accuracy_score(y_train_minus, result_minus.predict(X_train_minus)):.2f}')
print(f'Test set, Accuracy_score={accuracy_score(y_test_minus, result_minus.predict(X_test_minus)):.2f}')


  estimator = KerasClassifier(build_fn=baseline_model, epochs=40, batch_size=5, verbose=0)
2021-12-25 14:56:06.601502: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Train set, Accuracy_score=1.00
Test set, Accuracy_score=0.54


In [8]:
# Cross-validation
kfold = KFold(n_splits=5, shuffle=True)
results = cross_val_score(pipe_minus,
X_minus, dummy_y_minus, cv=kfold)
results

array([0.49333334, 0.5223214 , 0.5714286 , 0.50446427, 0.5223214 ])

In [9]:
results.mean()

0.522773802280426

<h2 style='color:yellow'>Get the predictions</h2>

In [11]:
# Variables pour le scrapper : Choisir la ligue

# Ligue : à changer selon besoin
league_str = "a[href='/tournament/football/france/ligue-1/34']"
# Les matchs (format : a>div>div.Cell-sc ...)
match_str = 'a>div>div.Cell-sc-t6h3ns-0'
# La date (format : div>div.classe)
date_str = 'div>div.ixemiF'
# Le nom des équipes (format : div.classe > div.Content-...)
team_names_str = 'div.bGpZoa>div.Content-sc-1morvta-0'
# Le score
score_str = 'div.jUEsho>div.Content-sc-1morvta-0'
# Le button stat(format : .classe)
buttons_str = ".iCdnqS"
# Les stats (format : .styles__StatisticsItemContent...)
stats_str = '.styles__StatisticsItemContent-sc-1imujgi-0'
# Croix (format : path[d:'XX X XXX ...'])
cross_str = "path[d='M4 4 L20 20 M4 20 L20 4']"

<h2 style='color:yellow'>Scraper for match you want to predict</h2>
<p style='font-style:italic'>Usually you should seek for the next day of the league<p>

In [12]:
# Scrapper for prediction
driver = webdriver.Chrome(ChromeDriverManager().install())

#Specify Search URL
search_url='https://www.sofascore.com/'


#Variables des sélecteurs 
filehandler = open ('predict.pickle', 'w') 

# Scrapper
def daterange(start_date, end_date,filehandler):
    with open('teams_stats_sofascore_L1.csv','wb') as f:
        dic_matchs = {}
        i = 0
        for n in range(int((end_date - start_date).days)):
            date = start_date + timedelta(n)
            search_url = f"https://www.sofascore.com/{date}"
            driver.get(search_url)
            ligue_tags = driver.find_elements_by_css_selector(league_str)
            if len(ligue_tags) < 2:
                driver.execute_script("window.scrollBy(0, arguments[0]);", 1000)
                ligue_tags = driver.find_elements_by_css_selector(league_str)
            if len(ligue_tags) >= 2:
                ligue_tag = ligue_tags[1].find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..')
                test_tag = ligue_tags[1].find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..')
                ligue_tag_style = ligue_tag.get_attribute("style")
                test_tag_style = ligue_tag.get_attribute("style")
                matchs=[]
                matchs_childs=[]
                while test_tag_style[:12] == ligue_tag_style[:12]:
                    matchs.append(test_tag)
                    #Si jamais le tag de Ligue 1 est trop bas
                    if test_tag.location['y'] > 1000 :
                        desired_y1 = (test_tag.size['height'] / 2) + test_tag.location['y']
                        current_y1 = (driver.execute_script('return window.innerHeight') / 2) + driver.execute_script('return window.pageYOffset')
                        scroll_y1_by = desired_y1 - current_y1
                        driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y1_by)
                    #time.sleep(2)
                    test_tag_style = test_tag.find_element_by_xpath("following-sibling::div").get_attribute('style')
                    test_tag = test_tag.find_element_by_xpath("following-sibling::div")
                del matchs[0]
                for match in matchs:
                    if match:
                        matchs_childs.append(match.find_element_by_css_selector(f"{match_str}"))
                for game in matchs_childs:
                    print
                    dic_matchs[i] = {}
                    dic_matchs[i+1] = {}
                    date = game.find_element_by_css_selector(f"{date_str}")
                    team_names = game.find_elements_by_css_selector(f"{team_names_str}")
                    score = game.find_elements_by_css_selector(f"{score_str}")
                    dic_matchs[i]['Date'] = date.get_attribute('innerHTML')
                    dic_matchs[i+1]['Date'] = date.get_attribute('innerHTML')
                    dic_matchs[i]['Home'] = 1
                    dic_matchs[i+1]['Home'] = 0
                    dic_matchs[i]['Team'] = team_names[0].text
                    dic_matchs[i]['Opponent'] = team_names[1].text
                    dic_matchs[i+1]['Team'] = team_names[1].text
                    dic_matchs[i+1]['Opponent'] = team_names[0].text
                    desired_y = (game.size['height'] / 2) + game.location['y']
                    current_y = (driver.execute_script('return window.innerHeight') / 2) + driver.execute_script('return window.pageYOffset')
                    scroll_y_by = desired_y - current_y
                    driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by)
                    game.click()
                    i = i + 2
                    with open('predict.pickle', 'wb') as handle:
                        pickle.dump(dic_matchs, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return dic_matchs


start_date = datetime.date(2022,1,7)
end_date = datetime.date(2022,1,10)

global_dic = {}
ligue_date = datetime.date(2022,1,9)

# Gestion des erreurs 
dic_matchs = daterange(start_date, end_date,filehandler)
with open('predict.pickle', 'rb') as handle:
    dic_match = pickle.load(handle)
    i = 0
    for key, value in dic_match.items():
        global_dic[len(global_dic) + i] = value
        i = i + 1
    last_date_raw = dic_match[len(dic_match)-1]['Date']
    last_date = last_date_raw[:2] + '-' + last_date_raw[3:5]+ '-' + last_date_raw[6:]
    last_date_2 = datetime.datetime.strptime(last_date, '%d-%m-%y').date()
    start_date = last_date_2



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/charlesproye/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())
  ligue_tags = driver.find_elements_by_css_selector(league_str)


In [13]:
# Create prediction DataFrame
prediction = pd.DataFrame.from_dict(dic_matchs, orient='index')
# Only home game (to be discussed)
prediction = prediction[prediction['Home'] == 1]
# Encode the date
prediction['Date'] = prediction['Date'].apply(lambda x: datetime.datetime.strptime(x, "%d/%m/%y"))
prediction = _encode_dates(prediction)

In [14]:
# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 20

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average
def create_weigted_mean_columns_pred(team, dataset, n, feature, bool):
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0] < n:
        n = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0]
    data_team = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].iloc[-n:,:]
    
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average
def create_weigted_mean_columns_pred_last3(team, dataset, feature, bool):
    n = 3
    if dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0] < n:
        n = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0]
    data_team = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].iloc[-n:,:]
    
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average


# Add the weigthed features to the prediction data_frame
for feature in numerical_features_home:
    bool = True
    prediction[feature] = prediction['Team'].apply(lambda x: create_weigted_mean_columns_pred(x, df, n_last_games, feature, bool))
    prediction[f'{feature} last3'] = prediction['Team'].apply(lambda x: create_weigted_mean_columns_pred_last3(x, df, feature, bool))

for feature in numerical_features_ext:
    bool = False
    prediction[feature] = prediction['Opponent'].apply(lambda x: create_weigted_mean_columns_pred(x, df, n_last_games, feature, bool))
    prediction[f'{feature} last3'] = prediction['Opponent'].apply(lambda x: create_weigted_mean_columns_pred_last3(x, df, feature, bool))


# Get the minus format

numerical_features = [
    'Ball possession', 'Total shots', 'Shots on target', 'Shots off target', 'Blocked shots', 'Corner kicks', 'Offsides', 'Fouls', 'Yellow cards',
    'Shots inside box', 'Shots outside box', 'Goalkeeper saves', 'Passes', 'Acc. passes', 'Duels won', 'Aerials won',
    'Hit woodwork', 'Red cards', 'Big chances', 'Big chances missed', 'Long balls', 'Crosses', 'Dribbles', 'Tackles',
    'Interceptions', 'Clearances', 'Acc. passes prop','Crosses prop', 'Long balls prop','Dribbles prop'
]

print(prediction)

for feature in numerical_features:
    prediction[feature] = prediction[f'{feature} home'] - prediction[f'{feature} ext']
    prediction[f'{feature} last3'] = prediction[f'{feature} home last3'] - prediction[f'{feature} ext last3']
    prediction.drop(columns=[f'{feature} home', f'{feature} ext'], inplace = True)

  prediction[f'{feature} last3'] = prediction['Opponent'].apply(lambda x: create_weigted_mean_columns_pred_last3(x, df, feature, bool))
  prediction[feature] = prediction['Opponent'].apply(lambda x: create_weigted_mean_columns_pred(x, df, n_last_games, feature, bool))


         Date  Home                Team                Opponent  Year  Month  \
0  2022-01-07     1            Bordeaux  Olympique de Marseille  2022      1   
2  2022-01-08     1           Lille OSC                 Lorient  2022      1   
4  2022-01-08     1             RC Lens           Stade Rennais  2022      1   
6  2022-01-09     1   Stade Brestois 29                OGC Nice  2022      1   
8  2022-01-09     1              Angers           Saint-Étienne  2022      1   
10 2022-01-09     1    Clermont Foot 63          Stade de Reims  2022      1   
12 2022-01-09     1                Metz              Strasbourg  2022      1   
14 2022-01-09     1         Montpellier                  Troyes  2022      1   
16 2022-01-09     1           FC Nantes               AS Monaco  2022      1   
18 2022-01-09     1  Olympique Lyonnais     Paris Saint-Germain  2022      1   

    Day  Weekday  Ball possession home  Ball possession home last3  ...  \
0     7        4              0.509242      

  prediction[feature] = prediction[f'{feature} home'] - prediction[f'{feature} ext']
  prediction[f'{feature} last3'] = prediction[f'{feature} home last3'] - prediction[f'{feature} ext last3']


In [15]:
# Predict the result
predictions = pipe_minus.predict(prediction)
# Present the result as a Data Frame with Date/Team and Opponent
prediction = prediction.reset_index().drop(columns=['index'])
pred_df = pd.DataFrame(prediction['Date'])
pred_df = pred_df.join(prediction['Team'])
pred_df = pred_df.join(prediction['Opponent'])
pred_df = pred_df.join(pd.DataFrame(predictions, columns=['Result encoded']))
pred_df

Unnamed: 0,Date,Team,Opponent,Result encoded
0,2022-01-07,Bordeaux,Olympique de Marseille,-1
1,2022-01-08,Lille OSC,Lorient,1
2,2022-01-08,RC Lens,Stade Rennais,-1
3,2022-01-09,Stade Brestois 29,OGC Nice,-1
4,2022-01-09,Angers,Saint-Étienne,1
5,2022-01-09,Clermont Foot 63,Stade de Reims,0
6,2022-01-09,Metz,Strasbourg,-1
7,2022-01-09,Montpellier,Troyes,0
8,2022-01-09,FC Nantes,AS Monaco,1
9,2022-01-09,Olympique Lyonnais,Paris Saint-Germain,1


<h2 style='color:yellow'>Get it on Excel</h2>

In [16]:
league_name = 'Ligue1'
day_number = 20
n = n_last_games

with pd.ExcelWriter('bet_excel.xlsx', mode='a', engine='openpyxl') as writer:
    pred_df.to_excel(writer, sheet_name=f'{league_name} - Journée {day_number} (n = {n})')