In [126]:
# All imports

import sys 
import os
import os.path
import time
import numpy as np
import csv
import datetime
import pandas as pd
import pickle
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# For the scrapper

import selenium
from selenium import webdriver
import io
import requests
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.support.ui import WebDriverWait
from datetime import date, timedelta
from selenium.webdriver.common.action_chains import ActionChains
import pickle

# Import data_cleaning notebooks 

import import_ipynb
import data_cleaning_sofa_L1
from data_cleaning_sofa_L1 import dfL1_largedim
from data_cleaning_sofa_PL import dfPL_smalldim, dfPL_largedim

# For Excel

import openpyxl

<h2 style='color:yellow'>Creating the compiled dataset</h2>
<p style='font-style:italic'>In order to predict the result of the game or the score difference (we will do both), we first need to have a dataset with both the stats of the home and away team.<p>

In [127]:
# We create a dictionnary with all the values home/away on a single row - Each row correspond to a football game

features = [
'Ball possession', 'Total shots', 'Shots on target', 'Shots off target', 'Blocked shots', 'Corner kicks', 'Offsides', 'Fouls', 'Yellow cards',
'Shots inside box', 'Shots outside box', 'Goalkeeper saves', 'Passes', 'Acc. passes', 'Duels won', 'Aerials won',
'Hit woodwork', 'Red cards', 'Big chances', 'Big chances missed', 'Long balls', 'Crosses', 'Dribbles', 'Tackles',
'Interceptions', 'Clearances', 'Acc. passes prop' ,'Crosses prop', 'Long balls prop','Dribbles prop', 
'Score difference', 'Result'
]

dic_largeX = {}

# Iterate on all rows
for index, row in dfL1_largedim.iterrows():
    if index < len(dfL1_largedim) - 1:
        # Each time the date is equal and the team equal opponent
        if (dfL1_largedim.loc[index, 'Team'] == dfL1_largedim.loc[index + 1, 'Opponent']) & (dfL1_largedim.loc[index, 'Date'] == dfL1_largedim.loc[index + 1, 'Date']):
            # Boucler sur toutes les valeurs communes pour les ajouter avec même nom '_opp'
            dic_largeX[index] = {}
            dic_largeX[index]['Team'] = dfL1_largedim.loc[index, 'Team']
            dic_largeX[index]['Opponent'] = dfL1_largedim.loc[index, 'Opponent']
            dic_largeX[index]['Date'] = dfL1_largedim.loc[index, 'Date']
            dic_largeX[index]['Home'] = dfL1_largedim.loc[index, 'Home']
            dic_largeX[index]['Result'] = dfL1_largedim.loc[index, 'Result']
            dic_largeX[index]['Score difference'] = dfL1_largedim.loc[index, 'Score difference']
            # The values which are different for home/away
            for feature in features:
                dic_largeX[index][f"{feature} ext"] = dfL1_largedim.loc[index + 1, feature]
                dic_largeX[index][f'{feature} home'] = dfL1_largedim.loc[index, feature]


# Create a data frame from the dictionnary
df = pd.DataFrame.from_dict(dic_largeX, orient='index')
df = df.reset_index().drop(columns=['index'])
df

Unnamed: 0,Team,Opponent,Date,Home,Result,Score difference,Ball possession ext,Ball possession home,Total shots ext,Total shots home,...,Crosses prop ext,Crosses prop home,Long balls prop ext,Long balls prop home,Dribbles prop ext,Dribbles prop home,Score difference ext,Score difference home,Result ext,Result home
0,Olympique de Marseille,Toulouse,2018-08-10,1,1,4.0,0.40,0.60,5.0,23.0,...,0.25,0.20,0.48,0.58,0.42,0.46,-4.0,4.0,-1,1
1,FC Nantes,AS Monaco,2018-08-11,1,-1,-2.0,0.34,0.66,11.0,16.0,...,0.29,0.26,0.49,0.58,0.75,0.35,2.0,-2.0,1,-1
2,Angers,Nîmes Olympique,2018-08-11,1,-1,-1.0,0.44,0.56,14.0,20.0,...,0.38,0.19,0.37,0.31,0.44,0.59,1.0,-1.0,1,-1
3,Lille OSC,Stade Rennais,2018-08-11,1,1,2.0,0.44,0.56,8.0,15.0,...,0.16,0.40,0.42,0.52,0.50,0.77,-2.0,2.0,-1,1
4,Montpellier,Dijon,2018-08-11,1,-1,-1.0,0.46,0.54,10.0,17.0,...,0.20,0.27,0.45,0.53,0.64,1.00,1.0,-1.0,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1107,Metz,Lorient,2021-12-12,1,1,3.0,0.67,0.33,18.0,4.0,...,0.15,0.33,0.59,0.28,0.64,0.20,-3.0,3.0,-1,1
1108,Stade Rennais,OGC Nice,2021-12-12,1,-1,-1.0,0.33,0.67,8.0,21.0,...,0.25,0.31,0.46,0.81,0.78,0.44,1.0,-1.0,1,-1
1109,Troyes,Bordeaux,2021-12-12,1,-1,-1.0,0.35,0.65,7.0,20.0,...,0.14,0.13,0.44,0.51,0.56,0.58,1.0,-1.0,1,-1
1110,Strasbourg,Olympique de Marseille,2021-12-12,1,-1,-2.0,0.51,0.49,8.0,11.0,...,0.43,0.17,0.49,0.61,0.63,0.62,2.0,-2.0,1,-1


<h2 style='color:yellow'> Preprocessing</h2>

In [128]:
# Distinguish numerical, categorical, date and other features for the prprocessing
numerical_features = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home', 'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]
categorical_features = ['Team', 'Opponent']
date_features = ['Day']
passthrough_features = ['Home']

In [129]:
# Encoding dates method

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, 'Year'] = X['Date'].dt.year
    X.loc[:, 'Month'] = X['Date'].dt.month
    X.loc[:, 'Day'] = X['Date'].dt.day
    X.loc[:, 'Weekday'] = X['Date'].dt.weekday

    # Finally we can drop the original columns from the dataframe
    return X

# Encode the X
df = _encode_dates(df)

In [130]:
 # Create the preprocessor

preprocessor = ColumnTransformer(
    [
        ("date", OrdinalEncoder(), date_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('numeric', StandardScaler(), numerical_features),
        ('period', 'passthrough', passthrough_features)
    ]
)

<h2 style='color:yellow'>Model - Result</h2>

In [131]:
# Train test split
X = df.drop(columns=['Result', 'Score difference', 'Result home', 'Score difference home', 'Result ext', 'Score difference ext'])
y = df['Result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [132]:
params = {
    'colsample_bytree': 0.9,
    'learning_rate': 0.01, 
    'max_depth': -1, 
    'min_child_samples': 111, 
    'min_child_weight': 1, 
    'n_estimators': 500, 
    'num_leaves': 32, 
    'reg_alpha': 0, 
    'reg_lambda': 20, 
    'subsample': 0.2
}

regressor = lgb.LGBMClassifier(**params, random_state=21)

pipe = make_pipeline(preprocessor, regressor)
result = pipe.fit(X_train, y_train)

print(f'Train set, Accuracy_score={accuracy_score(y_train, result.predict(X_train)):.2f}')
print(f'Test set, Accuracy_score={accuracy_score(y_test, result.predict(X_test)):.2f}')

Train set, Accuracy_score=0.79
Test set, Accuracy_score=0.68


<h2 style='color:yellow'>Model - Score difference</h2>

In [17]:
# Train test split
df = df[df['Score difference'].notna()]
X_sd = df.drop(columns=['Result', 'Score difference', 'Result home', 'Score difference home', 'Result ext', 'Score difference ext'])
y_sd = df['Score difference']

X_train_sd, X_test_sd, y_train_sd, y_test_sd = train_test_split(X_sd, y_sd, test_size=0.2, random_state=42)


In [18]:
params_sd = {
    'colsample_bytree': 0.9,
    'learning_rate': 0.01, 
    'max_depth': -1, 
    'min_child_samples': 111, 
    'min_child_weight': 1, 
    'n_estimators': 500, 
    'num_leaves': 32, 
    'reg_alpha': 0, 
    'reg_lambda': 20, 
    'subsample': 0.2
}

regressor_sd = lgb.LGBMClassifier(**params_sd, random_state=21)

pipe_sd = make_pipeline(preprocessor, regressor_sd)
result_sd = pipe_sd.fit(X_train_sd, y_train_sd)

print(f'Train set, Accuracy_score={accuracy_score(y_train_sd, result_sd.predict(X_train_sd)):.2f}')
print(f'Test set, Accuracy_score={accuracy_score(y_test_sd, result_sd.predict(X_test_sd)):.2f}')

Train set, Accuracy_score=0.58
Test set, Accuracy_score=0.37


<h2 style='color:yellow'>Get the predictions</h2>

In [133]:
# Variables pour le scrapper : Choisir la ligue

# Ligue : à changer selon besoin
league_str = "a[href='/tournament/football/france/ligue-1/34']"
# Les matchs (format : a>div>div.Cell-sc ...)
match_str = 'a>div>div.Cell-sc-t6h3ns-0'
# La date (format : div>div.classe)
date_str = 'div>div.ixemiF'
# Le nom des équipes (format : div.classe > div.Content-...)
team_names_str = 'div.bGpZoa>div.Content-sc-1morvta-0'
# Le score
score_str = 'div.jUEsho>div.Content-sc-1morvta-0'
# Le button stat(format : .classe)
buttons_str = ".iCdnqS"
# Les stats (format : .styles__StatisticsItemContent...)
stats_str = '.styles__StatisticsItemContent-sc-1imujgi-0'
# Croix (format : path[d:'XX X XXX ...'])
cross_str = "path[d='M4 4 L20 20 M4 20 L20 4']"

<h2 style='color:yellow'>Scraper for match you want to predict</h2>
<p style='font-style:italic'>Usually you should seek for the next day of the league<p>

In [21]:
## Peut on aussi entrainer le DF avec 5 derniers résultats ??? Pour pouvoir le mettre dans la filenmaeion ???


# Retake a scrapper
driver = webdriver.Chrome(ChromeDriverManager().install())

#Specify Search URL
search_url='https://www.sofascore.com/'


#Variables des sélecteurs 
filehandler = open ('predict.pickle', 'w') 

# Scrapper
def daterange(start_date, end_date,filehandler):
    with open('teams_stats_sofascore_L1.csv','wb') as f:
        dic_matchs = {}
        i = 0
        for n in range(int((end_date - start_date).days)):
            date = start_date + timedelta(n)
            search_url = f"https://www.sofascore.com/{date}"
            driver.get(search_url)
            ligue_tags = driver.find_elements_by_css_selector(league_str)
            if len(ligue_tags) < 2:
                driver.execute_script("window.scrollBy(0, arguments[0]);", 1000)
                ligue_tags = driver.find_elements_by_css_selector(league_str)
            if len(ligue_tags) >= 2:
                ligue_tag = ligue_tags[1].find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..')
                test_tag = ligue_tags[1].find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..')
                ligue_tag_style = ligue_tag.get_attribute("style")
                test_tag_style = ligue_tag.get_attribute("style")
                matchs=[]
                matchs_childs=[]
                while test_tag_style[:12] == ligue_tag_style[:12]:
                    matchs.append(test_tag)
                    #Si jamais le tag de Ligue 1 est trop bas
                    if test_tag.location['y'] > 1000 :
                        desired_y1 = (test_tag.size['height'] / 2) + test_tag.location['y']
                        current_y1 = (driver.execute_script('return window.innerHeight') / 2) + driver.execute_script('return window.pageYOffset')
                        scroll_y1_by = desired_y1 - current_y1
                        driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y1_by)
                    #time.sleep(2)
                    test_tag_style = test_tag.find_element_by_xpath("following-sibling::div").get_attribute('style')
                    test_tag = test_tag.find_element_by_xpath("following-sibling::div")
                del matchs[0]
                for match in matchs:
                    if match:
                        matchs_childs.append(match.find_element_by_css_selector(f"{match_str}"))
                for game in matchs_childs:
                    print
                    dic_matchs[i] = {}
                    dic_matchs[i+1] = {}
                    date = game.find_element_by_css_selector(f"{date_str}")
                    team_names = game.find_elements_by_css_selector(f"{team_names_str}")
                    score = game.find_elements_by_css_selector(f"{score_str}")
                    dic_matchs[i]['Date'] = date.get_attribute('innerHTML')
                    dic_matchs[i+1]['Date'] = date.get_attribute('innerHTML')
                    dic_matchs[i]['Home'] = 1
                    dic_matchs[i+1]['Home'] = 0
                    dic_matchs[i]['Team'] = team_names[0].text
                    dic_matchs[i]['Opponent'] = team_names[1].text
                    dic_matchs[i+1]['Team'] = team_names[1].text
                    dic_matchs[i+1]['Opponent'] = team_names[0].text
                    desired_y = (game.size['height'] / 2) + game.location['y']
                    current_y = (driver.execute_script('return window.innerHeight') / 2) + driver.execute_script('return window.pageYOffset')
                    scroll_y_by = desired_y - current_y
                    driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by)
                    game.click()
                    i = i + 2
                    with open('predict.pickle', 'wb') as handle:
                        pickle.dump(dic_matchs, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return dic_matchs


start_date = datetime.date(2021,12,21)
end_date = datetime.date(2021,12,23)

global_dic = {}
ligue_date = datetime.date(2021,12,22)

# Gestion des erreurs 
dic_matchs = daterange(start_date, end_date,filehandler)
with open('predict.pickle', 'rb') as handle:
    dic_match = pickle.load(handle)
    i = 0
    for key, value in dic_match.items():
        global_dic[len(global_dic) + i] = value
        i = i + 1
    last_date_raw = dic_match[len(dic_match)-1]['Date']
    last_date = last_date_raw[:2] + '-' + last_date_raw[3:5]+ '-' + last_date_raw[6:]
    last_date_2 = datetime.datetime.strptime(last_date, '%d-%m-%y').date()
    start_date = last_date_2




Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/charlesproye/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())
  ligue_tags = driver.find_elements_by_css_selector(league_str)
  ligue_tags = driver.find_elements_by_css_selector(league_str)


In [134]:
# Create prediction DataFrame
prediction = pd.DataFrame.from_dict(dic_matchs, orient='index')
# Only home game (to be discussed)
prediction = prediction[prediction['Home'] == 1]
# Encode the date
prediction['Date'] = prediction['Date'].apply(lambda x: datetime.datetime.strptime(x, "%d/%m/%y"))
prediction = _encode_dates(prediction)

# The data we do not have - Weighted mean on the n_last_games of the team
n_last_games = 5

numerical_features_home = [
    'Ball possession home', 'Total shots home', 'Shots on target home', 'Shots off target home', 'Blocked shots home', 'Corner kicks home', 'Offsides home', 'Fouls home', 'Yellow cards home',
    'Shots inside box home', 'Shots outside box home', 'Goalkeeper saves home', 'Passes home', 'Acc. passes home', 'Duels won home', 'Aerials won home',
    'Hit woodwork home', 'Red cards home', 'Big chances home', 'Big chances missed home', 'Long balls home', 'Crosses home', 'Dribbles home', 'Tackles home',
    'Interceptions home', 'Clearances home', 'Acc. passes prop home','Crosses prop home', 'Long balls prop home','Dribbles prop home'
]

numerical_features_ext = [
    'Ball possession ext', 'Total shots ext', 'Shots on target ext', 'Shots off target ext', 'Blocked shots ext', 'Corner kicks ext', 'Offsides ext', 'Fouls ext', 'Yellow cards ext',
    'Shots inside box ext', 'Shots outside box ext', 'Goalkeeper saves ext', 'Passes ext', 'Acc. passes ext', 'Duels won ext', 'Aerials won ext',
    'Hit woodwork ext', 'Red cards ext', 'Big chances ext', 'Big chances missed ext', 'Long balls ext', 'Crosses ext', 'Dribbles ext', 'Tackles ext',
    'Interceptions ext', 'Clearances ext', 'Acc. passes prop ext','Crosses prop ext', 'Long balls prop ext','Dribbles prop ext'
]

# Method to get the weighted average

# Donner plus d'importance au match récent et au match ayant le même statut (domicile extérieur)

def create_weigted_mean_columns_home(team, dataset, n, feature, bool):
    
    # Find the n last games of the team in the dataset or max last games you can if you have less than n (exemple : new club in the league)
    if dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0] < n:
        n = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].shape[0]
    data_team = dataset[(dataset['Team'] == team) | (dataset['Opponent'] == team)].iloc[-n:,:]
    
    # We add a columns both - it is a if statement to get the value of the team that we will use to compute the weighted average
    data_team[f'{feature} both'] = np.where(data_team['Team'] == team, data_team[feature], data_team[feature])
    
    # Can be discussed : Helps to ponderate to give additional importance to home or away game, depending if the game we want to predict is home or away
    # For now it is computed s.t the last match (if not of the same cat) and the last match of the same category has the same weight
    m = n / (n + 1)
    
    # Get the weighted average - Note that the weight are a if statement to valorize the category of match (away/home) according to what we want to predict
    # And that we use the index normalized to ponderate the importance of the date (the last match is more important thant the match two months ago)
    # If the team is home for the game we want to predict, we apply a m factor to the weight of the previous matchs played home
    if bool :
        data_team['Weights'] = np.where(data_team['Team'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    else:
        data_team['Weights'] = np.where(data_team['Opponent'] == team, m * (data_team['Team'].reset_index().index + 1), data_team['Team'].reset_index().index + 1)
    
    w_average = np.average(data_team[f'{feature} both'], weights=data_team['Weights']) 
    return w_average


for feature in numerical_features_home:
    bool = True
    prediction[feature] = prediction['Team'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))

for feature in numerical_features_ext:
    bool = False
    prediction[feature] = prediction['Opponent'].apply(lambda x: create_weigted_mean_columns_home(x, df, n_last_games, feature, bool))



# Même ordre que le training
prediction = prediction[X.columns]

In [135]:
predictions = result.predict(prediction)
prediction = prediction.reset_index().drop(columns=['index'])
pred_df = pd.DataFrame(prediction['Date'])
pred_df = pred_df.join(prediction['Team'])
pred_df = pred_df.join(prediction['Opponent'])
pred_df = pred_df.join(pd.DataFrame(predictions, columns=['Result encoded']))
pred_df

Unnamed: 0,Date,Team,Opponent,Result encoded
0,2021-12-22,AS Monaco,Stade Rennais,-1
1,2021-12-22,Bordeaux,Lille OSC,1
2,2021-12-22,Clermont Foot 63,Strasbourg,0
3,2021-12-22,Lorient,Paris Saint-Germain,-1
4,2021-12-22,Montpellier,Angers,-1
5,2021-12-22,OGC Nice,RC Lens,-1
6,2021-12-22,Olympique Lyonnais,Metz,1
7,2021-12-22,Olympique de Marseille,Stade de Reims,1
8,2021-12-22,Saint-Étienne,FC Nantes,1
9,2021-12-22,Troyes,Stade Brestois 29,-1


<h2 style='color:yellow'>Get it on Excel</h2>

In [125]:
league_name = 'Ligue1'
day_number = 19
n = n_last_games

with pd.ExcelWriter('bet_excel.xlsx', mode='a', engine='openpyxl') as writer:
    pred_df.to_excel(writer, sheet_name=f'{league_name} - Journée {day_number} (n = {n})')

### Idées et choses à faires:
- Pourrait on comparer la côte à la sécurité de la classification pour essayer de réaliser des arbitrages ? (récupérer cette valeur)
- Chercher à trouver le meilleur modèle et à le tuner
- Parier quand le modèle prévoit des grosses côtes