In [417]:
import os
import os.path
import selenium
from selenium import webdriver
import time
import io
import requests
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import ElementClickInterceptedException
import numpy as np
import csv
from selenium.webdriver.support.ui import WebDriverWait
import datetime
from datetime import date, timedelta
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import pickle

Remark : *All df_smalldim references are kept in comment as it concerned the data before 2016 (less data), it would be useful if we want to use  the set of data from 2016 to 2018.*

In [418]:
# Load the CSV
dfL1 = pd.read_csv('./sofascore_L1_20182021.csv')
# Check NAN
print(dfL1.isna().sum())
# Drop duplicates columns due to Selenium error (stuck at a day, do this day again)
dfL1 = dfL1.drop_duplicates().reset_index()

Date                     0
Home                     0
Team                     0
Opponent                 0
Score team              54
Score opponent          54
Ball possession         54
Total shots             56
Shots on target         56
Shots off target        56
Blocked shots           56
Corner kicks            56
Offsides               232
Fouls                   56
Yellow cards           126
Big chances            144
Big chances missed     408
Shots inside box        56
Shots outside box       58
Goalkeeper saves        65
Passes                  54
Acc. passes             54
Long balls              54
Crosses                 56
Dribbles                56
Possession lost         54
Duels won               54
Aerials won             56
Tackles                 54
Interceptions           56
Clearances              56
Counter attacks       1219
Red cards             2009
Hit woodwork          1401
dtype: int64


In [419]:
# Drop useless columns (too small number of data)
dfL1.drop(columns=['Counter attacks', 'Possession lost'], inplace=True)

In [420]:
# Transform Date into date time
dfL1['Date'] = dfL1['Date'].apply(lambda x: datetime.datetime.strptime(x, "%d/%m/%y"))

In [421]:
# Change red yellow cards and offsides
dfL1['Red cards'] = dfL1['Red cards'].apply(lambda x: 0 if np.isnan(x) else x)
dfL1['Yellow cards'] = dfL1['Yellow cards'].apply(lambda x: 0 if np.isnan(x) else x)
dfL1['Offsides'] = dfL1['Offsides'].apply(lambda x: 0 if np.isnan(x) else x)
dfL1['Hit woodwork'] = dfL1['Hit woodwork'].apply(lambda x: 0 if np.isnan(x) else x)
dfL1['Goalkeeper saves'] = dfL1['Goalkeeper saves'].apply(lambda x: 0 if np.isnan(x) else x)
dfL1['Shots outside box'] = dfL1['Shots outside box'].apply(lambda x: 0 if np.isnan(x) else x)


In [422]:
# Separate DF with old and new data
#dfL1_smalldim = dfL1[np.isnan(dfL1['Clearances'])].reset_index()
dfL1_largedim = dfL1[~np.isnan(dfL1['Clearances'])].reset_index()
# Small dim get rid of useless colums
#dfL1_smalldim.drop(columns=['Long balls', 'Crosses', 'Dribbles', 'Tackles', 'Interceptions', 'Clearances', 'Big chances', 'Big chances missed'], inplace=True)

In [423]:
# L1 large_dim change NaN to zero (Big Chances, Big Chances missed)
dfL1_largedim['Big chances'] = dfL1_largedim['Big chances'].apply(lambda x: 0 if np.isnan(x) else x)
dfL1_largedim['Big chances missed'] = dfL1_largedim['Big chances missed'].apply(lambda x: 0 if np.isnan(x) else x)

In [424]:
# Get rid of the postponed / canceled games
#dfL1_smalldim = dfL1_smalldim[dfL1_smalldim['Score opponent'].notna()]
#dfL1_smalldim = dfL1_smalldim[dfL1_smalldim['Ball possession'].notna()]
#dfL1_smalldim = dfL1_smalldim[dfL1_smalldim['Passes'].notna()]

In [425]:
# Two useless columns
#dfL1_smalldim.drop(columns=['level_0', 'index'], inplace=True)
dfL1_largedim.drop(columns=['level_0', 'index'], inplace=True)

In [426]:
# Visualise the types
#print(dfL1_smalldim.dtypes)
dfL1_largedim.dtypes

Date                  datetime64[ns]
Home                           int64
Team                          object
Opponent                      object
Score team                   float64
Score opponent               float64
Ball possession               object
Total shots                  float64
Shots on target              float64
Shots off target             float64
Blocked shots                float64
Corner kicks                 float64
Offsides                     float64
Fouls                        float64
Yellow cards                 float64
Big chances                  float64
Big chances missed           float64
Shots inside box             float64
Shots outside box            float64
Goalkeeper saves             float64
Passes                       float64
Acc. passes                   object
Long balls                    object
Crosses                       object
Dribbles                      object
Duels won                    float64
Aerials won                  float64
T

In [427]:
# Functions helping transform the values (used in next cell)

# For Acc. passes (small and large)
def spliter_cleaner(x):
    x = str(x)
    x_list = x.replace('(','').replace(')','').replace('%','').split(' ')
    return int(x_list[0]), int(x_list[1])/100

# For Long Balls, crosses and dribbles
def spliter_cleaner_large(x):
    x = str(x)
    abs = x.replace('(','').replace(')','').replace('%','').split('/')[0]
    prop = x.replace('(','').replace(')','').replace('%','').split('/')[1].split(' ')[1]
    return int(abs), int(prop)/100

# For result
def result(x):
    if x > 0:
        x = 1
    elif x == 0 :
        x = 0
    else:
        x = -1
    return x

In [428]:
dfL1_largedim.isna().sum()

Date                  0
Home                  0
Team                  0
Opponent              0
Score team            2
Score opponent        2
Ball possession       0
Total shots           0
Shots on target       0
Shots off target      0
Blocked shots         0
Corner kicks          0
Offsides              0
Fouls                 0
Yellow cards          0
Big chances           0
Big chances missed    0
Shots inside box      0
Shots outside box     0
Goalkeeper saves      0
Passes                0
Acc. passes           0
Long balls            0
Crosses               0
Dribbles              0
Duels won             0
Aerials won           0
Tackles               0
Interceptions         0
Clearances            0
Red cards             0
Hit woodwork          0
dtype: int64

In [429]:
# Deal with the types of values
#dfL1_smalldim['Team'] = dfL1_smalldim['Team'].apply(lambda x: str(x))
#dfL1_smalldim['Opponent'] = dfL1_smalldim['Opponent'].apply(lambda x: str(x))
#dfL1_smalldim['Ball possession'] = dfL1_smalldim['Ball possession'].apply(lambda x: float(str(x).replace('%',''))/100)
dfL1_largedim['Ball possession'] = dfL1_largedim['Ball possession'].apply(lambda x: float(str(x).replace('%',''))/100)
#dfL1_smalldim['Acc. passes'], dfL1_smalldim['Acc. passes prop'] = \
    #zip(*dfL1_smalldim['Acc. passes'].map(spliter_cleaner))
dfL1_largedim['Acc. passes'], dfL1_largedim['Acc. passes prop'] = \
    zip(*dfL1_largedim['Acc. passes'].map(spliter_cleaner))
#dfL1_smalldim['Score difference'] = dfL1_smalldim['Score team'] - dfL1_smalldim['Score opponent']
dfL1_largedim['Score difference'] = dfL1_largedim['Score team'] - dfL1_largedim['Score opponent']
#dfL1_smalldim['Result'] = dfL1_smalldim['Score difference'].apply(lambda x: result(x))
dfL1_largedim['Result'] = dfL1_largedim['Score difference'].apply(lambda x: result(x))
dfL1_largedim['Crosses'], dfL1_largedim['Crosses prop'] = \
    zip(*dfL1_largedim['Crosses'].map(spliter_cleaner_large))
dfL1_largedim['Long balls'], dfL1_largedim['Long balls prop'] = \
    zip(*dfL1_largedim['Long balls'].map(spliter_cleaner_large))
dfL1_largedim['Dribbles'], dfL1_largedim['Dribbles prop'] = \
    zip(*dfL1_largedim['Dribbles'].map(spliter_cleaner_large))

In [430]:
# Final shapes
#dfL1_smalldim = dfL1_smalldim.reset_index()
dfL1_largedim = dfL1_largedim.reset_index()
#dfL1_smalldim = dfL1_smalldim.drop(columns=['index'])
dfL1_largedim = dfL1_largedim.drop(columns=['index'])
#print(dfL1_smalldim.shape)
print(dfL1_largedim.shape)

(2349, 38)


In [431]:
dfL1_largedim

Unnamed: 0,Date,Home,Team,Opponent,Score team,Score opponent,Ball possession,Total shots,Shots on target,Shots off target,...,Interceptions,Clearances,Red cards,Hit woodwork,Acc. passes prop,Score difference,Result,Crosses prop,Long balls prop,Dribbles prop
0,2018-08-10,1,Olympique de Marseille,Toulouse,4.0,0.0,0.60,23.0,10.0,9.0,...,7.0,13.0,0.0,0.0,0.85,4.0,1,0.20,0.58,0.46
1,2018-08-10,0,Toulouse,Olympique de Marseille,0.0,4.0,0.40,5.0,1.0,4.0,...,15.0,20.0,0.0,0.0,0.79,-4.0,-1,0.25,0.48,0.42
2,2018-08-11,1,FC Nantes,AS Monaco,1.0,3.0,0.66,16.0,4.0,10.0,...,18.0,14.0,0.0,0.0,0.82,-2.0,-1,0.26,0.58,0.35
3,2018-08-11,0,AS Monaco,FC Nantes,3.0,1.0,0.34,11.0,6.0,4.0,...,10.0,31.0,0.0,0.0,0.65,2.0,1,0.29,0.49,0.75
4,2018-08-11,1,Angers,Nîmes Olympique,3.0,4.0,0.56,20.0,7.0,7.0,...,3.0,15.0,0.0,1.0,0.77,-1.0,-1,0.19,0.31,0.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2344,2021-12-12,0,Bordeaux,Troyes,2.0,1.0,0.35,7.0,4.0,1.0,...,13.0,36.0,0.0,0.0,0.71,1.0,1,0.14,0.44,0.56
2345,2021-12-12,1,Strasbourg,Olympique de Marseille,0.0,2.0,0.49,11.0,3.0,6.0,...,7.0,9.0,0.0,0.0,0.83,-2.0,-1,0.17,0.61,0.62
2346,2021-12-12,0,Olympique de Marseille,Strasbourg,2.0,0.0,0.51,8.0,3.0,4.0,...,9.0,25.0,0.0,0.0,0.83,2.0,1,0.43,0.49,0.63
2347,2021-12-12,1,Paris Saint-Germain,AS Monaco,2.0,0.0,0.53,9.0,2.0,4.0,...,16.0,12.0,0.0,1.0,0.88,2.0,1,0.33,0.53,0.64
