In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import poisson
from sklearn.metrics import f1_score
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

In [2]:
data=pd.read_csv(r'C:\Users\enriq\Apple\matches_pl_final.csv')
data.columns = [c.lower() for c in data.columns]

In [3]:
data['comp'].unique()

array(['Premier League', 'Europa Lg', 'EFL Cup', 'FA Cup',
       'Community Shield', 'Champions Lg', 'Conf Lg', 'Super Cup'],
      dtype=object)

In [4]:
data=data[data['comp']=='Premier League']

In [5]:
data=data.drop(['attendance','captain','formation','referee','match report','notes',],axis=1)
data

Unnamed: 0,unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,lost,blocks,sh,pass,int,tkl+int,clr,err,season,team
0,0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,...,16.0,17.0,6.0,11.0,9.0,38.0,24.0,0.0,2023,Arsenal
1,1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,...,3.0,9.0,1.0,8.0,8.0,16.0,12.0,0.0,2023,Arsenal
2,2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,...,7.0,11.0,0.0,11.0,10.0,24.0,20.0,1.0,2023,Arsenal
3,3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,...,4.0,17.0,2.0,15.0,7.0,22.0,5.0,1.0,2023,Arsenal
4,4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,...,5.0,4.0,0.0,4.0,8.0,18.0,13.0,0.0,2023,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1879,38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0,2,...,9.0,12.0,7.0,5.0,14.0,26.0,9.0,0.0,2022,NorwichCity
1880,39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0,4,...,9.0,16.0,4.0,12.0,9.0,21.0,20.0,1.0,2022,NorwichCity
1881,40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0,3,...,9.0,14.0,7.0,7.0,5.0,24.0,37.0,0.0,2022,NorwichCity
1882,41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1,1,...,19.0,8.0,5.0,3.0,9.0,33.0,23.0,0.0,2022,NorwichCity


In [6]:
#Loss or draw = 0, win=1
data["target"] = (data["result"] == "W").astype("int")
data["venue_code"] = data["venue"].astype("category").cat.codes
data["opp_code"] = data["opponent"].astype("category").cat.codes

In [7]:
data

Unnamed: 0,unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,pass,int,tkl+int,clr,err,season,team,target,venue_code,opp_code
0,0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,...,11.0,9.0,38.0,24.0,0.0,2023,Arsenal,1,0,7
1,1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,...,8.0,8.0,16.0,12.0,0.0,2023,Arsenal,1,1,11
2,2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,...,11.0,10.0,24.0,20.0,1.0,2023,Arsenal,1,0,2
3,3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,...,15.0,7.0,22.0,5.0,1.0,2023,Arsenal,1,1,9
4,4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,...,4.0,8.0,18.0,13.0,0.0,2023,Arsenal,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1879,38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0,2,...,5.0,14.0,26.0,9.0,0.0,2022,NorwichCity,0,0,1
1880,39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0,4,...,12.0,9.0,21.0,20.0,1.0,2022,NorwichCity,0,1,21
1881,40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0,3,...,7.0,5.0,24.0,37.0,0.0,2022,NorwichCity,0,0,11
1882,41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1,1,...,3.0,9.0,33.0,23.0,0.0,2022,NorwichCity,0,0,22


In [8]:
#Data is available up to week 27
data[(data['round']=='Matchweek 27') &  (data['season']==2023)]

Unnamed: 0,unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,pass,int,tkl+int,clr,err,season,team,target,venue_code,opp_code
36,36,2023-03-12,14:00,Premier League,Matchweek 27,Sun,Away,,,,...,,,,,,2023,Arsenal,0,0,9
89,40,2023-03-11,17:30,Premier League,Matchweek 27,Sat,Away,,,,...,,,,,,2023,ManchesterCity,0,0,7
146,43,2023-03-12,14:00,Premier League,Matchweek 27,Sun,Home,,,,...,,,,,,2023,ManchesterUnited,0,1,18
199,38,2023-03-11,15:00,Premier League,Matchweek 27,Sat,Home,,,,...,,,,,,2023,TottenhamHotspur,0,1,17
249,38,2023-03-11,12:30,Premier League,Matchweek 27,Sat,Away,,,,...,,,,,,2023,Liverpool,0,0,2
296,33,2023-03-12,16:30,Premier League,Matchweek 27,Sun,Home,,,,...,,,,,,2023,NewcastleUnited,0,1,22
340,31,2023-03-12,14:00,Premier League,Matchweek 27,Sun,Home,,,,...,,,,,,2023,Fulham,0,1,0
383,30,2023-03-11,15:00,Premier League,Matchweek 27,Sat,Away,,,,...,,,,,,2023,BrightonandHoveAlbion,0,0,10
425,27,2023-03-11,15:00,Premier League,Matchweek 27,Sat,Away,,,,...,,,,,,2023,Brentford,0,0,8
474,35,2023-03-11,15:00,Premier League,Matchweek 27,Sat,Away,,,,...,,,,,,2023,Chelsea,0,0,11


In [9]:
#Select the data up to the week before the week you want to predict
dataf=data[data['date']<'2023-03-13']

In [10]:
dataf["g/sot"] = dataf["g/sot"].fillna(dataf["g/sot"].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataf["g/sot"] = dataf["g/sot"].fillna(dataf["g/sot"].mean())


In [11]:
dataf.columns

Index(['unnamed: 0', 'date', 'time', 'comp', 'round', 'day', 'venue', 'result',
       'gf', 'ga', 'opponent', 'xg_x', 'xga', 'poss', 'gls', 'sh_x', 'sot',
       'sot%', 'g/sh', 'g/sot', 'dist', 'fk', 'pk', 'pkatt', 'xg_y', 'npxg',
       'npxg/sh', 'g-xg', 'np:g-xg', 'cmp', 'cmp.1', 'cmp.2', 'cmp.3', 'att_x',
       'att_x.1', 'att_x.2', 'att_x.3', 'cmp%', 'cmp%.1', 'cmp%.2', 'cmp%.3',
       'totdist', 'prgdist', 'ast', 'xag', 'xa', 'kp', '1/3', 'ppa', 'crspa',
       'prgp', 'sca', 'passlive', 'passlive.1', 'passdead', 'passdead.1', 'to',
       'to.1', 'fld', 'fld.1', 'def', 'def.1', 'gca', 'passlive.2',
       'passlive.3', 'passdead.2', 'passdead.3', 'to.2', 'to.3', 'sh_y',
       'sh_y.1', 'fld.2', 'fld.3', 'def.2', 'def.3', 'tkl', 'tkl.1', 'tklw',
       'def 3rd', 'mid 3rd', 'att 3rd', 'tkl.2', 'tkl.3', 'att_y', 'tkl%',
       'lost', 'blocks', 'sh', 'pass', 'int', 'tkl+int', 'clr', 'err',
       'season', 'team', 'target', 'venue_code', 'opp_code'],
      dtype='object')

In [12]:
dataf.shape

(1284, 98)

eliminate_columns=['date', 'time', 'comp', 'round', 'day','venue','season','unnamed: 0','team']
columns_data_ridge=dataf.drop(eliminate_columns,axis=1).columns

data_ridge=dataf[['unnamed: 0','date','team']+list(columns_data_ridge)]
data_ridge

In [13]:
eliminate_columns=['date', 'time', 'comp', 'round', 'day','venue','season','unnamed: 0','team','gf','ga','opponent','result']
data_ridge=dataf.drop(eliminate_columns,axis=1)


In [14]:
def rolling(df,cols,new_cols):
    df=df.sort_values('date')
    rolling_stats=df[cols].rolling(3,closed='left').mean()
    df[new_cols] = rolling_stats
    df = df.dropna(subset=new_cols)
    return df
    

In [15]:
#cols=[ "gf", "ga","sot%","g/sot","xg_x","xga","poss",'prgp','g/sh','dist','cmp','att_x','int','clr','lost']

cols=data_ridge.columns

new_cols = [f"{c}_rolling" for c in cols]


In [16]:
dataf[cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1284 entries, 0 to 1883
Data columns (total 85 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   xg_x        1262 non-null   float64
 1   xga         1262 non-null   float64
 2   poss        1262 non-null   float64
 3   gls         1262 non-null   float64
 4   sh_x        1262 non-null   float64
 5   sot         1262 non-null   float64
 6   sot%        1262 non-null   float64
 7   g/sh        1262 non-null   float64
 8   g/sot       1284 non-null   float64
 9   dist        1262 non-null   float64
 10  fk          1262 non-null   float64
 11  pk          1262 non-null   float64
 12  pkatt       1262 non-null   float64
 13  xg_y        1262 non-null   float64
 14  npxg        1262 non-null   float64
 15  npxg/sh     1262 non-null   float64
 16  g-xg        1262 non-null   float64
 17  np:g-xg     1262 non-null   float64
 18  cmp         1262 non-null   float64
 19  cmp.1       1262 non-null  

camibar dataf por data

In [17]:
matches_rolling = dataf.groupby("team").apply(lambda x: rolling(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')

In [18]:
matches_rolling.shape

(1211, 183)

In [19]:
rr = RidgeClassifier(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=15, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

In [20]:
matches_rolling.columns

Index(['unnamed: 0', 'date', 'time', 'comp', 'round', 'day', 'venue', 'result',
       'gf', 'ga',
       ...
       'blocks_rolling', 'sh_rolling', 'pass_rolling', 'int_rolling',
       'tkl+int_rolling', 'clr_rolling', 'err_rolling', 'target_rolling',
       'venue_code_rolling', 'opp_code_rolling'],
      dtype='object', length=183)

In [21]:
removed_columns=['unnamed: 0','date', 'time', 'comp', 'round', 'day','venue','season','unnamed: 0','team','gf','ga','opponent','result','g/sot', 'g/sh_rolling', 'g/sot_rolling', 'pk_rolling', 'pkatt_rolling', 'npxg/sh_rolling', 'fld.1_rolling', 'def.1_rolling', 'fld.3_rolling', 'def.3_rolling', 'target_rolling', 'venue_code_rolling','target']
selected_columns = matches_rolling.columns[~matches_rolling.columns.isin(removed_columns)]
selected_columns

Index(['xg_x', 'xga', 'poss', 'gls', 'sh_x', 'sot', 'sot%', 'g/sh', 'dist',
       'fk',
       ...
       'tkl%_rolling', 'lost_rolling', 'blocks_rolling', 'sh_rolling',
       'pass_rolling', 'int_rolling', 'tkl+int_rolling', 'clr_rolling',
       'err_rolling', 'opp_code_rolling'],
      dtype='object', length=157)

In [22]:
# Select the numeric columns
num_cols = matches_rolling.select_dtypes(include=['float64', 'int64'])

# Filter columns that have values ​​between 0 and 1
cols_filt = (num_cols >= 0) & (num_cols <= 1)
cols_filt = cols_filt.all()

# Get the names of the columns that meet the condition
cols_names = cols_filt[cols_filt == True].index.tolist()

print(cols_names)

['g/sot', 'g/sh_rolling', 'g/sot_rolling', 'pk_rolling', 'pkatt_rolling', 'npxg/sh_rolling', 'fld.1_rolling', 'def.1_rolling', 'fld.3_rolling', 'def.3_rolling', 'target_rolling', 'venue_code_rolling']


In [23]:
#Standardization
scaler = MinMaxScaler()
matches_rolling[selected_columns] = scaler.fit_transform(matches_rolling[selected_columns])

In [25]:
matches_rolling=matches_rolling.dropna()

In [26]:
sfs.fit(matches_rolling[selected_columns],matches_rolling['target'] )

In [27]:
matches_rolling.dropna().shape

(1191, 183)

In [28]:
predictors=list(selected_columns[sfs.get_support()])
predictors

['xga',
 'gls',
 'g/sh',
 'crspa',
 'passdead',
 'def.1',
 'passdead.2',
 'def.3',
 'att 3rd',
 'clr',
 '1/3_rolling',
 'crspa_rolling',
 'to_rolling',
 'to.2_rolling',
 'err_rolling']

xGA: Expected Goals Against. It's a metric that estimates the number of goals a team should have conceded based on the quality of the scoring opportunities they have faced.

GLS: Goals. It represents the number of goals scored by a team or player.

G/Sh: Goals per Shot. It's a metric that calculates the efficiency of a player or team in converting shots into goals.

CrSpA: Crosses into the Area. It represents the number of times a team or player has attempted to send a cross into the penalty area from the flanks of the field.

PassDead: Dead Passes. It's the number of passes that don't reach their intended target and go out of play.

Def.1, PassDead.2, Def.3: These are likely defensive metrics that track various defensive actions, such as clearances, interceptions, tackles, and blocks.

Att 3rd: Passes into the Attacking Third. It represents the number of passes a team or player has made into the attacking third of the field.

Clr: Clearances. It's the number of times a player or team has successfully cleared the ball away from their own penalty area.

1/3_rolling, CrSpA_rolling, TO_rolling, TO.2_rolling, Err_rolling: Rolling averages of the above-mentioned metrics, calculated over a specific number of matches.

In [29]:
train = matches_rolling[matches_rolling["date"] < '2023-01-01']
test = matches_rolling[matches_rolling["date"] > '2023-01-01']

In [30]:
matches_rolling

Unnamed: 0,unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,blocks_rolling,sh_rolling,pass_rolling,int_rolling,tkl+int_rolling,clr_rolling,err_rolling,target_rolling,venue_code_rolling,opp_code_rolling
1169,4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1,0,...,0.603448,0.769231,0.414634,0.319149,0.267327,0.396226,0.166667,0.000000,0.333333,0.267857
1170,5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1,0,...,0.689655,1.000000,0.390244,0.234043,0.297030,0.424528,0.166667,0.333333,0.666667,0.500000
1172,7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3,1,...,0.534483,0.769231,0.317073,0.340426,0.326733,0.679245,0.333333,0.666667,0.333333,0.482143
1173,8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0,0,...,0.500000,0.461538,0.463415,0.297872,0.435644,0.584906,0.166667,1.000000,0.666667,0.589286
1174,9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2,2,...,0.500000,0.500000,0.439024,0.468085,0.435644,0.603774,0.166667,0.666667,0.333333,0.375000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596,27,2023-02-11,15:00,Premier League,Matchweek 23,Sat,Away,W,2,1,...,0.741379,0.846154,0.560976,0.191489,0.287129,0.792453,0.500000,0.666667,0.666667,0.696429
597,28,2023-02-18,15:00,Premier League,Matchweek 24,Sat,Home,L,0,1,...,0.637931,0.730769,0.487805,0.234043,0.366337,0.745283,0.500000,0.666667,0.333333,0.642857
598,29,2023-02-24,20:00,Premier League,Matchweek 25,Fri,Away,D,1,1,...,0.551724,0.769231,0.341463,0.042553,0.306931,0.679245,0.333333,0.666667,0.666667,0.446429
599,30,2023-03-01,20:00,Premier League,Matchweek 7,Wed,Away,L,0,2,...,0.379310,0.423077,0.317073,0.170213,0.386139,0.462264,0.166667,0.333333,0.333333,0.392857


Random Forest Model

In [31]:
model = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
train = matches_rolling[matches_rolling["date"] < '2023-01-01']
test = matches_rolling[(matches_rolling["date"] > '2023-01-01')& (matches_rolling["date"]<'2023-03-11')].dropna()

In [32]:
model.fit(train[predictors],train['target'])

In [33]:
predictors

['xga',
 'gls',
 'g/sh',
 'crspa',
 'passdead',
 'def.1',
 'passdead.2',
 'def.3',
 'att 3rd',
 'clr',
 '1/3_rolling',
 'crspa_rolling',
 'to_rolling',
 'to.2_rolling',
 'err_rolling']

In [34]:
preds=model.predict(test[predictors])

In [36]:
f1 = f1_score(test["target"], preds, average='binary')
f1

0.6909090909090909