# Championship 2023-2024 Predictions

In [1]:
# import data
import pandas as pd
import numpy as np

#df = pd.read_csv('F2.csv')
#df = pd.read_csv('ligue_2.csv')
df = pd.read_csv('scrapers/englandm/E0.csv')

#df = pd.read_csv('f2_new.csv')
df.tail()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
177,B1,30/01/2024,19:30,Westerlo,Cercle Brugge,4,2,H,4,1,...,2.05,0.5,1.73,2.08,1.76,2.15,1.84,2.15,1.75,2.08
178,B1,31/01/2024,17:45,Charleroi,Eupen,1,0,H,0,0,...,2.08,-0.5,1.8,2.05,1.81,2.08,1.85,2.12,1.8,2.03
179,B1,31/01/2024,17:45,Oud-Heverlee Leuven,Genk,2,1,H,0,0,...,2.39,0.75,1.85,2.0,1.86,2.03,2.04,2.07,1.89,1.92
180,B1,31/01/2024,20:00,St. Gilloise,RWD Molenbeek,3,2,H,2,0,...,2.9,-2.25,1.93,1.93,1.93,1.93,2.01,1.96,1.93,1.88
181,B1,31/01/2024,20:00,Standard,Antwerp,0,1,A,0,0,...,1.73,0.25,1.8,2.05,1.83,2.08,1.9,2.13,1.82,2.0


In [2]:
## Data Cleaning

In [3]:
# steps:
# 1. Check for missing values
missing_values = df.isnull().sum()

# 2. Check for duplicates
duplicate_rows = df.duplicated().sum()

# Display the results of the checks
missing_values, duplicate_rows

(Div         0
 Date        0
 Time        0
 HomeTeam    0
 AwayTeam    0
            ..
 PCAHA       0
 MaxCAHH     0
 MaxCAHA     0
 AvgCAHH     0
 AvgCAHA     0
 Length: 105, dtype: int64,
 0)

In [4]:
# Checking for missing values
missing_values = df.isnull().sum()




# Checking the balance of the target variable 'FTR'
target_distribution = df['FTR'].value_counts()

# drop the div column
df.drop('Div', axis=1, inplace=True)
df.drop('Time', axis=1, inplace=True)

missing_values, target_distribution


(Div         0
 Date        0
 Time        0
 HomeTeam    0
 AwayTeam    0
            ..
 PCAHA       0
 MaxCAHH     0
 MaxCAHA     0
 AvgCAHH     0
 AvgCAHA     0
 Length: 105, dtype: int64,
 FTR
 H    81
 D    53
 A    48
 Name: count, dtype: int64)

In [5]:
# covert date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

## Feature engineering

### steps:
* Encode Team Names: Use label encoding for 'HomeTeam' and 'AwayTeam'. This will convert team names into numeric values, making them usable for the model.

* Recent Form: Calculate the recent form for each team based on the last 5 matches. We'll use the 'FTR' column to determine wins (W), losses (L), and draws (D). This feature will provide insight into the current performance of the teams.

* Average Goals per Game: Compute the average goals scored per game for both home and away teams. This feature helps understand the offensive strength of the teams.

* Team Points: Calculate the total points accumulated by each team so far in the season. Points are awarded based on wins (3 points), draws (1 point), and losses (0 points).

* Head-to-Head Statistics: Analyze the outcomes of matches between the same pairs of teams earlier in the season.

* Other Statistical Features: Depending on the data available, we can include additional features like average possession, number of shots on target, defensive strength, etc.

In [6]:
# Reset the index of the DataFrame (if necessary)
df.reset_index(drop=True, inplace=True)

# Rolling averages for goals
window_sizes = [3, 5]
for window in window_sizes:
    df[f'HomeGoalsScoredAvg_{window}'] = df.groupby('HomeTeam')['FTHG'].transform(lambda x: x.rolling(window, min_periods=1).mean())
    df[f'AwayGoalsScoredAvg_{window}'] = df.groupby('AwayTeam')['FTAG'].transform(lambda x: x.rolling(window, min_periods=1).mean())
    df[f'HomeGoalsConcededAvg_{window}'] = df.groupby('HomeTeam')['FTAG'].transform(lambda x: x.rolling(window, min_periods=1).mean())
    df[f'AwayGoalsConcededAvg_{window}'] = df.groupby('AwayTeam')['FTHG'].transform(lambda x: x.rolling(window, min_periods=1).mean())



In [7]:

# Variance in team performance
df['HomeGoalsScoredVariance'] = df.groupby('HomeTeam')['FTHG'].transform(lambda x: x.rolling(5, min_periods=1).var())
df['AwayGoalsScoredVariance'] = df.groupby('AwayTeam')['FTAG'].transform(lambda x: x.rolling(5, min_periods=1).var())




In [8]:


from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Combine all unique team names from both HomeTeam and AwayTeam columns
all_teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()

# Fit the LabelEncoder with all unique team names
label_encoder.fit(all_teams)

# Transform both HomeTeam and AwayTeam using the fitted LabelEncoder
df['HomeTeam_encoded'] = label_encoder.transform(df['HomeTeam'])
df['AwayTeam_encoded'] = label_encoder.transform(df['AwayTeam'])

# # Now you can transform individual team names
# home_team_encoded = label_encoder.transform(['St Etienne'])[0]
# away_team_encoded = label_encoder.transform(['Laval'])[0]

# # Debugging: Print encoded values
# print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

print(label_encoder.classes_)


['Anderlecht' 'Antwerp' 'Cercle Brugge' 'Charleroi' 'Club Brugge' 'Eupen'
 'Genk' 'Gent' 'Kortrijk' 'Mechelen' 'Oud-Heverlee Leuven' 'RWD Molenbeek'
 'St Truiden' 'St. Gilloise' 'Standard' 'Westerlo']


In [9]:
df.tail()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,HomeGoalsConcededAvg_3,AwayGoalsConcededAvg_3,HomeGoalsScoredAvg_5,AwayGoalsScoredAvg_5,HomeGoalsConcededAvg_5,AwayGoalsConcededAvg_5,HomeGoalsScoredVariance,AwayGoalsScoredVariance,HomeTeam_encoded,AwayTeam_encoded
177,2024-01-30,Westerlo,Cercle Brugge,4,2,H,4,1,H,9,...,0.666667,2.333333,2.6,1.2,1.6,2.0,1.3,0.7,15,2
178,2024-01-31,Charleroi,Eupen,1,0,H,0,0,D,12,...,1.666667,1.333333,1.8,0.2,2.0,1.6,1.2,0.2,3,5
179,2024-01-31,Oud-Heverlee Leuven,Genk,2,1,H,0,0,D,12,...,0.666667,1.666667,1.4,1.2,1.0,1.4,1.3,1.2,10,6
180,2024-01-31,St. Gilloise,RWD Molenbeek,3,2,H,2,0,H,11,...,1.0,3.0,2.2,0.8,0.8,3.0,0.7,0.7,13,11
181,2024-01-31,Standard,Antwerp,0,1,A,0,0,D,15,...,1.0,1.333333,0.4,1.0,0.6,1.2,0.3,1.5,14,1


In [10]:
# Encoding the target variable 'FTR'
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable using label_encoder_y
df['FTR_encoded'] = label_encoder.fit_transform(df['FTR'])




In [11]:

# Average points per game
df['HomeTeamPoints'] = df.groupby('HomeTeam')['FTR_encoded'].transform(lambda x: x.expanding().sum().shift(1)) / df.groupby('HomeTeam')['FTR_encoded'].transform(lambda x: x.expanding().count().shift(1))
df['AwayTeamPoints'] = df.groupby('AwayTeam')['FTR_encoded'].transform(lambda x: x.expanding().sum().shift(1)) / df.groupby('AwayTeam')['FTR_encoded'].transform(lambda x: x.expanding().count().shift(1))



In [12]:
# Average goals per game
df['HomeTeamAvgGoals'] = df.groupby('HomeTeam')['FTHG'].transform(lambda x: x.expanding().mean().shift(1)) # Average goals scored by the home team
df['AwayTeamAvgGoals'] = df.groupby('AwayTeam')['FTAG'].transform(lambda x: x.expanding().mean().shift(1)) # Average goals scored by the away team



In [13]:
# recent form
def calculate_form_points(team, df):
    # Get the last 5 matches of the home and away teams
    home_matches = df[df['HomeTeam'] == team].tail(5)
    away_matches = df[df['AwayTeam'] == team].tail(5)

    # Calculate the points obtained in the last 5 matches
    home_points = home_matches['FTR_encoded'].sum()
    away_points = away_matches['FTR_encoded'].sum()

    # Calculate the average points obtained
    home_avg_points = home_points / 15
    away_avg_points = away_points / 15

    # Return the average points
    return home_avg_points, away_avg_points

# Calculate the average points obtained by each team in the last 5 matches
df['HomeTeamRecentForm'], df['AwayTeamRecentForm'] = zip(*df['HomeTeam'].apply(lambda x: calculate_form_points(x, df)))



## League Standings

In [44]:
# Assuming df is your DataFrame with the match data
df['HomeWin'] = df['FTHG'] > df['FTAG']
df['AwayWin'] = df['FTAG'] > df['FTHG']
df['Draw'] = df['FTHG'] == df['FTAG']

# Assign points based on outcomes
df['HomePoints'] = df['HomeWin'] * 3 + df['Draw'] * 1
df['AwayPoints'] = df['AwayWin'] * 3 + df['Draw'] * 1

# Combine home and away games into a single DataFrame for point calculation
points_df = pd.concat([
    df[['Date', 'HomeTeam', 'HomePoints']].rename(columns={'HomeTeam': 'Team', 'HomePoints': 'Points'}),
    df[['Date', 'AwayTeam', 'AwayPoints']].rename(columns={'AwayTeam': 'Team', 'AwayPoints': 'Points'})
])

# Sort by match date to ensure chronological order
points_df.sort_values(by=['Date'], inplace=True)

# Calculate cumulative points for each team
points_df['CumulativePoints'] = points_df.groupby('Team')['Points'].cumsum()

# Prepare for merging by shifting cumulative points to represent standings before the match
points_df['CumulativePoints'] = points_df.groupby('Team')['CumulativePoints'].shift(fill_value=0)

# Merge back into the original DataFrame to associate standings with each match
df = df.merge(points_df[['Date', 'Team', 'CumulativePoints']], 
              left_on=['Date', 'HomeTeam'], 
              right_on=['Date', 'Team'], 
              how='left').rename(columns={'CumulativePoints': 'HomeTeamStandingBeforeMatch'}).drop('Team', axis=1)

df = df.merge(points_df[['Date', 'Team', 'CumulativePoints']], 
              left_on=['Date', 'AwayTeam'], 
              right_on=['Date', 'Team'], 
              how='left').rename(columns={'CumulativePoints': 'AwayTeamStandingBeforeMatch'}).drop('Team', axis=1)

# Display the updated DataFrame with team standings before each match

# Display the DataFrame 
df.tail(20)

# get the league standings




Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,Draw,HomePoints,AwayPoints,HomeCumPoints,AwayCumPoints,HomeTeamStandingBeforeMatch,HomeTeamStandingBeforeMatch.1,AwayTeamStandingBeforeMatch,HomeTeamStandingBeforeMatch.2,AwayTeamStandingBeforeMatch.1
162,2024-01-20,Standard,Kortrijk,0,1,A,0,1,A,20,...,False,0,3,14,2,23,23,10,23,10
163,2024-01-20,Club Brugge,Westerlo,3,0,H,1,0,H,14,...,False,3,0,20,11,34,34,21,34,21
164,2024-01-21,Antwerp,Charleroi,4,1,H,2,0,H,20,...,False,3,0,22,3,32,32,21,32,21
165,2024-01-21,St. Gilloise,St Truiden,2,1,H,1,1,D,16,...,False,3,0,25,12,48,48,27,48,27
166,2024-01-21,Oud-Heverlee Leuven,Anderlecht,1,1,D,1,0,H,8,...,True,1,1,11,16,16,16,42,16,42
167,2024-01-24,RWD Molenbeek,Eupen,0,1,A,0,0,D,8,...,False,0,3,13,7,21,21,15,21,15
168,2024-01-26,Cercle Brugge,Standard,1,1,D,0,0,D,26,...,True,1,1,18,9,32,32,23,32,23
169,2024-01-27,Kortrijk,Oud-Heverlee Leuven,0,0,D,0,0,D,13,...,True,1,1,8,5,13,13,17,13,17
170,2024-01-27,Charleroi,Club Brugge,1,4,A,0,3,A,14,...,False,0,3,18,14,21,21,37,21,37
171,2024-01-27,Gent,Westerlo,2,2,D,1,1,D,14,...,True,1,1,22,11,38,38,21,38,21


### League standings

In [47]:
# getting the league standings
# Aggregate points for each team
home_points = df.groupby('HomeTeam')['HomePoints'].sum().reset_index()
away_points = df.groupby('AwayTeam')['AwayPoints'].sum().reset_index()

# Merge and sum the points for total points
points = pd.merge(home_points, away_points, left_on='HomeTeam', right_on='AwayTeam', how='outer', suffixes=('_home', '_away')).fillna(0)
points['Team'] = points['HomeTeam'].combine_first(points['AwayTeam'])
points['TotalPoints'] = points['HomePoints'] + points['AwayPoints']

# Calculate goal difference for tiebreakers
home_goals = df.groupby('HomeTeam').agg({'FTHG':'sum', 'FTAG':'sum'}).reset_index().rename(columns={'FTHG': 'GoalsFor_home', 'FTAG': 'GoalsAgainst_home'})
away_goals = df.groupby('AwayTeam').agg({'FTAG':'sum', 'FTHG':'sum'}).reset_index().rename(columns={'FTAG': 'GoalsFor_away', 'FTHG': 'GoalsAgainst_away'})

goals = pd.merge(home_goals, away_goals, left_on='HomeTeam', right_on='AwayTeam', how='outer', suffixes=('_home', '_away')).fillna(0)
goals['Team'] = goals['HomeTeam'].combine_first(goals['AwayTeam'])
goals['GoalDifference'] = (goals['GoalsFor_home'] + goals['GoalsFor_away']) - (goals['GoalsAgainst_home'] + goals['GoalsAgainst_away'])

# Merge points and goals to get a complete standing
standings = pd.merge(points[['Team', 'TotalPoints']], goals[['Team', 'GoalDifference']], on='Team')

# Sort teams by Total Points and Goal Difference
standings.sort_values(by=['TotalPoints', 'GoalDifference'], ascending=[False, False], inplace=True)

# Reset index to get ranking
standings.reset_index(drop=True, inplace=True)

# Display the league standings
print(standings.head(20))

                   Team  TotalPoints  GoalDifference
0          St. Gilloise           55              26
1            Anderlecht           44              18
2           Club Brugge           41              27
3                  Gent           39              15
4               Antwerp           38              20
5                  Genk           36              20
6         Cercle Brugge           33               1
7              Mechelen           28              -2
8            St Truiden           28              -7
9              Westerlo           25              -6
10             Standard           24             -12
11            Charleroi           24             -14
12  Oud-Heverlee Leuven           21             -12
13                Eupen           21             -21
14        RWD Molenbeek           21             -22
15             Kortrijk           15             -31


In [14]:
from sklearn.model_selection import train_test_split

# Convert 'Date' to datetime and extract useful features
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['DayOfWeek'] = df['Date'].dt.dayofweek  # 0: Monday, 6: Sunday
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)  # 1 for weekend, 0 for weekdays



In [15]:
#select relevant columns or features and the target
features = [
    'HomeTeam_encoded', 'AwayTeam_encoded', 'HomeTeamRecentForm', 'AwayTeamRecentForm', 
    'HomeTeamAvgGoals', 'AwayTeamAvgGoals', 'HomeTeamPoints', 'AwayTeamPoints', 'HomeGoalsScoredVariance', 'AwayGoalsScoredVariance', 'HomeGoalsScoredAvg_3', 'AwayGoalsScoredAvg_3', 'HomeGoalsConcededAvg_3', 'AwayGoalsConcededAvg_3', 'HomeGoalsScoredAvg_5', 'AwayGoalsScoredAvg_5', 'HomeGoalsConcededAvg_5', 'AwayGoalsConcededAvg_5',
]

target = 'FTR_encoded'

# Split the data into training and test sets
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_test_split(X, y, test_size=0.2, random_state=42)

# Previewing the processed features
X_train.head(7), y_train.head(7)

# encoding the categorical variables
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable using label_encoder_y
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


In [16]:
# scaling the features
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler
scaler.fit(X_train)

# Transform both the training and testing sets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



### Create Model


In [17]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the models
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
lgbm_model = LGBMClassifier(random_state=42)   
catboost_model = CatBoostClassifier(random_state=42, verbose=False)


# Train the models
#rf_model.fit(X_train, y_train_encoded)
xgb_model.fit(X_train_scaled, y_train_encoded)
lgbm_model.fit(X_train_scaled, y_train_encoded)
catboost_model.fit(X_train_scaled, y_train_encoded)







[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 145, number of used features: 18
[LightGBM] [Info] Start training from score -1.339148
[LightGBM] [Info] Start training from score -1.263162
[LightGBM] [Info] Start training from score -0.787079


<catboost.core.CatBoostClassifier at 0x26aeb2170d0>

In [18]:
from sklearn.metrics import roc_auc_score

#evaluate the models using the metrics
models = [ xgb_model, lgbm_model, catboost_model]
model_names = [ 'XGBoost', 'LightGBM', 'CatBoost', 'AdaBoost']

for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    print(f"{name} Model Metrics")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print("\n")
    
    

XGBoost Model Metrics
Accuracy: 0.35135135135135137
Precision: 0.3920940170940171
Recall: 0.31666666666666665
F1 Score: 0.298499061913696


LightGBM Model Metrics
Accuracy: 0.3783783783783784
Precision: 0.3757936507936508
Recall: 0.41111111111111104
F1 Score: 0.3761872909698997


CatBoost Model Metrics
Accuracy: 0.32432432432432434
Precision: 0.1111111111111111
Recall: 0.3333333333333333
F1 Score: 0.16666666666666666



# creating objective function and hyper opts for hyperparaemters

In [19]:
from hyperopt import hp
import numpy as np
# Define the hyperparameter space
space = {
    'iterations': hp.quniform('iterations', 100, 1000, 50),
    'depth': hp.choice('depth', np.arange(3, 11, dtype=int)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
    'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),
    'random_strength': hp.uniform('random_strength', 0, 1),
    

}


In [20]:
from sklearn.model_selection import cross_val_score
from hyperopt import STATUS_OK
from sklearn.model_selection import StratifiedKFold


def objective(params):
    params['iterations'] = int(params['iterations'])
    clf = CatBoostClassifier(**params, loss_function='MultiClass', verbose=False)
    score = cross_val_score(clf, X_train_scaled, y_train_encoded, scoring='accuracy', cv=StratifiedKFold(5)).mean()
    return {'loss': -score, 'status': STATUS_OK}



In [21]:
# run the hyperparameter optimization
from hyperopt import tpe, Trials, fmin

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)


100%|██████████| 10/10 [02:04<00:00, 12.46s/trial, best loss: -0.6413793103448275]


In [22]:
from sklearn.metrics import classification_report

best_params = {k: int(v) if k in ['iterations', 'depth'] else v for k, v in best.items()}
final_model = CatBoostClassifier(**best_params, loss_function='MultiClass', eval_metric='Accuracy', verbose=False)
final_model.fit(X_train_scaled, y_train_encoded)

# Evaluate the model
y_pred = final_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       1.00      0.40      0.57        10
           1       0.57      0.33      0.42        12
           2       0.58      1.00      0.73        15

    accuracy                           0.62        37
   macro avg       0.72      0.58      0.57        37
weighted avg       0.69      0.62      0.59        37


# Xgboost with params

In [23]:
# hyperparameter tuning for xgboost using hyperopt
from hyperopt import hp
import numpy as np
# Define the hyperparameter space

space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'max_depth': hp.choice('max_depth', np.arange(3, 11, dtype=int)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'reg_lambda': hp.uniform('reg_lambda', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 1, 10),
    'gamma': hp.uniform('gamma', 0, 1),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10)
    
}

In [24]:
from hyperopt import STATUS_OK
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

def objective(params):
    params['n_estimators'] = int(params['n_estimators'])
    clf = XGBClassifier(**params, objective='multi:softmax', num_class=3, eval_metric='mlogloss', verbosity=0)
    score = cross_val_score(clf, X_train_scaled, y_train_encoded, scoring='accuracy', cv=StratifiedKFold(10)).mean()
    return {'loss': -score, 'status': STATUS_OK}

In [25]:
# run the hyperparameter optimization
from hyperopt import tpe, Trials, fmin

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

from sklearn.metrics import classification_report

best_params = {k: int(v) if k in ['n_estimators', 'max_depth'] else v for k, v in best.items()}
best_model_2 = XGBClassifier(**best_params, objective='multi:softmax', num_class=3, eval_metric='mlogloss', verbosity=0)
best_model_2.fit(X_train_scaled, y_train_encoded)

100%|██████████| 100/100 [02:56<00:00,  1.77s/trial, best loss: -0.6614285714285715]


In [26]:
# Evaluate the model
y_pred = best_model_2.predict(X_test_scaled)
print(classification_report(y_test, y_pred, zero_division=0))


      

              precision    recall  f1-score   support

           0       1.00      0.50      0.67        10
           1       0.43      0.25      0.32        12
           2       0.56      0.93      0.70        15

    accuracy                           0.59        37
   macro avg       0.66      0.56      0.56        37
weighted avg       0.64      0.59      0.57        37


In [27]:

from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Combine all unique team names from both HomeTeam and AwayTeam columns
all_teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()

# Fit the LabelEncoder with all unique team names
label_encoder.fit(all_teams)

# Transform both HomeTeam and AwayTeam using the fitted LabelEncoder
df['HomeTeam_encoded'] = label_encoder.transform(df['HomeTeam'])
df['AwayTeam_encoded'] = label_encoder.transform(df['AwayTeam'])

# # Now you can transform individual team names
# home_team_encoded = label_encoder.transform(['St Etienne'])[0]
# away_team_encoded = label_encoder.transform(['Laval'])[0]
# 
# # Debugging: Print encoded values
# print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

print(label_encoder.classes_)


['Anderlecht' 'Antwerp' 'Cercle Brugge' 'Charleroi' 'Club Brugge' 'Eupen'
 'Genk' 'Gent' 'Kortrijk' 'Mechelen' 'Oud-Heverlee Leuven' 'RWD Molenbeek'
 'St Truiden' 'St. Gilloise' 'Standard' 'Westerlo']


In [28]:
import pandas as pd

# Create an empty list to store match results
match_results = []
def predict_match(home_team, away_team, xgb_model, df, label_encoder):
     # Debugging: Check if the team names are in label_encoder's classes
    if home_team not in label_encoder.classes_:
        raise ValueError(f"Home team name '{home_team}' not recognized.")
    if away_team not in label_encoder.classes_:
        raise ValueError(f"Away team name '{away_team}' not recognized.")

    # Transform team names
    home_team_encoded = label_encoder.transform([home_team])[0]
    away_team_encoded = label_encoder.transform([away_team])[0]

    # Debugging: Print encoded values
   # print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

    # Prepare the match data with relevant features
    match_data = {
        'HomeTeam_encoded': label_encoder.transform([home_team])[0],
        'AwayTeam_encoded': label_encoder.transform([away_team])[0],
        'HomeTeamRecentForm': calculate_form_points(home_team, df)[0],
        'AwayTeamRecentForm': calculate_form_points(away_team, df)[1],
        'HomeTeamAvgGoals': df[df['HomeTeam'] == home_team]['FTHG'].mean(),
        'AwayTeamAvgGoals': df[df['AwayTeam'] == away_team]['FTAG'].mean(),
        'HomeTeamPoints': df[df['HomeTeam'] == home_team]['FTR_encoded'].sum() / df[df['HomeTeam'] == home_team]['FTR_encoded'].count(),
        'AwayTeamPoints': df[df['AwayTeam'] == away_team]['FTR_encoded'].sum() / df[df['AwayTeam'] == away_team]['FTR_encoded'].count(),
        'HomeGoalsScoredVariance': df[df['HomeTeam'] == home_team]['FTHG'].var(),
        'AwayGoalsScoredVariance': df[df['AwayTeam'] == away_team]['FTAG'].var(),
        'HomeGoalsScoredAvg_3': df[df['HomeTeam'] == home_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_3': df[df['AwayTeam'] == away_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_3': df[df['HomeTeam'] == home_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_3': df[df['AwayTeam'] == away_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsScoredAvg_5': df[df['HomeTeam'] == home_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_5': df[df['AwayTeam'] == away_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_5': df[df['HomeTeam'] == home_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_5': df[df['AwayTeam'] == away_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        
        # Add other features as required by the model, calculated or retrieved as done during training
        # ...
    }

    match_df = pd.DataFrame([match_data])

    # Make predictions
    probabilities = xgb_model.predict_proba(match_df)[0]
    home_team_win_prob = probabilities[0]
    draw_prob = probabilities[1]
    away_team_win_prob = probabilities[2]

    #print(f"{home_team} win probability: {home_team_win_prob}")
    #print(f"Draw probability: {draw_prob}")
    #print(f"{away_team} win probability: {away_team_win_prob}")
    
    match_results.append({
        'HomeTeam': home_team,
        'AwayTeam': away_team,
        'HomeTeamWinProbability': home_team_win_prob,
        'AwayTeamWinProbability': away_team_win_prob,
        'DrawProbability': draw_prob,
    })

    return home_team_win_prob, draw_prob, away_team_win_prob



print('##########################')
print('Today\'s matches')


predict_match('Mechelen', 'Anderlecht', xgb_model, df, label_encoder)

predict_match('St Truiden', "Gent", xgb_model, df, label_encoder)

# Create a DataFrame from the match results list
match_results_df = pd.DataFrame(match_results)

# Print the DataFrame
match_results_df.head()

##########################
Today's matches


Unnamed: 0,HomeTeam,AwayTeam,HomeTeamWinProbability,AwayTeamWinProbability,DrawProbability
0,Mechelen,Anderlecht,0.062796,0.636924,0.300279
1,St Truiden,Gent,0.137343,0.589409,0.273248


In [29]:

#

# xgboost with hyperprams predictions

In [30]:

def predict_match(home_team, away_team, best_model_2, df, label_encoder):
     # Debugging: Check if the team names are in label_encoder's classes
    if home_team not in label_encoder.classes_:
        raise ValueError(f"Home team name '{home_team}' not recognized.")
    if away_team not in label_encoder.classes_:
        raise ValueError(f"Away team name '{away_team}' not recognized.")

    # Transform team names
    home_team_encoded = label_encoder.transform([home_team])[0]
    away_team_encoded = label_encoder.transform([away_team])[0]

    # Debugging: Print encoded values
    print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

    # Prepare the match data with relevant features
    match_data = {
        'HomeTeam_encoded': label_encoder.transform([home_team])[0],
        'AwayTeam_encoded': label_encoder.transform([away_team])[0],
        'HomeTeamRecentForm': calculate_form_points(home_team, df)[0],
        'AwayTeamRecentForm': calculate_form_points(away_team, df)[1],
        'HomeTeamAvgGoals': df[df['HomeTeam'] == home_team]['FTHG'].mean(),
        'AwayTeamAvgGoals': df[df['AwayTeam'] == away_team]['FTAG'].mean(),
        'HomeTeamPoints': df[df['HomeTeam'] == home_team]['FTR_encoded'].sum() / df[df['HomeTeam'] == home_team]['FTR_encoded'].count(),
        'AwayTeamPoints': df[df['AwayTeam'] == away_team]['FTR_encoded'].sum() / df[df['AwayTeam'] == away_team]['FTR_encoded'].count(),
        'HomeGoalsScoredVariance': df[df['HomeTeam'] == home_team]['FTHG'].var(),
        'AwayGoalsScoredVariance': df[df['AwayTeam'] == away_team]['FTAG'].var(),
        'HomeGoalsScoredAvg_3': df[df['HomeTeam'] == home_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_3': df[df['AwayTeam'] == away_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_3': df[df['HomeTeam'] == home_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_3': df[df['AwayTeam'] == away_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsScoredAvg_5': df[df['HomeTeam'] == home_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_5': df[df['AwayTeam'] == away_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_5': df[df['HomeTeam'] == home_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_5': df[df['AwayTeam'] == away_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        
        # Add other features as required by the model, calculated or retrieved as done during training
        # ...
    }

    match_df = pd.DataFrame([match_data])

    # Make predictions
    probabilities = best_model_2.predict_proba(match_df)[0]
    home_team_win_prob = probabilities[0]
    draw_prob = probabilities[1]
    away_team_win_prob = probabilities[2]

    print(f"{home_team} win probability: {home_team_win_prob}")
    print(f"Draw probability: {draw_prob}")
    print(f"{away_team} win probability: {away_team_win_prob}")
     
     # put the predictions in a dataframe
     
    #pd.DataFrame({'home_team': home_team, 'away_team': away_team, 'home_team_win_prob': home_team_win_prob, 'draw_prob': draw_prob, 'away_team_win_prob': away_team_win_prob})

    return home_team_win_prob, draw_prob, away_team_win_prob



print('##########################')
print('Today\'s matches')

predict_match('Mechelen', 'Anderlecht', best_model_2, df, label_encoder)

predict_match('St Truiden', "Gent", best_model_2, df, label_encoder)

##########################
Today's matches
Encoded Home Team: 9, Encoded Away Team: 0
Mechelen win probability: 0.12479749321937561
Draw probability: 0.284352570772171
Anderlecht win probability: 0.5908498764038086
Encoded Home Team: 12, Encoded Away Team: 7
St Truiden win probability: 0.1592721939086914
Draw probability: 0.23598389327526093
Gent win probability: 0.6047438979148865


(0.1592722, 0.2359839, 0.6047439)

# catboost with hyperparameter

In [31]:
def predict_match(home_team, away_team, final_model, df, label_encoder):
     # Debugging: Check if the team names are in label_encoder's classes
    if home_team not in label_encoder.classes_:
        raise ValueError(f"Home team name '{home_team}' not recognized.")
    if away_team not in label_encoder.classes_:
        raise ValueError(f"Away team name '{away_team}' not recognized.")

    # Transform team names
    home_team_encoded = label_encoder.transform([home_team])[0]
    away_team_encoded = label_encoder.transform([away_team])[0]

    # Debugging: Print encoded values
    print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

    # Prepare the match data with relevant features
    match_data = {
        'HomeTeam_encoded': label_encoder.transform([home_team])[0],
        'AwayTeam_encoded': label_encoder.transform([away_team])[0],
        'HomeTeamRecentForm': calculate_form_points(home_team, df)[0],
        'AwayTeamRecentForm': calculate_form_points(away_team, df)[1],
        'HomeTeamAvgGoals': df[df['HomeTeam'] == home_team]['FTHG'].mean(),
        'AwayTeamAvgGoals': df[df['AwayTeam'] == away_team]['FTAG'].mean(),
        'HomeTeamPoints': df[df['HomeTeam'] == home_team]['FTR_encoded'].sum() / df[df['HomeTeam'] == home_team]['FTR_encoded'].count(),
        'AwayTeamPoints': df[df['AwayTeam'] == away_team]['FTR_encoded'].sum() / df[df['AwayTeam'] == away_team]['FTR_encoded'].count(),
        'HomeGoalsScoredVariance': df[df['HomeTeam'] == home_team]['FTHG'].var(),
        'AwayGoalsScoredVariance': df[df['AwayTeam'] == away_team]['FTAG'].var(),
        'HomeGoalsScoredAvg_3': df[df['HomeTeam'] == home_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_3': df[df['AwayTeam'] == away_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_3': df[df['HomeTeam'] == home_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_3': df[df['AwayTeam'] == away_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsScoredAvg_5': df[df['HomeTeam'] == home_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_5': df[df['AwayTeam'] == away_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_5': df[df['HomeTeam'] == home_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_5': df[df['AwayTeam'] == away_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        
        # Add other features as required by the model, calculated or retrieved as done during training
        # ...
    }

    match_df = pd.DataFrame([match_data])

    # Make predictions
    probabilities = final_model.predict_proba(match_df)[0]
    home_team_win_prob = probabilities[0]
    draw_prob = probabilities[1]
    away_team_win_prob = probabilities[2]

    print(f"{home_team} win probability: {home_team_win_prob}")
    print(f"Draw probability: {draw_prob}")
    print(f"{away_team} win probability: {away_team_win_prob}")
     
    

    return home_team_win_prob, draw_prob, away_team_win_prob

print('Today\'s matches')

predict_match('Mechelen', 'Anderlecht', final_model, df, label_encoder)

predict_match('St Truiden', "Gent", final_model, df, label_encoder)

Today's matches
Encoded Home Team: 9, Encoded Away Team: 0
Mechelen win probability: 0.09577716007527402
Draw probability: 0.2327348715891032
Anderlecht win probability: 0.6714879683356229
Encoded Home Team: 12, Encoded Away Team: 7
St Truiden win probability: 0.1434450214536715
Draw probability: 0.47630570534734273
Gent win probability: 0.38024927319898566


(0.1434450214536715, 0.47630570534734273, 0.38024927319898566)

In [32]:

def predict_match(home_team, away_team, catboost_model, df, label_encoder):
     # Debugging: Check if the team names are in label_encoder's classes
    if home_team not in label_encoder.classes_:
        raise ValueError(f"Home team name '{home_team}' not recognized.")
    if away_team not in label_encoder.classes_:
        raise ValueError(f"Away team name '{away_team}' not recognized.")

    # Transform team names
    home_team_encoded = label_encoder.transform([home_team])[0]
    away_team_encoded = label_encoder.transform([away_team])[0]

    # Debugging: Print encoded values
    print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

    # Prepare the match data with relevant features
    match_data = {
        'HomeTeam_encoded': label_encoder.transform([home_team])[0],
        'AwayTeam_encoded': label_encoder.transform([away_team])[0],
        'HomeTeamRecentForm': calculate_form_points(home_team, df)[0],
        'AwayTeamRecentForm': calculate_form_points(away_team, df)[1],
        'HomeTeamAvgGoals': df[df['HomeTeam'] == home_team]['FTHG'].mean(),
        'AwayTeamAvgGoals': df[df['AwayTeam'] == away_team]['FTAG'].mean(),
        'HomeTeamPoints': df[df['HomeTeam'] == home_team]['FTR_encoded'].sum() / df[df['HomeTeam'] == home_team]['FTR_encoded'].count(),
        'AwayTeamPoints': df[df['AwayTeam'] == away_team]['FTR_encoded'].sum() / df[df['AwayTeam'] == away_team]['FTR_encoded'].count(),
        'HomeGoalsScoredVariance': df[df['HomeTeam'] == home_team]['FTHG'].var(),
        'AwayGoalsScoredVariance': df[df['AwayTeam'] == away_team]['FTAG'].var(),
        'HomeGoalsScoredAvg_3': df[df['HomeTeam'] == home_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_3': df[df['AwayTeam'] == away_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_3': df[df['HomeTeam'] == home_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_3': df[df['AwayTeam'] == away_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsScoredAvg_5': df[df['HomeTeam'] == home_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_5': df[df['AwayTeam'] == away_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_5': df[df['HomeTeam'] == home_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_5': df[df['AwayTeam'] == away_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        
        # Add other features as required by the model, calculated or retrieved as done during training
        # ...
    }

    match_df = pd.DataFrame([match_data])

    # Make predictions
    probabilities = catboost_model.predict_proba(match_df)[0]
    home_team_win_prob = probabilities[0]
    draw_prob = probabilities[1]
    away_team_win_prob = probabilities[2]

    print(f"{home_team} win probability: {home_team_win_prob}")
    print(f"Draw probability: {draw_prob}")
    print(f"{away_team} win probability: {away_team_win_prob}")
     
    pd.DataFrame({'home_team': home_team, 'away_team': away_team, 'home_team_win_prob': home_team_win_prob, 'draw_prob': draw_prob, 'away_team_win_prob': away_team_win_prob}, index=[0])

    return home_team_win_prob, draw_prob, away_team_win_prob



print('##########################')
print('Today\'s matches')

predict_match('Mechelen', 'Anderlecht', catboost_model, df, label_encoder)

predict_match('St Truiden', "Gent", catboost_model, df, label_encoder)

##########################
Today's matches
Encoded Home Team: 9, Encoded Away Team: 0
Mechelen win probability: 0.096129436751518
Draw probability: 0.21086885681921316
Anderlecht win probability: 0.6930017064292688
Encoded Home Team: 12, Encoded Away Team: 7
St Truiden win probability: 0.10641426472190156
Draw probability: 0.6200067888978263
Gent win probability: 0.2735789463802721


(0.10641426472190156, 0.6200067888978263, 0.2735789463802721)

In [33]:
from sklearn.impute import SimpleImputer
import numpy as np

# Initialize the imputer (you can change the strategy to 'median' or 'most_frequent' if more appropriate)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit on the training data and transform both training and testing data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)




## using random forest

In [34]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV



# Then scale the imputed data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Hyperparameter Tuning
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1_macro')
grid_search.fit(X_train_scaled, y_train_encoded)

# Best Model Evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.50      0.62        10
           1       0.56      0.42      0.48        12
           2       0.64      0.93      0.76        15

    accuracy                           0.65        37
   macro avg       0.68      0.62      0.62        37
weighted avg       0.66      0.65      0.63        37


In [35]:

def predict_match(home_team, away_team, best_model, df, label_encoder,scaler):
     # Debugging: Check if the team names are in label_encoder's classes
    if home_team not in label_encoder.classes_:
        raise ValueError(f"Home team name '{home_team}' not recognized.")
    if away_team not in label_encoder.classes_:
        raise ValueError(f"Away team name '{away_team}' not recognized.")

    # Transform team names
    home_team_encoded = label_encoder.transform([home_team])[0]
    away_team_encoded = label_encoder.transform([away_team])[0]

    # Debugging: Print encoded values
    print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

    # Prepare the match data with relevant features
    match_data = {
        'HomeTeam_encoded': label_encoder.transform([home_team])[0],
        'AwayTeam_encoded': label_encoder.transform([away_team])[0],
        'HomeTeamRecentForm': calculate_form_points(home_team, df)[0],
        'AwayTeamRecentForm': calculate_form_points(away_team, df)[1],
        'HomeTeamAvgGoals': df[df['HomeTeam'] == home_team]['FTHG'].mean(),
        'AwayTeamAvgGoals': df[df['AwayTeam'] == away_team]['FTAG'].mean(),
        'HomeTeamPoints': df[df['HomeTeam'] == home_team]['FTR_encoded'].sum() / df[df['HomeTeam'] == home_team]['FTR_encoded'].count(),
        'AwayTeamPoints': df[df['AwayTeam'] == away_team]['FTR_encoded'].sum() / df[df['AwayTeam'] == away_team]['FTR_encoded'].count(),
        'HomeGoalsScoredVariance': df[df['HomeTeam'] == home_team]['FTHG'].var(),
        'AwayGoalsScoredVariance': df[df['AwayTeam'] == away_team]['FTAG'].var(),
        'HomeGoalsScoredAvg_3': df[df['HomeTeam'] == home_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_3': df[df['AwayTeam'] == away_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_3': df[df['HomeTeam'] == home_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_3': df[df['AwayTeam'] == away_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsScoredAvg_5': df[df['HomeTeam'] == home_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_5': df[df['AwayTeam'] == away_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_5': df[df['HomeTeam'] == home_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_5': df[df['AwayTeam'] == away_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        
        # Add other features as required by the model, calculated or retrieved as done during training
        # ...
    }

    match_df = pd.DataFrame([match_data])
    match_scaled = scaler.transform(match_df)
    # Make predictions
    home_team_win_prob = best_model.predict_proba(match_scaled)[0][0]
    away_team_win_prob = best_model.predict_proba(match_scaled)[0][2]
    draw_prob = best_model.predict_proba(match_scaled)[0][1]
    # Print the results
    print(f"{home_team} win probability: {home_team_win_prob}")
    print(f"{away_team} win probability: {away_team_win_prob}")
    print(f"Draw probability: {draw_prob}")
    #pd.DataFrame({'home_team': home_team, 'away_team': away_team, 'home_team_win_prob': home_team_win_prob, 'draw_prob': draw_prob, 'away_team_win_prob': away_team_win_prob})
    
    return home_team_win_prob, away_team_win_prob, draw_prob



print('##########################')
print('Today\'s matches')

predict_match('Mechelen', 'Anderlecht', best_model, df, label_encoder,scaler)

predict_match('St Truiden', "Gent", best_model, df, label_encoder,scaler)

##########################
Today's matches
Encoded Home Team: 9, Encoded Away Team: 0
Mechelen win probability: 0.065
Anderlecht win probability: 0.42916666666666664
Draw probability: 0.5058333333333334
Encoded Home Team: 12, Encoded Away Team: 7
St Truiden win probability: 0.145
Gent win probability: 0.5557936507936507
Draw probability: 0.2992063492063492




(0.145, 0.5557936507936507, 0.2992063492063492)

In [36]:
# # save the 3 models
# import joblib
# 
# # Save the model as a pickle file
# joblib.dump(best_model, 'models/best_model.pkl')
# # joblib.dump(label_encoder, 'label_encoder.pkl')
# # joblib.dump(scaler, 'scaler.pkl')
#     
# # save the xgb model
# import pickle
# 
# # Save the model as a pickle file
# pickle.dump(xgb_model, open('models/xgb_model.pkl', 'wb'))
# # pickle.dump(label_encoder, open('label_encoder.pkl', 'wb'))
# # pickle.dump(scaler, open('scaler.pkl', 'wb'))
# 
# # save the catboost model
# import pickle
# 
# # Save the model as a pickle file
# pickle.dump(catboost_model, open('models/catboost_model.pkl', 'wb'))
# pickle.dump(label_encoder, open('models/label_encoder.pkl', 'wb'))
# pickle.dump(scaler, open('models/scaler.pkl', 'wb'))
# 
