# Championship 2023-2024 Predictions

In [615]:
# import data
import pandas as pd
import numpy as np

df = pd.read_csv('F2.csv')
df.tail()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
185,F2,19/12/2023,19:45,Dunkerque,Bordeaux,0,2,A,0,1,...,1.93,0.5,1.95,1.9,1.96,1.93,1.96,2.0,1.87,1.92
186,F2,19/12/2023,19:45,Laval,Auxerre,1,3,A,1,1,...,1.69,0.5,1.8,2.05,1.83,2.07,1.83,2.11,1.78,2.0
187,F2,19/12/2023,19:45,Paris FC,Quevilly Rouen,2,2,D,1,1,...,1.71,-0.25,2.0,1.85,2.04,1.85,2.05,1.89,1.95,1.84
188,F2,19/12/2023,19:45,Pau FC,Troyes,1,1,D,0,0,...,2.09,-0.25,1.85,2.0,1.86,2.03,1.91,2.14,1.8,2.02
189,F2,19/12/2023,19:45,St Etienne,Bastia,3,2,H,2,0,...,1.73,-0.5,1.8,2.05,1.81,2.07,1.88,2.08,1.81,1.97


In [616]:
## Data Cleaning

In [617]:
# steps:
# 1. Check for missing values
missing_values = df.isnull().sum()

# 2. Check for duplicates
duplicate_rows = df.duplicated().sum()

# Display the results of the checks
missing_values, duplicate_rows

(Div         0
 Date        0
 Time        0
 HomeTeam    0
 AwayTeam    0
            ..
 PCAHA       0
 MaxCAHH     0
 MaxCAHA     0
 AvgCAHH     0
 AvgCAHA     0
 Length: 105, dtype: int64,
 0)

In [618]:
# Checking for missing values
missing_values = df.isnull().sum()




# Checking the balance of the target variable 'FTR'
target_distribution = df['FTR'].value_counts()

# drop the div column
df.drop('Div', axis=1, inplace=True)
df.drop('Time', axis=1, inplace=True)

missing_values, target_distribution


(Div         0
 Date        0
 Time        0
 HomeTeam    0
 AwayTeam    0
            ..
 PCAHA       0
 MaxCAHH     0
 MaxCAHA     0
 AvgCAHH     0
 AvgCAHA     0
 Length: 105, dtype: int64,
 FTR
 H    74
 A    59
 D    57
 Name: count, dtype: int64)

In [619]:
# covert date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

## Feature engineering

### steps:
* Encode Team Names: Use label encoding for 'HomeTeam' and 'AwayTeam'. This will convert team names into numeric values, making them usable for the model.

* Recent Form: Calculate the recent form for each team based on the last 5 matches. We'll use the 'FTR' column to determine wins (W), losses (L), and draws (D). This feature will provide insight into the current performance of the teams.

* Average Goals per Game: Compute the average goals scored per game for both home and away teams. This feature helps understand the offensive strength of the teams.

* Team Points: Calculate the total points accumulated by each team so far in the season. Points are awarded based on wins (3 points), draws (1 point), and losses (0 points).

* Head-to-Head Statistics: Analyze the outcomes of matches between the same pairs of teams earlier in the season.

* Other Statistical Features: Depending on the data available, we can include additional features like average possession, number of shots on target, defensive strength, etc.

In [620]:
# Reset the index of the DataFrame (if necessary)
df.reset_index(drop=True, inplace=True)

# Rolling averages for goals
window_sizes = [3, 5]
for window in window_sizes:
    df[f'HomeGoalsScoredAvg_{window}'] = df.groupby('HomeTeam')['FTHG'].transform(lambda x: x.rolling(window, min_periods=1).mean())
    df[f'AwayGoalsScoredAvg_{window}'] = df.groupby('AwayTeam')['FTAG'].transform(lambda x: x.rolling(window, min_periods=1).mean())
    df[f'HomeGoalsConcededAvg_{window}'] = df.groupby('HomeTeam')['FTAG'].transform(lambda x: x.rolling(window, min_periods=1).mean())
    df[f'AwayGoalsConcededAvg_{window}'] = df.groupby('AwayTeam')['FTHG'].transform(lambda x: x.rolling(window, min_periods=1).mean())



In [621]:

# Variance in team performance
df['HomeGoalsScoredVariance'] = df.groupby('HomeTeam')['FTHG'].transform(lambda x: x.rolling(5, min_periods=1).var())
df['AwayGoalsScoredVariance'] = df.groupby('AwayTeam')['FTAG'].transform(lambda x: x.rolling(5, min_periods=1).var())




In [622]:


from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Combine all unique team names from both HomeTeam and AwayTeam columns
all_teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()

# Fit the LabelEncoder with all unique team names
label_encoder.fit(all_teams)

# Transform both HomeTeam and AwayTeam using the fitted LabelEncoder
df['HomeTeam_encoded'] = label_encoder.transform(df['HomeTeam'])
df['AwayTeam_encoded'] = label_encoder.transform(df['AwayTeam'])

# Now you can transform individual team names
home_team_encoded = label_encoder.transform(['St Etienne'])[0]
away_team_encoded = label_encoder.transform(['Laval'])[0]

# Debugging: Print encoded values
print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

print(label_encoder.classes_)


Encoded Home Team: 17, Encoded Away Team: 12
['Ajaccio' 'Amiens' 'Angers' 'Annecy' 'Auxerre' 'Bastia' 'Bordeaux' 'Caen'
 'Concarneau' 'Dunkerque' 'Grenoble' 'Guingamp' 'Laval' 'Paris FC'
 'Pau FC' 'Quevilly Rouen' 'Rodez' 'St Etienne' 'Troyes' 'Valenciennes']


In [623]:
df.tail()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,HomeGoalsConcededAvg_3,AwayGoalsConcededAvg_3,HomeGoalsScoredAvg_5,AwayGoalsScoredAvg_5,HomeGoalsConcededAvg_5,AwayGoalsConcededAvg_5,HomeGoalsScoredVariance,AwayGoalsScoredVariance,HomeTeam_encoded,AwayTeam_encoded
185,2023-12-19,Dunkerque,Bordeaux,0,2,A,0,1,A,14,...,3.0,1.333333,0.2,1.4,2.6,1.8,0.2,0.8,9,6
186,2023-12-19,Laval,Auxerre,1,3,A,1,1,D,5,...,1.666667,0.666667,0.6,1.4,1.8,1.0,0.3,1.3,12,4
187,2023-12-19,Paris FC,Quevilly Rouen,2,2,D,1,1,D,10,...,1.333333,1.666667,1.6,1.8,1.2,2.0,0.3,1.2,13,15
188,2023-12-19,Pau FC,Troyes,1,1,D,0,0,D,19,...,1.0,0.666667,2.4,1.0,1.8,1.0,1.8,0.5,14,18
189,2023-12-19,St Etienne,Bastia,3,2,H,2,0,H,12,...,2.333333,1.333333,1.4,1.6,1.6,1.2,1.3,4.3,17,5


In [624]:
# Encoding the target variable 'FTR'
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable using label_encoder_y
df['FTR_encoded'] = label_encoder.fit_transform(df['FTR'])




In [625]:

# Average points per game
df['HomeTeamPoints'] = df.groupby('HomeTeam')['FTR_encoded'].transform(lambda x: x.expanding().sum().shift(1)) / df.groupby('HomeTeam')['FTR_encoded'].transform(lambda x: x.expanding().count().shift(1))
df['AwayTeamPoints'] = df.groupby('AwayTeam')['FTR_encoded'].transform(lambda x: x.expanding().sum().shift(1)) / df.groupby('AwayTeam')['FTR_encoded'].transform(lambda x: x.expanding().count().shift(1))



In [626]:
# Average goals per game
df['HomeTeamAvgGoals'] = df.groupby('HomeTeam')['FTHG'].transform(lambda x: x.expanding().mean().shift(1)) # Average goals scored by the home team
df['AwayTeamAvgGoals'] = df.groupby('AwayTeam')['FTAG'].transform(lambda x: x.expanding().mean().shift(1)) # Average goals scored by the away team



In [627]:
# recent form
def calculate_form_points(team, df):
    # Get the last 5 matches of the home and away teams
    home_matches = df[df['HomeTeam'] == team].tail(5)
    away_matches = df[df['AwayTeam'] == team].tail(5)

    # Calculate the points obtained in the last 5 matches
    home_points = home_matches['FTR_encoded'].sum()
    away_points = away_matches['FTR_encoded'].sum()

    # Calculate the average points obtained
    home_avg_points = home_points / 15
    away_avg_points = away_points / 15

    # Return the average points
    return home_avg_points, away_avg_points

# Calculate the average points obtained by each team in the last 5 matches
df['HomeTeamRecentForm'], df['AwayTeamRecentForm'] = zip(*df['HomeTeam'].apply(lambda x: calculate_form_points(x, df)))



### Recent Form

In [628]:
from sklearn.model_selection import train_test_split

# Convert 'Date' to datetime and extract useful features
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['DayOfWeek'] = df['Date'].dt.dayofweek  # 0: Monday, 6: Sunday
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)  # 1 for weekend, 0 for weekdays



In [629]:
#select relevant columns or features and the target
features = [
    'HomeTeam_encoded', 'AwayTeam_encoded', 'HomeTeamRecentForm', 'AwayTeamRecentForm', 
    'HomeTeamAvgGoals', 'AwayTeamAvgGoals', 'HomeTeamPoints', 'AwayTeamPoints', 'HomeGoalsScoredVariance', 'AwayGoalsScoredVariance', 'HomeGoalsScoredAvg_3', 'AwayGoalsScoredAvg_3', 'HomeGoalsConcededAvg_3', 'AwayGoalsConcededAvg_3', 'HomeGoalsScoredAvg_5', 'AwayGoalsScoredAvg_5', 'HomeGoalsConcededAvg_5', 'AwayGoalsConcededAvg_5',
]

target = 'FTR_encoded'

# Split the data into training and test sets
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_test_split(X, y, test_size=0.2, random_state=42)

# Previewing the processed features
X_train.head(7), y_train.head(7)

# encoding the categorical variables
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable using label_encoder_y
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


In [630]:
# scaling the features
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler
scaler.fit(X_train)

# Transform both the training and testing sets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



### Team Points

In [631]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the models
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
lgbm_model = LGBMClassifier(random_state=42)   
catboost_model = CatBoostClassifier(random_state=42, verbose=False)

# Train the models
#rf_model.fit(X_train, y_train_encoded)
xgb_model.fit(X_train_scaled, y_train_encoded)
lgbm_model.fit(X_train_scaled, y_train_encoded)
catboost_model.fit(X_train_scaled, y_train_encoded)






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 348
[LightGBM] [Info] Number of data points in the train set: 152, number of used features: 18
[LightGBM] [Info] Start training from score -1.262680
[LightGBM] [Info] Start training from score -1.053589
[LightGBM] [Info] Start training from score -0.998529


<catboost.core.CatBoostClassifier at 0x21c6f78ae50>

In [632]:
from sklearn.metrics import roc_auc_score

#evaluate the models using the metrics
models = [ xgb_model, lgbm_model, catboost_model]
model_names = [ 'XGBoost', 'LightGBM', 'CatBoost']

for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    print(f"{name} Model Metrics")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print("\n")
    
    

XGBoost Model Metrics
Accuracy: 0.23684210526315788
Precision: 0.4259259259259259
Recall: 0.17592592592592593
F1 Score: 0.24555555555555555


LightGBM Model Metrics
Accuracy: 0.21052631578947367
Precision: 0.4666666666666666
Recall: 0.2800925925925926
F1 Score: 0.21514161220043573


CatBoost Model Metrics
Accuracy: 0.2631578947368421
Precision: 0.4473015873015873
Recall: 0.32175925925925924
F1 Score: 0.2718224221222722


In [633]:
df.tail()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,AwayTeam_encoded,FTR_encoded,HomeTeamPoints,AwayTeamPoints,HomeTeamAvgGoals,AwayTeamAvgGoals,HomeTeamRecentForm,AwayTeamRecentForm,DayOfWeek,IsWeekend
185,2023-12-19,Dunkerque,Bordeaux,0,2,A,0,1,A,14,...,6,0,0.222222,1.333333,0.444444,0.777778,0.0,0.333333,1,0
186,2023-12-19,Laval,Auxerre,1,3,A,1,1,D,5,...,4,0,1.333333,0.555556,1.0,2.0,0.133333,0.2,1,0
187,2023-12-19,Paris FC,Quevilly Rouen,2,2,D,1,1,D,10,...,15,1,1.0,1.333333,1.333333,1.333333,0.4,0.066667,1,0
188,2023-12-19,Pau FC,Troyes,1,1,D,0,0,D,19,...,18,1,1.444444,1.222222,2.111111,1.0,0.466667,0.266667,1,0
189,2023-12-19,St Etienne,Bastia,3,2,H,2,0,H,12,...,5,2,0.888889,1.444444,0.888889,0.777778,0.266667,0.333333,1,0


In [634]:

from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Combine all unique team names from both HomeTeam and AwayTeam columns
all_teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()

# Fit the LabelEncoder with all unique team names
label_encoder.fit(all_teams)

# Transform both HomeTeam and AwayTeam using the fitted LabelEncoder
df['HomeTeam_encoded'] = label_encoder.transform(df['HomeTeam'])
df['AwayTeam_encoded'] = label_encoder.transform(df['AwayTeam'])

# Now you can transform individual team names
home_team_encoded = label_encoder.transform(['St Etienne'])[0]
away_team_encoded = label_encoder.transform(['Laval'])[0]

# Debugging: Print encoded values
print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

print(label_encoder.classes_)


Encoded Home Team: 17, Encoded Away Team: 12
['Ajaccio' 'Amiens' 'Angers' 'Annecy' 'Auxerre' 'Bastia' 'Bordeaux' 'Caen'
 'Concarneau' 'Dunkerque' 'Grenoble' 'Guingamp' 'Laval' 'Paris FC'
 'Pau FC' 'Quevilly Rouen' 'Rodez' 'St Etienne' 'Troyes' 'Valenciennes']


In [636]:

def predict_match(home_team, away_team, xgb_model, df, label_encoder):
     # Debugging: Check if the team names are in label_encoder's classes
    if home_team not in label_encoder.classes_:
        raise ValueError(f"Home team name '{home_team}' not recognized.")
    if away_team not in label_encoder.classes_:
        raise ValueError(f"Away team name '{away_team}' not recognized.")

    # Transform team names
    home_team_encoded = label_encoder.transform([home_team])[0]
    away_team_encoded = label_encoder.transform([away_team])[0]

    # Debugging: Print encoded values
    print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

    # Prepare the match data with relevant features
    match_data = {
        'HomeTeam_encoded': label_encoder.transform([home_team])[0],
        'AwayTeam_encoded': label_encoder.transform([away_team])[0],
        'HomeTeamRecentForm': calculate_form_points(home_team, df)[0],
        'AwayTeamRecentForm': calculate_form_points(away_team, df)[1],
        'HomeTeamAvgGoals': df[df['HomeTeam'] == home_team]['FTHG'].mean(),
        'AwayTeamAvgGoals': df[df['AwayTeam'] == away_team]['FTAG'].mean(),
        'HomeTeamPoints': df[df['HomeTeam'] == home_team]['FTR_encoded'].sum() / df[df['HomeTeam'] == home_team]['FTR_encoded'].count(),
        'AwayTeamPoints': df[df['AwayTeam'] == away_team]['FTR_encoded'].sum() / df[df['AwayTeam'] == away_team]['FTR_encoded'].count(),
        'HomeGoalsScoredVariance': df[df['HomeTeam'] == home_team]['FTHG'].var(),
        'AwayGoalsScoredVariance': df[df['AwayTeam'] == away_team]['FTAG'].var(),
        'HomeGoalsScoredAvg_3': df[df['HomeTeam'] == home_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_3': df[df['AwayTeam'] == away_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_3': df[df['HomeTeam'] == home_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_3': df[df['AwayTeam'] == away_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsScoredAvg_5': df[df['HomeTeam'] == home_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_5': df[df['AwayTeam'] == away_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_5': df[df['HomeTeam'] == home_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_5': df[df['AwayTeam'] == away_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        
        # Add other features as required by the model, calculated or retrieved as done during training
        # ...
    }

    match_df = pd.DataFrame([match_data])

    # Make predictions
    probabilities = xgb_model.predict_proba(match_df)[0]
    home_team_win_prob = probabilities[0]
    draw_prob = probabilities[1]
    away_team_win_prob = probabilities[2]

    print(f"{home_team} win probability: {home_team_win_prob}")
    print(f"Draw probability: {draw_prob}")
    print(f"{away_team} win probability: {away_team_win_prob}")

    return home_team_win_prob, draw_prob, away_team_win_prob



print('##########################')
print('Today\'s matches')

predict_match('St Etienne', 'Laval', xgb_model, df, label_encoder)

predict_match('Caen', 'Concarneau', xgb_model, df, label_encoder)

predict_match('Grenoble', 'Dunkerque', xgb_model, df, label_encoder)

predict_match('Paris FC', 'Annecy', xgb_model, df, label_encoder)

predict_match('Quevilly Rouen', 'Guingamp', xgb_model, df, label_encoder)

predict_match('Rodez', 'Pau FC', xgb_model, df, label_encoder)

predict_match('Bastia', 'Angers', xgb_model, df, label_encoder)

predict_match('Troyes', 'Ajaccio', xgb_model, df, label_encoder)

predict_match('Valenciennes', 'Amiens', xgb_model, df, label_encoder)


##########################
Today's matches
Encoded Home Team: 17, Encoded Away Team: 12
St Etienne win probability: 0.1919000893831253
Draw probability: 0.7436269521713257
Laval win probability: 0.06447300314903259
Encoded Home Team: 7, Encoded Away Team: 8
Caen win probability: 0.45467352867126465
Draw probability: 0.4960976839065552
Concarneau win probability: 0.04922875761985779
Encoded Home Team: 10, Encoded Away Team: 9
Grenoble win probability: 0.3971242606639862
Draw probability: 0.552302360534668
Dunkerque win probability: 0.050573356449604034
Encoded Home Team: 13, Encoded Away Team: 3
Paris FC win probability: 0.3224872052669525
Draw probability: 0.5994745492935181
Annecy win probability: 0.07803826034069061
Encoded Home Team: 15, Encoded Away Team: 11
Quevilly Rouen win probability: 0.4085828363895416
Draw probability: 0.5393984317779541
Guingamp win probability: 0.052018746733665466
Encoded Home Team: 16, Encoded Away Team: 14
Rodez win probability: 0.21343238651752472
Draw

(0.4008002, 0.5689399, 0.0302599)

In [643]:

def predict_match(home_team, away_team, catboost_model, df, label_encoder):
     # Debugging: Check if the team names are in label_encoder's classes
    if home_team not in label_encoder.classes_:
        raise ValueError(f"Home team name '{home_team}' not recognized.")
    if away_team not in label_encoder.classes_:
        raise ValueError(f"Away team name '{away_team}' not recognized.")

    # Transform team names
    home_team_encoded = label_encoder.transform([home_team])[0]
    away_team_encoded = label_encoder.transform([away_team])[0]

    # Debugging: Print encoded values
    print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

    # Prepare the match data with relevant features
    match_data = {
        'HomeTeam_encoded': label_encoder.transform([home_team])[0],
        'AwayTeam_encoded': label_encoder.transform([away_team])[0],
        'HomeTeamRecentForm': calculate_form_points(home_team, df)[0],
        'AwayTeamRecentForm': calculate_form_points(away_team, df)[1],
        'HomeTeamAvgGoals': df[df['HomeTeam'] == home_team]['FTHG'].mean(),
        'AwayTeamAvgGoals': df[df['AwayTeam'] == away_team]['FTAG'].mean(),
        'HomeTeamPoints': df[df['HomeTeam'] == home_team]['FTR_encoded'].sum() / df[df['HomeTeam'] == home_team]['FTR_encoded'].count(),
        'AwayTeamPoints': df[df['AwayTeam'] == away_team]['FTR_encoded'].sum() / df[df['AwayTeam'] == away_team]['FTR_encoded'].count(),
        'HomeGoalsScoredVariance': df[df['HomeTeam'] == home_team]['FTHG'].var(),
        'AwayGoalsScoredVariance': df[df['AwayTeam'] == away_team]['FTAG'].var(),
        'HomeGoalsScoredAvg_3': df[df['HomeTeam'] == home_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_3': df[df['AwayTeam'] == away_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_3': df[df['HomeTeam'] == home_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_3': df[df['AwayTeam'] == away_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsScoredAvg_5': df[df['HomeTeam'] == home_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_5': df[df['AwayTeam'] == away_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_5': df[df['HomeTeam'] == home_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_5': df[df['AwayTeam'] == away_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        
        # Add other features as required by the model, calculated or retrieved as done during training
        # ...
    }

    match_df = pd.DataFrame([match_data])

    # Make predictions
    probabilities = catboost_model.predict_proba(match_df)[0]
    home_team_win_prob = probabilities[0]
    draw_prob = probabilities[1]
    away_team_win_prob = probabilities[2]

    print(f"{home_team} win probability: {home_team_win_prob}")
    print(f"Draw probability: {draw_prob}")
    print(f"{away_team} win probability: {away_team_win_prob}")

    return home_team_win_prob, draw_prob, away_team_win_prob



print('##########################')
print('Today\'s matches')

predict_match('St Etienne', 'Laval', catboost_model, df, label_encoder)

predict_match('Caen', 'Concarneau', catboost_model, df, label_encoder)

predict_match('Grenoble', 'Dunkerque', catboost_model, df, label_encoder)

predict_match('Paris FC', 'Annecy', catboost_model, df, label_encoder)

predict_match('Quevilly Rouen', 'Guingamp', catboost_model, df, label_encoder)

predict_match('Rodez', 'Pau FC', catboost_model, df, label_encoder)

predict_match('Bastia', 'Angers', catboost_model, df, label_encoder)

predict_match('Troyes', 'Ajaccio', catboost_model, df, label_encoder)

predict_match('Valenciennes', 'Amiens', catboost_model, df, label_encoder)

predict_match('Auxerre', 'Bordeaux', catboost_model, df, label_encoder)


##########################
Today's matches
Encoded Home Team: 17, Encoded Away Team: 12
St Etienne win probability: 0.41733230137147237
Draw probability: 0.49051421643205484
Laval win probability: 0.09215348219647271
Encoded Home Team: 7, Encoded Away Team: 8
Caen win probability: 0.4122002811732149
Draw probability: 0.4891236305135153
Concarneau win probability: 0.09867608831326978
Encoded Home Team: 10, Encoded Away Team: 9
Grenoble win probability: 0.4364251477504607
Draw probability: 0.4943091487330067
Dunkerque win probability: 0.06926570351653248
Encoded Home Team: 13, Encoded Away Team: 3
Paris FC win probability: 0.39555612827061304
Draw probability: 0.47173367314245157
Annecy win probability: 0.13271019858693536
Encoded Home Team: 15, Encoded Away Team: 11
Quevilly Rouen win probability: 0.42521677663137725
Draw probability: 0.4772683935172214
Guingamp win probability: 0.09751482985140134
Encoded Home Team: 16, Encoded Away Team: 14
Rodez win probability: 0.3942365863150441
Dr

(0.3714788108792658, 0.522258408504998, 0.10626278061573607)

In [638]:
from sklearn.impute import SimpleImputer
import numpy as np

# Initialize the imputer (you can change the strategy to 'median' or 'most_frequent' if more appropriate)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit on the training data and transform both training and testing data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)




## using random forest

In [639]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV



# Then scale the imputed data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Hyperparameter Tuning
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1_macro')
grid_search.fit(X_train_scaled, y_train_encoded)

# Best Model Evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.62      0.71        16
           1       0.11      0.25      0.15         4
           2       0.76      0.72      0.74        18

    accuracy                           0.63        38
   macro avg       0.57      0.53      0.54        38
weighted avg       0.72      0.63      0.67        38


In [641]:

def predict_match(home_team, away_team, best_model, df, label_encoder,scaler):
     # Debugging: Check if the team names are in label_encoder's classes
    if home_team not in label_encoder.classes_:
        raise ValueError(f"Home team name '{home_team}' not recognized.")
    if away_team not in label_encoder.classes_:
        raise ValueError(f"Away team name '{away_team}' not recognized.")

    # Transform team names
    home_team_encoded = label_encoder.transform([home_team])[0]
    away_team_encoded = label_encoder.transform([away_team])[0]

    # Debugging: Print encoded values
    print(f"Encoded Home Team: {home_team_encoded}, Encoded Away Team: {away_team_encoded}")

    # Prepare the match data with relevant features
    match_data = {
        'HomeTeam_encoded': label_encoder.transform([home_team])[0],
        'AwayTeam_encoded': label_encoder.transform([away_team])[0],
        'HomeTeamRecentForm': calculate_form_points(home_team, df)[0],
        'AwayTeamRecentForm': calculate_form_points(away_team, df)[1],
        'HomeTeamAvgGoals': df[df['HomeTeam'] == home_team]['FTHG'].mean(),
        'AwayTeamAvgGoals': df[df['AwayTeam'] == away_team]['FTAG'].mean(),
        'HomeTeamPoints': df[df['HomeTeam'] == home_team]['FTR_encoded'].sum() / df[df['HomeTeam'] == home_team]['FTR_encoded'].count(),
        'AwayTeamPoints': df[df['AwayTeam'] == away_team]['FTR_encoded'].sum() / df[df['AwayTeam'] == away_team]['FTR_encoded'].count(),
        'HomeGoalsScoredVariance': df[df['HomeTeam'] == home_team]['FTHG'].var(),
        'AwayGoalsScoredVariance': df[df['AwayTeam'] == away_team]['FTAG'].var(),
        'HomeGoalsScoredAvg_3': df[df['HomeTeam'] == home_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_3': df[df['AwayTeam'] == away_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_3': df[df['HomeTeam'] == home_team]['FTAG'].rolling(3, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_3': df[df['AwayTeam'] == away_team]['FTHG'].rolling(3, min_periods=1).mean().iloc[-1],
        'HomeGoalsScoredAvg_5': df[df['HomeTeam'] == home_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsScoredAvg_5': df[df['AwayTeam'] == away_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'HomeGoalsConcededAvg_5': df[df['HomeTeam'] == home_team]['FTAG'].rolling(5, min_periods=1).mean().iloc[-1],
        'AwayGoalsConcededAvg_5': df[df['AwayTeam'] == away_team]['FTHG'].rolling(5, min_periods=1).mean().iloc[-1],
        
        # Add other features as required by the model, calculated or retrieved as done during training
        # ...
    }

    match_df = pd.DataFrame([match_data])
    match_scaled = scaler.transform(match_df)
    # Make predictions
    home_team_win_prob = best_model.predict_proba(match_scaled)[0][0]
    away_team_win_prob = best_model.predict_proba(match_scaled)[0][2]
    draw_prob = best_model.predict_proba(match_scaled)[0][1]
    # Print the results
    print(f"{home_team} win probability: {home_team_win_prob}")
    print(f"{away_team} win probability: {away_team_win_prob}")
    print(f"Draw probability: {draw_prob}")
    
    return home_team_win_prob, away_team_win_prob, draw_prob



print('##########################')
print('Today\'s matches')

predict_match('St Etienne', 'Laval', best_model, df, label_encoder,scaler)

predict_match('Caen', 'Concarneau', best_model, df, label_encoder,scaler)

predict_match('Grenoble', 'Dunkerque', best_model, df, label_encoder,scaler)

predict_match('Paris FC', 'Annecy', best_model, df, label_encoder,scaler)

predict_match('Quevilly Rouen', 'Guingamp', best_model, df, label_encoder, scaler)

predict_match('Rodez', 'Pau FC', best_model, df, label_encoder, scaler)

predict_match('Bastia', 'Angers', best_model, df, label_encoder, scaler)

predict_match('Troyes', 'Ajaccio', best_model, df, label_encoder, scaler)

predict_match('Valenciennes', 'Amiens', best_model, df, label_encoder, scaler)


##########################
Today's matches
Encoded Home Team: 17, Encoded Away Team: 12
St Etienne win probability: 0.5175218253968255
Laval win probability: 0.2699285714285714
Draw probability: 0.21254960317460309
Encoded Home Team: 7, Encoded Away Team: 8
Caen win probability: 0.28498015873015875
Concarneau win probability: 0.44179761904761894
Draw probability: 0.2732222222222223
Encoded Home Team: 10, Encoded Away Team: 9
Grenoble win probability: 0.26754978354978365
Dunkerque win probability: 0.3689880952380953
Draw probability: 0.36346212121212135
Encoded Home Team: 13, Encoded Away Team: 3
Paris FC win probability: 0.12079761904761906
Annecy win probability: 0.6233869047619047
Draw probability: 0.2558154761904762
Encoded Home Team: 15, Encoded Away Team: 11




Quevilly Rouen win probability: 0.3630634920634921
Guingamp win probability: 0.38047222222222227
Draw probability: 0.2564642857142857
Encoded Home Team: 16, Encoded Away Team: 14
Rodez win probability: 0.2244107142857143
Pau FC win probability: 0.32725793650793655
Draw probability: 0.44833134920634926
Encoded Home Team: 5, Encoded Away Team: 2
Bastia win probability: 0.24737499999999998
Angers win probability: 0.24841071428571432
Draw probability: 0.5042142857142857
Encoded Home Team: 18, Encoded Away Team: 0
Troyes win probability: 0.3983869047619047
Ajaccio win probability: 0.1853055555555555
Draw probability: 0.41630753968253975
Encoded Home Team: 19, Encoded Away Team: 1
Valenciennes win probability: 0.40711309523809525
Amiens win probability: 0.07601984126984125
Draw probability: 0.5168670634920635


(0.40711309523809525, 0.07601984126984125, 0.5168670634920635)