# Premier League 2023-2024 prediction

## Importing libraries

In [None]:
import pandas as pd
import openpyxl

In [None]:
# Importing the dataset and creating a dataframe
df =  pd.read_excel('all-euro-data-2023-2024.xlsx', sheet_name='E0' )
df.head()

In [None]:
# convert the dataframe to csv file
df.to_csv('premier_league.csv')
df_new = pd.read_csv('premier_league.csv')
df_new.head()

In [None]:
# removing the first column
df_new.drop(df_new.columns[0], axis=1, inplace=True)
df_new.head()

In [None]:
# save the new dataframe to csv file 
df_new.to_csv('premier_league.csv')
df_new

## Data Preprocessing

In [None]:
# checking for missing values
df_new.isnull().sum()


In [None]:
# checking for duplicates
df_new.duplicated().sum()


In [None]:
# checking for outliers
import matplotlib.pyplot as plt
df_new.boxplot(figsize=(12,8))


## Performing Exploratory Data Analysis

In [None]:
# checking the total matches played
total_matches = df_new.shape[0]
print('Total matches played:', total_matches)

#checking the list of all teams involved
teams = df_new['HomeTeam'].unique()
print('Teams involved:', teams)

# checking the total number of goals scored
total_goals = df_new['FTHG'].sum() + df_new['FTAG'].sum()
print('Total goals scored so far:', total_goals)

# checking the total number of home wins for chelsea
chelsea_home_wins = df_new[(df_new['HomeTeam'] == 'Chelsea') & (df_new['FTR'] == 'H')].shape[0]
print('Total number of home wins for Chelsea:', chelsea_home_wins)

# checking the total number of home losses for manchester united
man_united_home_losses = df_new[(df_new['HomeTeam'] == 'Man United') & (df_new['FTR'] == 'A')].shape[0]
print('Total number of home losses for this shitty man utd team is: ', man_united_home_losses)

In [None]:
# get the current position and calculate points for chelsea and the rest of the teams
def calculate_points_goals(row):
    home_points, away_points = 0, 0
    home_goal_diff = row['FTHG'] - row['FTAG']
    away_goal_diff = row['FTAG'] - row['FTHG']

    if row['FTR'] == 'H':
        home_points = 3
    elif row['FTR'] == 'A':
        away_points = 3
    else:
        home_points = away_points = 1

    return pd.Series([home_points, away_points, home_goal_diff, away_goal_diff, row['FTHG'], row['FTAG']])

# Applying the function to the dataset
df_new[['HomePoints', 'AwayPoints', 'HomeGoalDiff', 'AwayGoalDiff', 'HomeGoals', 'AwayGoals']] = df_new.apply(calculate_points_goals, axis=1)

# Summarizing the data for each team
team_stats = pd.DataFrame(index=teams)

# Calculating total points, goal difference, and goals scored for each team
team_stats['Points'] = df_new.groupby('HomeTeam')['HomePoints'].sum() + df_new.groupby('AwayTeam')['AwayPoints'].sum()
team_stats['GoalDiff'] = df_new.groupby('HomeTeam')['HomeGoalDiff'].sum() + df_new.groupby('AwayTeam')['AwayGoalDiff'].sum()
team_stats['GoalsScored'] = df_new.groupby('HomeTeam')['HomeGoals'].sum() + df_new.groupby('AwayTeam')['AwayGoals'].sum()

# Sorting the teams based on Points, Goal Difference, and Goals Scored
sorted_teams = team_stats.sort_values(by=['Points', 'GoalDiff', 'GoalsScored'], ascending=[False, False, False])

# Finding Chelsea's position
chelsea_position_updated = sorted_teams.index.get_loc('Chelsea') + 1
print('Chelsea is currently in position:', chelsea_position_updated)
sorted_teams.head(), chelsea_position_updated


   

### Analysing Chelsea's performance

In [None]:
# team performance overview
Total_matches = df_new[(df_new['HomeTeam'] == 'Chelsea') | (df_new['AwayTeam'] == 'Chelsea')].shape[0]
Total_matches

# total number of goals scored by chelsea
Total_goals_scored = df_new[(df_new['HomeTeam']== 'Chelsea')]['FTHG'].sum() + df_new[(df_new['HomeTeam'] == 'Chelsea')]['FTAG'].sum()
Total_goals_scored

# total number of goals conceded by chelsea
Total_goals_conceded = df_new[(df_new['HomeTeam'] == 'Chelsea')]['FTAG'].sum() + df_new[(df_new['AwayTeam'] == 'Chelsea')]['FTHG'].sum()
Total_goals_conceded

# total number of home wins for chelsea
Total_home_wins = df_new[(df_new['HomeTeam'] == 'Chelsea') & (df_new['FTR'] == 'H')].shape[0]

# total number of home losses for chelsea
Total_home_losses = df_new[(df_new['HomeTeam'] == 'Chelsea') & (df_new['FTR'] == 'A')].shape[0]

# total number of away wins for chelsea
Total_away_wins = df_new[(df_new['AwayTeam'] == 'Chelsea') & (df_new['FTR'] == 'A')].shape[0]

# total number of away losses for chelsea
Total_away_losses = df_new[(df_new['AwayTeam'] == 'Chelsea') & (df_new['FTR'] == 'H')].shape[0]

# total number of home draws for chelsea
Total_home_draws = df_new[(df_new['HomeTeam'] == 'Chelsea') & (df_new['FTR'] == 'D')].shape[0]

# total number of away draws for chelsea
Total_away_draws = df_new[(df_new['AwayTeam'] == 'Chelsea') & (df_new['FTR'] == 'D')].shape[0]

print(f'Chelsea has played a total of {Total_matches} matches so far in the 2023-2024 season')
print(f'Chelsea has scored a total of {Total_goals_scored} goals so far in the 2023-2024 season')
print(f'Chelsea has conceded a total of {Total_goals_conceded} goals so far in the 2023-2024 season')
print(f'Chelsea has won a total of {Total_home_wins} home matches so far in the 2023-2024 season')
print(f'Chelsea has lost a total of {Total_home_losses} home matches so far in the 2023-2024 season')
print(f'Chelsea has won a total of {Total_away_wins} away matches so far in the 2023-2024 season')
print(f'Chelsea has lost a total of {Total_away_losses} away matches so far in the 2023-2024 season')
print(f'Chelsea has drawn a total of {Total_home_draws} home matches so far in the 2023-2024 season')
print(f'Chelsea has drawn a total of {Total_away_draws} away matches so far in the 2023-2024 season')



### Analysing the top 6 teams

In [None]:
# top 6 teams
top_6_teams = sorted_teams.head(6)
top_6_teams

### feature engineering

In [None]:
# selecting the features for the model
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encoding Team Names
label_encoder = LabelEncoder()
df_new['HomeTeamEncoded'] = label_encoder.fit_transform(df_new['HomeTeam'])
df_new['AwayTeamEncoded'] = label_encoder.transform(df_new['AwayTeam'])

# Calculate Team's Average Goals Scored and Conceded per Match
# Replace 'FTHG' and 'FTAG' with your actual column names for goals
average_goals_scored_home = df_new.groupby('HomeTeam')['FTHG'].mean()
average_goals_scored_away = df_new.groupby('AwayTeam')['FTAG'].mean()

# Mapping these averages to the main DataFrame
df_new['HomeTeamAvgGoals'] = df_new['HomeTeam'].map(average_goals_scored_home)
df_new['AwayTeamAvgGoals'] = df_new['AwayTeam'].map(average_goals_scored_away)

# Viewing the modified DataFrame
print(df_new.head())


In [None]:
# Calculate Home Team Win Percentage
home_team_wins = df_new[df_new['FTR'] == 'H'].groupby('HomeTeam').size()
total_home_matches = df_new.groupby('HomeTeam').size()
home_team_win_percentage = home_team_wins / total_home_matches

# Calculate Away Team Win Percentage
away_team_wins = df_new[df_new['FTR'] == 'A'].groupby('AwayTeam').size()
total_away_matches = df_new.groupby('AwayTeam').size()
away_team_win_percentage = away_team_wins / total_away_matches

# Create mappings for win percentages
home_team_win_percentage_map = home_team_win_percentage.to_dict()
away_team_win_percentage_map = away_team_win_percentage.to_dict()

# Map the win percentages to the original DataFrame
df_new['HomeTeamWinPercentage'] = df_new['HomeTeam'].map(home_team_win_percentage_map).fillna(0)
df_new['AwayTeamWinPercentage'] = df_new['AwayTeam'].map(away_team_win_percentage_map).fillna(0)

# Check the first few rows to confirm the new features
print(df_new[['HomeTeam', 'HomeTeamWinPercentage', 'AwayTeam', 'AwayTeamWinPercentage']].head())


In [None]:
# Ensure the DataFrame is sorted by date
df_new.sort_values('Date', inplace=True)

# Initialize columns for points and form
df_new['HomeTeamPoints'] = 0
df_new['AwayTeamPoints'] = 0
df_new['HomeTeamRecentForm'] = 0
df_new['AwayTeamRecentForm'] = 0

# Calculate points for each team
for team in df_new['HomeTeam'].unique():
    # Calculate points for the home team
    home_points = df_new[df_new['HomeTeam'] == team]['FTR'].apply(lambda x: 3 if x == 'H' else 1 if x == 'D' else 0).cumsum()
    away_points = df_new[df_new['AwayTeam'] == team]['FTR'].apply(lambda x: 3 if x == 'A' else 1 if x == 'D' else 0).cumsum()

    # Assign the points to the team's home and away matches
    df_new.loc[df_new['HomeTeam'] == team, 'HomeTeamPoints'] = home_points
    df_new.loc[df_new['AwayTeam'] == team, 'AwayTeamPoints'] = away_points

# Calculate recent form for each team
for team in df_new['HomeTeam'].unique():
    # Get indices of the team's home and away games
    home_indices = df_new[df_new['HomeTeam'] == team].index
    away_indices = df_new[df_new['AwayTeam'] == team].index
    
    # Calculate the form for the last 5 games, excluding the current match
    df_new.loc[home_indices, 'HomeTeamRecentForm'] = df_new.loc[home_indices, 'HomeTeamPoints'].diff().fillna(0).rolling(window=6, min_periods=1).sum().shift(fill_value=0)
    df_new.loc[away_indices, 'AwayTeamRecentForm'] = df_new.loc[away_indices, 'AwayTeamPoints'].diff().fillna(0).rolling(window=6, min_periods=1).sum().shift(fill_value=0)

# Display the head of the dataframe to verify
df_new[['Date', 'HomeTeam', 'HomeTeamPoints', 'HomeTeamRecentForm', 'AwayTeam', 'AwayTeamPoints', 'AwayTeamRecentForm']].tail(10)



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Selecting features and target
features = ['HomeTeamEncoded', 'AwayTeamEncoded', 'HomeTeamAvgGoals', 'AwayTeamAvgGoals', 'HomeTeamPoints', 'AwayTeamPoints', 'HomeTeamRecentForm', 'AwayTeamRecentForm',
    'HomeTeamWinPercentage', 'AwayTeamWinPercentage']
X = df_new[features]
y = df_new['FTR']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Addressing Class Imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter Tuning
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1_macro')
grid_search.fit(X_train_scaled, y_train_smote)

# Best Model Evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


In [None]:
# Convert odds to implied probabilities
for bookmaker in ['B365', 'BW', 'IW', 'WH']:
    for outcome in ['H', 'D', 'A']:
        odds_col = f'{bookmaker}{outcome}'
        prob_col = f'ImpliedProb{bookmaker}{outcome}'
        df_new[prob_col] = 1 / df_new[odds_col]

# Normalize implied probabilities
for bookmaker in ['B365', 'BW', 'IW', 'WH']:
    prob_cols = [f'ImpliedProb{bookmaker}{outcome}' for outcome in ['H', 'D', 'A']]
    df_new[prob_cols] = df_new[prob_cols].div(df_new[prob_cols].sum(axis=1), axis=0)

# Include Asian Handicap odds
df_new['AsianHandicapHome'] = df_new['B365AHH']  # Example for Bet365 Asian Handicap Home
df_new['AsianHandicapAway'] = df_new['B365AHA']  # Example for Bet365 Asian Handicap Away

# Include match statistics as features
match_stats_features = ['HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']
df_new[match_stats_features] = df_new[match_stats_features].fillna(df_new[match_stats_features].median())

# Your features now include odds, Asian handicaps, and match statistics
features.extend([f'ImpliedProb{bookmaker}{outcome}' for bookmaker in ['B365', 'BW', 'IW', 'WH'] for outcome in ['H', 'D', 'A']])
features.extend(['AsianHandicapHome', 'AsianHandicapAway'])
features.extend(match_stats_features)

# Now use these features in your model training and evaluation
# ...


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Define a list with models to evaluate
models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Support Vector Machine', SVC(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

# Record model performance
model_performance = {}

# Loop through models
for name, model in models:
    # Train the model
    model.fit(X_train_scaled, y_train_smote)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Record performance
    model_performance[name] = classification_report(y_test, y_pred, output_dict=True)

# Print the performance for each model
for name, performance in model_performance.items():
    print(f"Model: {name}")
    print(pd.DataFrame(performance).transpose())


#### Model using xgboost, lightgbm and catboost

In [None]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Initialize label encoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets with the encoded target
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# Now continue with training and evaluating the models as before
classifiers = {
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, verbosity=1, objective='multi:softprob'),
    "LightGBM": LGBMClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, verbose=-1),
    "CatBoost": CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, loss_function='MultiClass', verbose=False)
}

evaluation_results = {}

for classifier_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    evaluation_results[classifier_name] = report

# Display the results
for classifier_name, report in evaluation_results.items():
    print(f"Classifier: {classifier_name}")
    print(pd.DataFrame(report).transpose())
    print("\n")


### ensemble model with voting classifier

In [None]:
# from sklearn.ensemble import VotingClassifier
# from sklearn.metrics import classification_report
# 
# # Initialize the models
# log_reg = LogisticRegression(random_state=42)
# dec_tree = DecisionTreeClassifier(random_state=42)
# svc = SVC(probability=True, random_state=42)
# knn = KNeighborsClassifier()
# rand_forest = RandomForestClassifier(random_state=42)
# grad_boost = GradientBoostingClassifier(random_state=42)
# xgb = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, verbosity=1, objective='multi:softprob')
# lgbm = LGBMClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, verbose=-1)
# cat_boost = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, loss_function='MultiClass', verbose=False)
# 
# # Define a list called `estimators` containing tuples of the form (name, model)
# estimators = [
#     ('Logistic Regression', log_reg),
#     ('Decision Tree', dec_tree),
#     ('Support Vector Machine', svc),
#     ('K-Nearest Neighbors', knn),
#     ('Random Forest', rand_forest),
#     ('Gradient Boosting', grad_boost),
#     ('XGBoost', xgb),
#     ('LightGBM', lgbm),
#     ('CatBoost', cat_boost)
# ]
# 
# label_encoder = LabelEncoder()
# y_train_smote_encoded = label_encoder.fit_transform(y_train_smote)
# y_test_encoded = label_encoder.transform(y_test)
# 
# # Initialize voting classifier with 'soft' voting
# voting_clf = VotingClassifier(estimators=estimators, voting='soft')
# 
# # Fit the model on the TRAINING data that has been resampled and scaled
# voting_clf.fit(X_train_scaled, y_train_smote)
# 
# # Evaluate the model on the ORIGINAL TEST data (which has been scaled but not resampled)
# y_pred = voting_clf.predict(X_test_scaled)
# print(classification_report(y_test, y_pred))

In [None]:
# print("X_train_scaled shape:", X_train_scaled.shape)
# print("y_train_encoded shape:", y_train_encoded.shape)
# print("X_test_scaled shape:", X_test_scaled.shape)
# print("y_test_encoded shape:", y_test_encoded.shape)
# 


In [None]:
# from sklearn.preprocessing import LabelEncoder
# 
# # Initialize the label encoder
# label_encoder = LabelEncoder()
# 
# # Apply label encoding to the training and testing target variables separately
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_test_encoded = label_encoder.transform(y_test)
# 
# # Now verify the shapes again
# print("X_train_scaled shape:", X_train_scaled.shape)
# print("y_train_encoded shape:", y_train_encoded.shape)
# print("X_test_scaled shape:", X_test_scaled.shape)
# print("y_test_encoded shape:", y_test_encoded.shape)
# 
# # If the shapes now match, you can proceed with training the voting classifier
# if X_train_scaled.shape[0] == y_train_encoded.shape[0] and X_test_scaled.shape[0] == y_test_encoded.shape[0]:
#     voting_clf.fit(X_train_scaled, y_train_encoded)
#     y_pred = voting_clf.predict(X_test_scaled)
#     print(classification_report(y_test_encoded, y_pred))
# else:
#     print("Mismatch in dataset dimensions, please check data preprocessing steps.")


In [None]:
df_new.head()

In [None]:
from hyperopt import hp, tpe, Trials, fmin, STATUS_OK,space_eval
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import numpy as np

# Feature Engineering
# Ensure that this step is already done and df_new contains the engineered features


# Assuming 'df_new' is your DataFrame and 'E0' is in the 'Div'      
pd.get_dummies(df_new, columns=['Div'], drop_first=True)


# Hyperparameter Space
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'max_depth': hp.choice('max_depth', np.arange(3, 11, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 9, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample': hp.uniform('subsample', 0.7, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1)
}

def objective(params):
    params['n_estimators'] = int(params['n_estimators'])
    clf = XGBClassifier(**params, use_label_encoder=False, objective='multi:softprob', eval_metric='mlogloss')
    score = cross_val_score(clf, X_train_scaled, y_train, scoring='accuracy', cv=StratifiedKFold(5)).mean()
    return {'loss': -score, 'status': STATUS_OK}

# List of categorical column names
categorical_cols = ['HomeTeam', 'AwayTeam']  # Add all your categorical columns here

# Apply label encoding to each categorical column
for col in categorical_cols:
    df_new[col] = label_encoder.fit_transform(df_new[col])

# saving the label encoder
import pickle
pickle.dump(label_encoder, open('new_label_encoder.pkl', 'wb'))
# Encoding target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Run Hyperparameter Optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)

# Best hyperparameters
best_params = space_eval(space, best)
best_params['n_estimators'] = int(best_params['n_estimators'])
print(f"Best parameters: {best_params}")

# Initialize model with best parameters
model = XGBClassifier(**best_params, use_label_encoder=False, objective='multi:softprob', eval_metric='mlogloss')
model.fit(X_train_scaled, y_train_encoded)

# Prediction and Evaluation
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)
print(classification_report(y_test_encoded, y_pred))


# Calculate ROC AUC Score
try:
    roc_auc = roc_auc_score(y_test_encoded, y_pred_proba, multi_class='ovr')
    print(f"ROC AUC: {roc_auc}")
except ValueError as e:
    print("Error calculating ROC AUC:", e)

# Other metrics
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='macro')
recall = recall_score(y_test_encoded, y_pred, average='macro')
f1 = f1_score(y_test_encoded, y_pred, average='macro')
roc_auc = roc_auc_score(y_test_encoded, y_pred_proba, multi_class='ovo')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")


In [1]:
# catboost hyperparameter tuning
from hyperopt import hp, tpe, Trials, fmin, STATUS_OK,space_eval
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
import numpy as np

# Feature Engineering
# Ensure that this step is already done and df_new contains the engineered features

# Define hyperparameter space
space = {
    'iterations': hp.quniform('iterations', 100, 1000, 50),
    'depth': hp.choice('depth', np.arange(3, 11, dtype=int)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
    'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),
    'random_strength': hp.uniform('random_strength', 0, 1)
}

def objective(params):
    params['iterations'] = int(params['iterations'])
    clf = CatBoostClassifier(**params, loss_function='MultiClass', verbose=False)
    score = cross_val_score(clf, X_train_scaled, y_train, scoring='accuracy', cv=StratifiedKFold(3)).mean()
    return {'loss': -score, 'status': STATUS_OK}

# Run hyperparameter tuning
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=15, trials=trials)

# Get best hyperparameters
best_params = space_eval(space, best)

# Initialize model with best parameters
model = CatBoostClassifier(**best_params, loss_function='MultiClass', eval_metric='MultiClass', verbose=False)

# Fit the model
model.fit(X_train_scaled, y_train_encoded)

# Prediction and Evaluation
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)
print(classification_report(y_test_encoded, y_pred))

# Calculate ROC AUC Score
try:
    roc_auc = roc_auc_score(y_test_encoded, y_pred_proba, multi_class='ovr')
    print(f"ROC AUC: {roc_auc}")
except ValueError as e:
    print("Error calculating ROC AUC:", e)

# Other metrics
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='macro')
recall = recall_score(y_test_encoded, y_pred, average='macro')
f1 = f1_score(y_test_encoded, y_pred, average='macro')
roc_auc = roc_auc_score(y_test_encoded, y_pred_proba, multi_class='ovo')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")



  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]

job exception: name 'X_train_scaled' is not defined



  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]


NameError: name 'X_train_scaled' is not defined

### saving the catboost model

In [182]:
# save the encoder and scaler
import pickle
pickle.dump(label_encoder, open('label_encoder.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))

# save the model as a pickle file
pickle.dump(model, open('catboost_model.pkl', 'wb'))



In [183]:
# load the model from disk
import pickle
loaded_model = pickle.load(open('catboost_model.pkl', 'rb'))
result = loaded_model.score(X_test_scaled, y_test_encoded)
print(result)

0.6862745098039216


In [184]:
import pickle

# Load the saved objects
loaded_label_encoder = pickle.load(open('label_encoder.pkl', 'rb'))
loaded_scaler = pickle.load(open('scaler.pkl', 'rb'))
loaded_model = pickle.load(open('catboost_model.pkl', 'rb'))


In [195]:
print(features)

['HomeTeamEncoded', 'AwayTeamEncoded', 'HomeTeamAvgGoals', 'AwayTeamAvgGoals', 'HomeTeamPoints', 'AwayTeamPoints', 'HomeTeamRecentForm', 'AwayTeamRecentForm', 'HomeTeamWinPercentage', 'AwayTeamWinPercentage', 'ImpliedProbB365H', 'ImpliedProbB365D', 'ImpliedProbB365A', 'ImpliedProbBWH', 'ImpliedProbBWD', 'ImpliedProbBWA', 'ImpliedProbIWH', 'ImpliedProbIWD', 'ImpliedProbIWA', 'ImpliedProbWHH', 'ImpliedProbWHD', 'ImpliedProbWHA', 'AsianHandicapHome', 'AsianHandicapAway', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']


In [196]:
import pickle
import numpy as np

# Load the saved model and label encoder
model = pickle.load(open('catboost_model.pkl', 'rb'))
label_encoder = pickle.load(open('label_encoder.pkl', 'rb'))

# Function to encode team names
def encode_team(team_name):
    try:
        return label_encoder.transform([team_name])[0]
    except ValueError:
        print(f"Team name '{team_name}' not recognized.")
        return -1  # Or handle unknown teams differently

# Function to make a prediction with only home and away team names
def make_prediction(home_team, away_team):
    # Encode team names
    home_team_encoded = encode_team(home_team)
    away_team_encoded = encode_team(away_team)

    if home_team_encoded == -1 or away_team_encoded == -1:
        return "Invalid team name(s)"

    # Creating a feature array with only team names
    features_array = np.array([home_team_encoded, away_team_encoded]).reshape(1, -1)

    # Predicting the result
    prediction = model.predict(features_array)
    return prediction

# Example usage
home_team = input("Enter Home Team: ")
away_team = input("Enter Away Team: ")

predicted_result = make_prediction(home_team, away_team)
print("Predicted Result:", predicted_result)


Team name 'Chelsea' not recognized.
Team name 'Luton' not recognized.
Predicted Result: Invalid team name(s)


In [199]:
import pickle
from sklearn.preprocessing import LabelEncoder

# Assuming 'df_new' contains the correct team names in 'HomeTeam' and 'AwayTeam' columns
all_teams = pd.concat([df_new['HomeTeam'], df_new['AwayTeam']]).unique()
label_encoder_n= LabelEncoder()
label_encoder_n.fit(all_teams)

# Save the re-trained label encoder
pickle.dump(label_encoder_n, open('new_label_encoder.pkl', 'wb'))



In [200]:
print(label_encoder_n.classes_)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [209]:
import pickle
import numpy as np

# Load the new label encoder and CatBoost model
label_encoder_n = pickle.load(open('new_label_encoder.pkl', 'rb'))
model = pickle.load(open('catboost_model.pkl', 'rb'))

# Function to make a prediction
def predict_match(home_team, away_team):
    try:
        # Encode the team names using the new label encoder
        home_team_encoded = label_encoder_n.transform([home_team])[0]
        away_team_encoded = label_encoder_n.transform([away_team])[0]

        # Placeholder for other features (use average or typical values)
        other_features = np.array([0] * 18)  # Adjust the length according to your model's feature count

        # Combine all features
        features = np.append([home_team_encoded, away_team_encoded], other_features).reshape(1, -1)

        # Make a prediction
        prediction = model.predict(features)
        return label_encoder_n.inverse_transform(prediction)[0]
    except Exception as e:
        return f"Error during prediction: {e}"

# Example usage
home_team = 7
away_team = 7
result = predict_match(home_team, away_team)
print(f"Predicted Result: {result}")


Predicted Result: 2


  y = column_or_1d(y, warn=True)
