In [25]:
#Import packages
import pandas as pd

In [26]:
#Read CSV
df = pd.read_csv("../data/processed/epl_matches_final.csv")

In [27]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,time,result,poss,sh,sot,dist,fk,pk,...,team_Newcastle Utd,team_Norwich City,team_Nottingham Forest,team_Sheffield United,team_Southampton,team_Tottenham,team_Watford,team_West Bromwich Albion,team_West Ham,team_Wolves
0,0,2019-08-09,20,1,57.0,15.0,7.0,17.1,1.0,0,...,False,False,False,False,False,False,False,False,False,False
1,1,2019-08-17,15,1,63.0,15.0,6.0,18.6,1.0,0,...,False,False,False,False,False,False,False,False,False,False
2,2,2019-08-24,17,1,52.0,24.0,4.0,18.8,0.0,1,...,False,False,False,False,False,False,False,False,False,False
3,3,2019-08-31,17,1,63.0,15.0,7.0,21.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
4,4,2019-09-14,12,1,74.0,21.0,8.0,13.6,0.0,0,...,False,False,False,False,False,False,False,False,False,False


In [28]:
#Convert date to datetime
df['date'] = pd.to_datetime(df['date'])

In [29]:
# Sort data by date to ensure chronological order
df_sorted = df.sort_values(by='date')

# Define a split point; use 80% of the data for training and 20% for testing
split_date = df_sorted['date'].quantile(0.8)

# Split the data
train_df = df_sorted[df_sorted['date'] <= split_date]
test_df = df_sorted[df_sorted['date'] > split_date]

In [30]:
target_column = 'result'
feature_columns = df.drop(['date', 'Unnamed: 0', 'result'], axis=1).columns

# For training data
X_train = train_df[feature_columns]
y_train = train_df[target_column]

# For testing data
X_test = test_df[feature_columns]
y_test = test_df[target_column]

# Remap labels from [-1, 0, 1] to [0, 1, 2]
y_train = y_train.replace(-1, 2)
y_test = y_test.replace(-1, 2)

In [31]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Initialize models
models = {
    'XGBoost': xgb.XGBClassifier(
    objective='multi:softmax',  # Multi-class classification
    num_class=3,                # Number of classes (win, loss, draw)
    eval_metric='mlogloss'     # Evaluation metric
)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

Model: XGBoost
Accuracy: 0.5539473684210526
              precision    recall  f1-score   support

           0       0.25      0.18      0.21       164
           1       0.59      0.71      0.64       298
           2       0.63      0.61      0.62       298

    accuracy                           0.55       760
   macro avg       0.49      0.50      0.49       760
weighted avg       0.53      0.55      0.54       760



In [32]:
from sklearn.model_selection import GridSearchCV
# Initialize the XGBoost model
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss')

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Review the best parameters
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.9}
Accuracy: 0.6039473684210527
              precision    recall  f1-score   support

           0       0.32      0.05      0.08       164
           1       0.60      0.81      0.69       298
           2       0.63      0.70      0.67       298

    accuracy                           0.60       760
   macro avg       0.52      0.52      0.48       760
weighted avg       0.55      0.60      0.55       760

