Importing necessary libraries:

In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

Loading train data:

In [2]:
train_home_team_statistics_df = pd.read_csv("https://huggingface.co/datasets/bh2821/soccer_pred/resolve/main/train_home_team_statistics_df.csv", index_col=0)
train_away_team_statistics_df = pd.read_csv("https://huggingface.co/datasets/bh2821/soccer_pred/resolve/main/train_away_team_statistics_df.csv", index_col=0)

train_scores = pd.read_csv('https://huggingface.co/datasets/bh2821/soccer_pred/resolve/main/Y_train.csv', index_col=0)

In [None]:
train_home_team_statistics_df

Unnamed: 0_level_0,LEAGUE,TEAM_NAME,TEAM_SHOTS_TOTAL_season_sum,TEAM_SHOTS_INSIDEBOX_season_sum,TEAM_SHOTS_OFF_TARGET_season_sum,TEAM_SHOTS_ON_TARGET_season_sum,TEAM_SHOTS_OUTSIDEBOX_season_sum,TEAM_PASSES_season_sum,TEAM_SUCCESSFUL_PASSES_season_sum,TEAM_SAVES_season_sum,...,TEAM_YELLOWCARDS_5_last_match_std,TEAM_REDCARDS_5_last_match_std,TEAM_OFFSIDES_5_last_match_std,TEAM_ATTACKS_5_last_match_std,TEAM_PENALTIES_5_last_match_std,TEAM_SUBSTITUTIONS_5_last_match_std,TEAM_BALL_SAFE_5_last_match_std,TEAM_DANGEROUS_ATTACKS_5_last_match_std,TEAM_INJURIES_5_last_match_std,TEAM_GOALS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Ligue 1,Toulouse,3.0,2.0,5.0,2.0,1.0,2.0,2.0,5.0,...,3.0,0.0,6.0,0.0,10.0,8.0,7.0,2.0,4.0,3.0
1,Ligue 2,Brest,6.0,8.0,3.0,6.0,5.0,8.0,7.0,10.0,...,4.0,0.0,4.0,3.0,10.0,0.0,1.0,2.0,8.0,4.0
2,Serie A,Sampdoria,4.0,2.0,5.0,2.0,8.0,1.0,1.0,2.0,...,4.0,5.0,6.0,3.0,6.0,7.0,2.0,3.0,2.0,4.0
3,League One,Coventry City,7.0,5.0,5.0,6.0,6.0,9.0,9.0,2.0,...,4.0,0.0,1.0,8.0,8.0,5.0,5.0,5.0,,6.0
4,Premier League,Wolverhampton Wanderers,3.0,3.0,2.0,3.0,4.0,4.0,3.0,4.0,...,1.0,0.0,2.0,5.0,8.0,7.0,2.0,6.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12298,League One,Southend United,4.0,2.0,3.0,4.0,7.0,4.0,4.0,3.0,...,4.0,0.0,2.0,5.0,0.0,5.0,0.0,0.0,,4.0
12299,Liga Portugal,Boavista,4.0,2.0,3.0,1.0,5.0,1.0,1.0,9.0,...,6.0,10.0,7.0,2.0,0.0,0.0,3.0,10.0,6.0,1.0
12300,Bundesliga,Schalke 04,4.0,3.0,5.0,3.0,5.0,1.0,1.0,6.0,...,7.0,8.0,0.0,7.0,0.0,0.0,3.0,1.0,3.0,2.0
12301,League One,Fleetwood Town,2.0,,1.0,1.0,,,,0.0,...,10.0,10.0,,6.0,8.0,3.0,0.0,3.0,,5.0


Data Preprocessing:

In [3]:
train_home = train_home_team_statistics_df.iloc[:,2:] #select relevant feature columns
train_away = train_away_team_statistics_df.iloc[:,2:]

train_difference = train_home.to_numpy() - train_away.to_numpy()
train_difference_df = pd.DataFrame(train_difference)
train_difference_df.columns = 'DIFF_' + train_home.columns #create difference features

train_home.columns = 'HOME_' + train_home.columns
train_away.columns = 'AWAY_' + train_away.columns #rename columns to avoid conflicts

train_data =  pd.concat([train_home, train_away, train_difference_df], join='inner',axis=1) #combine everything into 1 feature matrix
train_scores = train_scores.loc[train_data.index] #align labels with features

train_data = train_data.replace({np.inf:np.nan,-np.inf:np.nan})

Data Cleaning and Filtering:

In [4]:
crit = train_data.notna().sum(axis=1) > 390

purified_train_data = train_data[crit]
purified_train_data = purified_train_data.fillna(value=0)

purified_train_scores = train_scores[crit]

Normalizing the Data:

In [5]:
purified_train_data = purified_train_data / 10

Defining functions for data augmentation (feature transformation):

In [6]:
def data_augmentation(append_on, df, func, name):
    df_new = append_on
    for c in df.columns:
        df_new[c+name] = func(df[c])
    return df_new

In [7]:
auged_train_data = purified_train_data.copy()
auged_train_data = data_augmentation(auged_train_data, (purified_train_data + 2), np.log10, "_log")
auged_train_data = data_augmentation(auged_train_data, (purified_train_data + 0), np.square, "_sqre")

  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new

Encoding Target Variable

In [8]:
train_new_y = purified_train_scores["HOME_WINS"] - purified_train_scores["AWAY_WINS"]

In [9]:
train_new_y = train_new_y + 1

Train-Test Spliting:

In [10]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(auged_train_data, train_new_y, train_size=0.8, random_state=2821)
X_train_mt, X_test_mt, y_train_mt, y_test_mt = model_selection.train_test_split(auged_train_data, purified_train_scores, train_size=0.8, random_state=2821)

Same Data Preprocessing procedure to process the given test data:

In [None]:
test_home_team_statistics_df = pd.read_csv("/content/test_home_team_statistics_df.csv")
test_away_team_statistics_df = pd.read_csv("/content/test_away_team_statistics_df.csv")
test_home = test_home_team_statistics_df.iloc[:, 2:]
test_away = test_away_team_statistics_df.iloc[:, 2:]

test_difference = test_home.to_numpy() - test_away.to_numpy()
test_difference_df = pd.DataFrame(test_difference, index=test_home.index)

test_difference_df.columns = 'DIFF_' + test_home.columns
test_home.columns = 'HOME_' + test_home.columns
test_away.columns = 'AWAY_' + test_away.columns

test_data = pd.concat([test_home, test_away, test_difference_df], axis=1)

test_data = test_data.replace({np.inf: np.nan, -np.inf: np.nan})

# crit_test = test_data.notna().sum(axis=1) > 390. # here we remove this criteria as we noticed including it results in missing values for predictions
# purified_test_data = test_data[crit_test]

purified_test_data = test_data.fillna(0)
purified_test_data = purified_test_data / 10

auged_test_data = purified_test_data.copy()
auged_test_data = data_augmentation(auged_test_data, (purified_test_data + 2), np.log10, "_log")
auged_test_data = data_augmentation(auged_test_data, purified_test_data, np.square, "_sqre")
# auged_test_data.shape
# X_test.shape
auged_test_data = auged_test_data.reindex(columns=X_train.columns, fill_value=0)

  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new[c+name] = func(df[c])
  df_new

#### Logitistic Regression:

In [None]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)



First attempt to get prediction result:

In [None]:
y_pred = model.predict(auged_test_data)
y_pred_probs = model.predict_proba(auged_test_data)
y_pred
y_pred_onehot = pd.get_dummies(y_pred)

y_pred_onehot = y_pred_onehot.rename(columns={
    0: 'AWAY_WINS',
    1: 'DRAW',
    2: 'HOME_WINS'
})

y_pred_onehot = y_pred_onehot[['HOME_WINS', 'DRAW', 'AWAY_WINS']]

In [None]:
y_pred_onehot.to_csv("test1.csv")

Grid Search to find best params:

In [None]:
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'newton-cg', 'saga'],
    'penalty': ['l2']
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_log_loss', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Log Loss (negative):", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
Best Log Loss (negative): -1.0295171854126892


In [None]:
best_model_lr = grid_search.best_estimator_
y_pred = best_model_lr.predict(X_test)
y_proba = best_model_lr.predict_proba(X_test)

print(grid_search.best_estimator_)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Log Loss:", log_loss(y_test, y_proba))

print("\n Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Away Win", "Draw", "Home Win"]))


Accuracy: 0.48563218390804597
Log Loss: 1.0277128195831162

 Confusion Matrix:
[[277  43 296]
 [170  46 325]
 [177  63 691]]

Classification Report:
              precision    recall  f1-score   support

    Away Win       0.44      0.45      0.45       616
        Draw       0.30      0.09      0.13       541
    Home Win       0.53      0.74      0.62       931

    accuracy                           0.49      2088
   macro avg       0.42      0.43      0.40      2088
weighted avg       0.44      0.49      0.44      2088



Using best logistic regression model to generate prediction:

In [None]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(auged_test_data)
y_pred_onehot = pd.get_dummies(y_pred)

y_pred_onehot = y_pred_onehot.rename(columns={
    0: 'AWAY_WINS',
    1: 'DRAW',
    2: 'HOME_WINS'
})

y_pred_onehot = y_pred_onehot[['HOME_WINS', 'DRAW', 'AWAY_WINS']]

In [None]:
y_pred_onehot.to_csv("test2.csv")

SVM (classifier)

In [11]:
svm_model = svm.SVC(kernel='rbf', C=1.0, probability=True)
svm_model.fit(X_train, y_train)

In [None]:
y_pred = svm_model.predict(X_test)
y_proba = svm_model.predict_proba(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Log Loss:", log_loss(y_test, y_proba))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Away Win", "Draw", "Home Win"]))

print("\n Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.5086206896551724
Log Loss: 1.0244084559151923

Classification Report:
              precision    recall  f1-score   support

    Away Win       0.49      0.44      0.46       616
        Draw       0.31      0.03      0.05       541
    Home Win       0.52      0.83      0.64       931

    accuracy                           0.51      2088
   macro avg       0.44      0.43      0.39      2088
weighted avg       0.46      0.51      0.44      2088


 Confusion Matrix:
[[272  20 324]
 [146  15 380]
 [142  14 775]]


In [None]:
y_pred = svm_model.predict(auged_test_data)
y_pred_onehot = pd.get_dummies(y_pred)

y_pred_onehot = y_pred_onehot.rename(columns={
    0: 'AWAY_WINS',
    1: 'DRAW',
    2: 'HOME_WINS'
})

y_pred_onehot = y_pred_onehot[['HOME_WINS', 'DRAW', 'AWAY_WINS']]
y_pred_onehot.to_csv("testSVM.csv")

Random forest:

In [None]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=30,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

In [None]:
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nLog Loss:", log_loss(y_test, y_proba))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Away Win", "Draw", "Home Win"]))

print("\n Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.5067049808429118

Log Loss: 1.0158590318571272

Classification Report:
              precision    recall  f1-score   support

    Away Win       0.49      0.44      0.46       616
        Draw       0.33      0.04      0.07       541
    Home Win       0.52      0.82      0.64       931

    accuracy                           0.51      2088
   macro avg       0.45      0.43      0.39      2088
weighted avg       0.46      0.51      0.44      2088


 Confusion Matrix:
[[272  23 321]
 [140  21 380]
 [147  19 765]]


In [None]:
y_pred = rf_model.predict(auged_test_data)
y_pred_onehot = pd.get_dummies(y_pred)

y_pred_onehot = y_pred_onehot.rename(columns={
    0: 'AWAY_WINS',
    1: 'DRAW',
    2: 'HOME_WINS'
})

y_pred_onehot = y_pred_onehot[['HOME_WINS', 'DRAW', 'AWAY_WINS']]
y_pred_onehot.to_csv("testRF.csv")

I tried to use BayesSearch to find the best params, but it was taking too long (more than 5 hours) and I had to give up:

In [None]:
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

search_spaces = {
    'n_estimators': (100, 1000),
    'max_depth': (10, 100),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 10),
    'max_features': ['sqrt', 'log2', None]
}

bayes_cv = BayesSearchCV(
    estimator=rf,
    search_spaces=search_spaces,
    scoring='neg_log_loss',
    n_iter=30,
    cv=StratifiedKFold(n_splits=3),
    n_jobs=-1,
    verbose=2,
    random_state=42
)

bayes_cv.fit(X_train, y_train)

print("Best Params:", bayes_cv.best_params_)
print("\nBest Score:", -bayes_cv.best_score_)

best_rf_model = bayes_cv.best_estimator_


Generating metrics table using ten different seeds on the models:

In [None]:
models = {
    "LogisticRegression": LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter=1000, C=0.01, penalty="l2"),
    "SVM": svm.SVC(kernel='rbf', C=1.0, probability=True),
    "RandomForest": RandomForestClassifier(n_estimators=300, max_depth=30, random_state=42, n_jobs=-1)
}

metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": lambda y_true, y_pred: precision_score(y_true, y_pred, average='weighted', zero_division=0),
    "Recall": lambda y_true, y_pred: recall_score(y_true, y_pred, average='weighted', zero_division=0),
    "F1": lambda y_true, y_pred: f1_score(y_true, y_pred, average='weighted', zero_division=0),
}

results = {model_name: {metric: [] for metric in metric_functions} for model_name in models}

seeds = range(10)
for seed in seeds:
    X_tr, X_te, y_tr, y_te = model_selection.train_test_split(auged_train_data, train_new_y, test_size=0.2, random_state=seed)

    for model_name, model in models.items():
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_te)

        for metric_name, metric_fn in metric_functions.items():
            score = metric_fn(y_te, y_pred)
            results[model_name][metric_name].append(score)



In [None]:
metrics_summary = {}

for metric_name in metric_functions:
    row = {}
    for model_name in models:
        scores = results[model_name][metric_name]
        mean = np.mean(scores)
        std = np.std(scores)
        row[model_name] = f"({mean - std:.4f}, {mean + std:.4f})"
    metrics_summary[metric_name] = row

metrics_df = pd.DataFrame(metrics_summary).T
metrics_df.index.name = "Metric"

In [None]:
metrics_df

Unnamed: 0_level_0,LogisticRegression,SVM,RandomForest
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Accuracy,"(0.4726, 0.4935)","(0.4813, 0.5018)","(0.4797, 0.5017)"
Precision,"(0.4282, 0.4608)","(0.4259, 0.4675)","(0.4356, 0.4599)"
Recall,"(0.4726, 0.4935)","(0.4813, 0.5018)","(0.4797, 0.5017)"
F1,"(0.4236, 0.4509)","(0.4053, 0.4342)","(0.4099, 0.4367)"
