In [4]:
# notebooks/RandomForest.ipynb

## Import necessary libraries
import sys
sys.path.append('../scripts')

from mongo_connection import get_matches_collection
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [5]:
## Step 1: Data Preparation
# Fetch data from MongoDB
collection = get_matches_collection()
cursor = collection.find({})

# Convert MongoDB cursor to DataFrame
df = pd.DataFrame(list(cursor))

# Show the first few rows of the DataFrame
df.head()

Unnamed: 0,_id,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winner
0,6692ad7288365a953ad9e558,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Draw
1,6692ad7288365a953ad9e559,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England
2,6692ad7288365a953ad9e55a,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland
3,6692ad7288365a953ad9e55b,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Draw
4,6692ad7288365a953ad9e55c,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland


In [6]:
# Feature engineering
df['winner'] = df.apply(lambda row: 'Draw' if row['home_score'] == row['away_score'] else (row['home_team'] if row['home_score'] > row['away_score'] else row['away_team']), axis=1)

# Converting categorical columns to numerical (one-hot encoded)
df = pd.get_dummies(df, columns=['home_team', 'away_team', 'tournament', 'city', 'country', 'winner'], drop_first=True)

In [7]:
# Defining features and target variable
features = df.drop(columns=['_id', 'date', 'home_score', 'away_score', 'winner_Draw', 'winner_England', 'winner_Scotland'])
target = df['winner_Draw'] 

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [8]:
## Step 2: Model Training

# Initialize Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

In [9]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

## Step 3: Saving the Model
# Save the trained model to a file

joblib.dump(rf_model, '../models/random_forest_model.pkl')

# Load the trained model from the file
loaded_rf_model = joblib.load('../models/random_forest_model.pkl')

Accuracy: 0.9805845731771657
Classification Report:
               precision    recall  f1-score   support

       False       0.99      0.99      0.99      7434
        True       0.96      0.95      0.95      2043

    accuracy                           0.98      9477
   macro avg       0.97      0.97      0.97      9477
weighted avg       0.98      0.98      0.98      9477

Confusion Matrix:
 [[7353   81]
 [ 103 1940]]
