# Random Forest

In [None]:
# notebooks/RandomForest.ipynb

## Import necessary libraries
import os, sys
from pathlib import Path
sys.path.append('../scripts')

from mongo_connection import get_matches_collection
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Data Preparation

In [None]:
# Open the matches collection and prepare a query with an empty filter
# (an empty filter document selects every stored match).
collection = get_matches_collection()
match_all_filter = {}
cursor = collection.find(match_all_filter)

In [None]:
# Load every match document into a DataFrame.
#
# The query is issued here from `collection` instead of draining the shared
# `cursor` object: presumably this is a PyMongo cursor, which can be iterated
# only once, so re-running this cell against an already consumed cursor
# would silently produce an empty frame.
df = pd.DataFrame(list(collection.find({})))

# Fail early with a clear message rather than with a KeyError further down.
assert not df.empty, "No match documents were returned from the collection"

# Show the first few rows of the DataFrame
df.head()

In [None]:
# Feature engineering
#
# The model predicts whether a match ends in a draw. The label is derived
# directly from the two scores, and no winner-derived column is ever added to
# the feature matrix. Previously only two of the one-hot encoded winner
# columns were removed by name, so every remaining `winner_*` column leaked
# the match outcome into the features.
#
# `df` itself is left untouched so this cell can be re-run safely.

# Identifiers and raw outcome columns that must not be used as predictors
NON_FEATURE_COLUMNS = ['_id', 'date', 'home_score', 'away_score']
# Categorical columns that are one-hot encoded
CATEGORICAL_COLUMNS = ['home_team', 'away_team', 'tournament', 'city', 'country']

# Target variable: 1 when both sides scored the same number of goals, else 0
target = (df['home_score'] == df['away_score']).astype(int).rename('winner_Draw')

# Converting categorical columns to numerical (one-hot encoded)
features = pd.get_dummies(
    df.drop(columns=NON_FEATURE_COLUMNS),
    columns=CATEGORICAL_COLUMNS,
    drop_first=True,
)

# Invariants: rows stay aligned and nothing outcome-derived is a feature
assert len(features) == len(target)
leaked_columns = [name for name in features.columns if str(name).startswith('winner')]
assert not leaked_columns, f"Outcome columns leaked into features: {leaked_columns}"

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Model Training

In [None]:
## Step 2: Model Training

# Build a 100-tree Random Forest with a fixed seed and fit it on the
# training split in a single expression (fit returns the estimator itself).
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred = rf_model.predict(X_test)

# Model Evaluation

In [None]:
# Compute the test-set metrics first, then print each one after its label.
evaluation_outputs = [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
]
for metric_label, metric_value in evaluation_outputs:
    print(metric_label, metric_value)

In [None]:
# Confusion matrix of the test predictions, normalized over the true labels
# (normalize='true') and drawn with the reversed orange colour map.
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    normalize='true',
    cmap='Oranges_r',
)
plt.gca().set_title('Normalized Confusion Matrix')
plt.show()

In [None]:
# Pair each feature column with the importance reported by the fitted forest,
# then order the table from most to least important.
feature_importances = rf_model.feature_importances_
importance_table = pd.DataFrame(
    {'Feature': features.columns, 'Importance': feature_importances}
)
feature_importance_df = importance_table.sort_values('Importance', ascending=False)

In [None]:
# Feature importance visualization: horizontal bars for the strongest features.
top_n = 20  # Number of top features to display
sorted_importances = feature_importance_df.head(top_n)

fig, ax = plt.subplots(figsize=(10, 8))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; confirm the installed version before changing this call.
sns.barplot(x='Importance', y='Feature', data=sorted_importances, palette='viridis', ax=ax)
# The title reports the number of bars actually drawn, so it stays correct
# when `top_n` is changed or when fewer than `top_n` features exist.
ax.set_title(f'Top {len(sorted_importances)} Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()


In [None]:
# Scores for the ROC analysis: take column index 1 of predict_proba as the
# score for each test row, then derive the AUC and the curve coordinates.
positive_class_column = 1
y_prob = rf_model.predict_proba(X_test)[:, positive_class_column]
auc_score = roc_auc_score(y_test, y_prob)
fpr, tpr, _ = roc_curve(y_test, y_prob)

In [None]:
# ROC curve visualization: model curve plus the diagonal reference line.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}', color='darkorange')
# Diagonal line. The dash style is passed as `linestyle` instead of the
# 'k--' format string: that string also sets a colour (black), which
# clashed with color='gray' and made matplotlib emit a UserWarning.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

In [None]:
# Precision-recall curve computed from the test labels and the predicted
# probabilities, drawn with recall on the x axis and precision on the y axis.
precision, recall, _ = precision_recall_curve(y_test, y_prob)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.', color='blue')
ax.set(
    xlabel='Recall',
    ylabel='Precision',
    title='Precision-Recall Curve',
)
ax.grid(True)
plt.show()

In [None]:
# 5-fold cross-validated accuracy over the full feature matrix and target.
cv_scores = cross_val_score(
    estimator=rf_model,
    X=features,
    y=target,
    scoring='accuracy',
    cv=5,
)
mean_cv_accuracy = cv_scores.mean()

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", mean_cv_accuracy)

In [None]:
# Per-fold accuracy as a line plot, with the mean shown as a dashed red line.
fold_numbers = range(1, len(cv_scores) + 1)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fold_numbers, cv_scores, marker='o', linestyle='-', color='green')
ax.axhline(y=np.mean(cv_scores), color='red', linestyle='--', label='Mean Accuracy')
ax.set(
    xlabel='Fold Number',
    ylabel='Accuracy',
    title='Cross-Validation Scores',
)
ax.legend()
ax.grid(True)
plt.show()


# Saving the Model

In [None]:
## Step 4: Model Persistence

# Directory and file that hold the serialized model. The path is defined once
# so the mkdir, dump and load steps cannot drift apart.
models_dir = Path('../models')
model_path = models_dir / 'random_forest_model.pkl'

# Create the directory if needed. `exist_ok=True` makes this safe to re-run
# and removes the separate check-then-create step.
models_dir.mkdir(mode=0o755, parents=True, exist_ok=True)

# Save the trained model to a file
joblib.dump(rf_model, model_path)

# Load the trained model from the file.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_rf_model = joblib.load(model_path)