# Import Libraries and Setup MongoDB Connection

In [2]:
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import os

# Add script path for importing custom functions
import sys
sys.path.append('../scripts')

from mongo_connection import get_matches_collection, get_penalties_collection

# Load environment variables from .env file
load_dotenv()

# Retrieve collections
matches_collection = get_matches_collection()
penalties_collection = get_penalties_collection()


# Load Data into DataFrames

In [3]:
# Extract data into Pandas DataFrames
matches_df = pd.DataFrame(list(matches_collection.find()))
penalties_df = pd.DataFrame(list(penalties_collection.find()))

# Convert date columns to datetime
matches_df['date'] = pd.to_datetime(matches_df['date'])
penalties_df['date'] = pd.to_datetime(penalties_df['date'])

# Display the columns in matches_df
print("Columns in matches_df before merge:", matches_df.columns.tolist())

# Merge penalties data if needed, using specific suffixes to avoid conflicts
matches_df = pd.merge(matches_df, penalties_df[['date', 'home_team', 'away_team', 'winner']], 
                      on=['date', 'home_team', 'away_team'], how='left', suffixes=('', '_penalty'))

# Handle 'winner' column if it exists in both DataFrames
matches_df['winner'] = matches_df['winner'].fillna(matches_df['winner_penalty'])
matches_df.drop(columns=['winner_penalty'], inplace=True, errors='ignore')

# Display the columns in matches_df after merge
print("Columns in matches_df after merge:", matches_df.columns.tolist())

# Display the first few rows of the data
matches_df.head()


Columns in matches_df before merge: ['_id', 'date', 'home_team', 'away_team', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral', 'winner']
Columns in matches_df after merge: ['_id', 'date', 'home_team', 'away_team', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral', 'winner']


Unnamed: 0,_id,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winner
0,6692ad7288365a953ad9e558,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Draw
1,6692ad7288365a953ad9e559,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England
2,6692ad7288365a953ad9e55a,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland
3,6692ad7288365a953ad9e55b,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Draw
4,6692ad7288365a953ad9e55c,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland


# Check column presence

In [4]:
# Check the columns in matches_df
print("Columns in matches_df:", matches_df.columns.tolist())

# Ensure the DataFrame has all necessary columns before proceeding
required_columns = ['home_team', 'away_team', 'home_score', 'away_score', 'neutral', 'winner']
missing_columns = [col for col in required_columns if col not in matches_df.columns]
if missing_columns:
    raise ValueError(f"Missing columns in matches_df: {missing_columns}")


Columns in matches_df: ['_id', 'date', 'home_team', 'away_team', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral', 'winner']


# Feature Engineering

In [5]:
# Create additional features
matches_df['goal_diff'] = matches_df['home_score'] - matches_df['away_score']
matches_df['is_neutral'] = matches_df['neutral'].astype(int)
matches_df['is_home_win'] = (matches_df['winner'] == matches_df['home_team']).astype(int)
matches_df['is_away_win'] = (matches_df['winner'] == matches_df['away_team']).astype(int)
matches_df['is_draw'] = (matches_df['winner'] == 'draw').astype(int)

# Display the DataFrame with the new features
matches_df.head()


Unnamed: 0,_id,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winner,goal_diff,is_neutral,is_home_win,is_away_win,is_draw
0,6692ad7288365a953ad9e558,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Draw,0,0,0,0,0
1,6692ad7288365a953ad9e559,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England,2,0,1,0,0
2,6692ad7288365a953ad9e55a,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland,1,0,1,0,0
3,6692ad7288365a953ad9e55b,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Draw,0,0,0,0,0
4,6692ad7288365a953ad9e55c,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland,3,0,1,0,0


# Prepare Data for Model Training

In [6]:
# Define features and target
features = ['home_score', 'away_score', 'goal_diff', 'is_neutral']
target = 'winner'

X = matches_df[features]
y = matches_df[target]

# Encode the target variable using all data to include all possible classes
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

# Display the feature matrix and target vector
X.head(), y[:5]


(   home_score  away_score  goal_diff  is_neutral
 0           0           0          0           0
 1           4           2          2           0
 2           2           1          1           0
 3           2           2          0           0
 4           3           0          3           0,
 array([ 82,  89, 240,  82, 240]))

In [12]:
from sklearn.linear_model import LinearRegression

# Separate features and target variables
X = matches_df.drop(columns=['home_score', 'away_score', 'winner'])
y_home = matches_df['home_score']
y_away = matches_df['away_score']
# Split the data into training and testing sets
X_train, X_test, y_train_home, y_test_home = train_test_split(X, y_home, test_size=0.2, random_state=42)
X_train, X_test, y_train_away, y_test_away = train_test_split(X, y_away, test_size=0.2, random_state=42)
# Initialize the linear regression model
model_home = LinearRegression()
model_away = LinearRegression()

# Train the model on the training data
model_home.fit(X_train, y_train_home)
model_away.fit(X_train, y_train_away)

TypeError: float() argument must be a string or a real number, not 'ObjectId'

# Split Data into Training and Test Sets

In [7]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and test sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((37904, 4), (9477, 4), (37904,), (9477,))

# Train the Linear Regression Model

In [8]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


# Evaluate the Model

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Ensure the target names are correctly matched with the classes in y_test
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.inverse_transform(model.classes_)))


Accuracy: 0.20417853751187084
Confusion Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


ValueError: Number of classes, 284, does not match size of target_names, 309. Try specifying the labels parameter