In [33]:
# Feature engineering best practices
# Keep the data simple: Don’t over-engineer. Create features that make logical sense based on the problem at hand.
# Handle missing data appropriately: Ensure that any missing values are handled before modeling.
# Normalize or scale the data: Some algorithms (like logistic regression and neural networks) work better when numerical features are scaled to a similar range.

# Import necessary libraries
import pandas as pd

# Load the cleaned dataset
data = pd.read_csv('../data/football_data_cleaned.csv')

# Proposed existing features (exist in the data)
# Home team goals (FTHG)
# Away team goals (FTAG)
# Home team shots (HS)
# Away team shots (AS)
# Home team shots on target (HST)
# Away team shots on target (AST)
# Match results (target variable)

# Create new features

# 1. Goal Difference (Home - Away)
data['Goal_Diff'] = data['FTHG'] - data['FTAG']

# 2. Home Shooting Accuracy (Home Shots on Target / Home Shots)
data['Home_Accuracy'] = data['HST'] / data['HS']

# 3. Away Shooting Accuracy (Away Shots on Target / Away Shots)
data['Away_Accuracy'] = data['AST'] / data['AS']

# 4. Total Goals (Home + Away)
data['Total_Goals'] = data['FTHG'] + data['FTAG']

# Check the first few rows to see the new features
data.head()

Unnamed: 0,Div,Date,Time,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AwayTeam_Sheffield United,AwayTeam_Southampton,AwayTeam_Tottenham,AwayTeam_West Ham,AwayTeam_Wolves,Result,Goal_Diff,Home_Accuracy,Away_Accuracy,Total_Goals
0,E0,2022-08-05,20:00,0,2,A,0,1,A,A Taylor,...,False,False,False,False,False,0,-2,0.2,0.2,2
1,E0,2022-08-06,12:30,2,2,D,1,0,H,A Madley,...,False,False,False,False,False,1,0,0.333333,0.363636,4
2,E0,2022-08-06,15:00,2,0,H,1,0,H,P Bankes,...,False,False,False,False,False,2,2,0.428571,0.133333,2
3,E0,2022-08-06,15:00,2,1,H,1,1,D,R Jones,...,False,False,False,False,True,2,1,0.333333,0.4,3
4,E0,2022-08-06,15:00,2,0,H,0,0,D,S Hooper,...,False,False,False,False,False,2,2,0.434783,0.0,2


In [34]:
# Check for missing values
print(data.isnull().sum())


Div              0
Date             0
Time             0
FTHG             0
FTAG             0
                ..
Result           0
Goal_Diff        0
Home_Accuracy    0
Away_Accuracy    0
Total_Goals      0
Length: 184, dtype: int64


In [35]:
# Remove rows with missing data or replace with mean

# Drop data
#data = data.dropna()

# Replace with mean
data['Home_Accuracy'].fillna(data['Home_Accuracy'].mean(), inplace=True)
data['Away_Accuracy'].fillna(data['Away_Accuracy'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Home_Accuracy'].fillna(data['Home_Accuracy'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Away_Accuracy'].fillna(data['Away_Accuracy'].mean(), inplace=True)


In [36]:
# Save the updated dataset with the new features.
# ONLY RUN THIS IF NEW FEATURES ARE ADDED
#data.to_csv('../data/football_data_cleaned.csv', index=False)

In [37]:
# Split into train and test datasets

# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Define the features (X) and the target variable (y)
# Drop unnecessary columns (we don't want to use goals or shots directly as features)
X = data[['Goal_Diff', 'Home_Accuracy', 'Away_Accuracy', 'Total_Goals']]
y = data['Result']

# Split the data: 80% training, 20% testing
# random_state=42 ensure the ratio stays the same
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the size of the splits
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 648 samples
Test set: 162 samples


In [38]:
# Scaling the data. Can help performance of logistic reg if numerical features are on a similar scale

# Import the StandardScaler
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Check scaled data (optional)
print(X_train_scaled[:5])  # Print first 5 rows of the scaled training data


[[-0.15006056  0.18838771  0.11646181 -0.60838567]
 [ 1.83441376 -0.15809392  1.51965834  0.53764315]
 [ 0.34605802 -0.15809392 -0.09828827 -0.03537126]
 [ 0.8421766   1.81252038  1.20694597  0.53764315]
 [ 0.34605802  0.59261629 -1.67962975 -1.18140007]]
