In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# List of URLs for each season you want to scrape
urls = [
    'https://fbref.com/en/comps/189/2023-2024/schedule/2023-2024-Womens-Super-League-Scores-and-Fixtures',
    'https://fbref.com/en/comps/189/2022-2023/schedule/2022-2023-Womens-Super-League-Scores-and-Fixtures',
    'https://fbref.com/en/comps/189/2021-2022/schedule/2021-2022-Womens-Super-League-Scores-and-Fixtures',
    'https://fbref.com/en/comps/189/2020-2021/schedule/2020-2021-Womens-Super-League-Scores-and-Fixtures',
    'https://fbref.com/en/comps/189/2019-2020/schedule/2019-2020-Womens-Super-League-Scores-and-Fixtures',
    'https://fbref.com/en/comps/189/2018-2019/schedule/2018-2019-Womens-Super-League-Scores-and-Fixtures'
]
# Set up the Selenium WebDriver with automatic driver management
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Initialize an empty list to store dataframes for each season
dataframes = []

# Loop through each URL to scrape data
for url in urls:
    # Fetch the page
    driver.get(url)
    # Wait for the table to load
    driver.implicitly_wait(10)

    # Locate the table element using XPath with partial match
    table_element = driver.find_element(By.XPATH, "//table[starts-with(@id, 'sched_') and contains(@id, '_189_1')]")
    
    # Get the HTML content of the table
    table_html = table_element.get_attribute('outerHTML')
    
    # Use pandas to read the HTML table
    df = pd.read_html(table_html)[0]
    
    # Store the DataFrame 
    dataframes.append(df)

# Combine all the dataframes into one
all_seasons_df = pd.concat(dataframes, ignore_index=True)

# Close the driver
driver.quit()

# Display the combined data
print(all_seasons_df.head())

  df = pd.read_html(table_html)[0]
  df = pd.read_html(table_html)[0]
  df = pd.read_html(table_html)[0]
  df = pd.read_html(table_html)[0]
  df = pd.read_html(table_html)[0]
  df = pd.read_html(table_html)[0]


  Wk  Day        Date           Time          Home   xG Score xG.1  \
0  1  Sun  2023-10-01  12:30 (07:30)   Aston Villa  0.9   1–2  2.5   
1  1  Sun  2023-10-01  13:00 (08:00)       Everton  1.6   1–2  1.5   
2  1  Sun  2023-10-01  14:00 (09:00)  Bristol City  1.2   2–4  1.8   
3  1  Sun  2023-10-01  14:00 (09:00)       Arsenal  1.7   0–1  1.0   
4  1  Sun  2023-10-01  15:00 (10:00)      West Ham  0.2   0–2  1.8   

              Away Attendance                          Venue          Referee  \
0   Manchester Utd      12533                     Villa Park    Rebecca Welch   
1         Brighton       1295               Walton Hall Park     Kirsty Dowle   
2   Leicester City       4132            Ashton Gate Stadium  Elizabeth Simms   
3        Liverpool      54115               Emirates Stadium    Emily Heaslip   
4  Manchester City       2032  Chigwell Construction Stadium    Abigail Byrne   

   Match Report Notes  
0  Match Report   NaN  
1  Match Report   NaN  
2  Match Report   Na

In [3]:
# Drop unnecessary columns like 'Attendance', 'Time', 'Match Report', 'Notes'
all_seasons_df = all_seasons_df.drop(columns=['Attendance', 'Time', 'Match Report', 'Notes'])

In [4]:
# Remove apostrophes from stadium names
all_seasons_df['Venue'] = all_seasons_df['Venue'].str.replace("'", "", regex=False)

In [5]:
# Drop rows where 'Date' is 'Date' 
all_seasons_df = all_seasons_df[all_seasons_df['Date'] != 'Date']

# Drop rows where 'Date' is NaN (i.e., games where the date is missing)
all_seasons_df = all_seasons_df.dropna(subset=['Date'])

# Drop rows where 'Score' is NaN (i.e., games where the score is missing)
all_seasons_df = all_seasons_df.dropna(subset=['Score'])

# Fill empty xG with 0
all_seasons_df['xG'] = all_seasons_df['xG'].fillna(0.0)
all_seasons_df['xG.1'] = all_seasons_df['xG.1'].fillna(0.0)

# Fill missing values in the 'Referee' column with 'Unknown'
all_seasons_df['Referee'] = all_seasons_df['Referee'].fillna('Unknown')


# Example: Fill missing weeks based on sequential filling, if applicable
all_seasons_df['Wk'] = all_seasons_df['Wk'].fillna(method='ffill')


  all_seasons_df['Wk'] = all_seasons_df['Wk'].fillna(method='ffill')


In [6]:
# Create a dictionary mapping dates to match weeks (Wk)
date_to_week_mapping = {
    ('2019-09-07', '2019-09-08'): 1,
    ('2019-09-15', '2019-09-16'): 2,
    ('2019-09-28', '2019-09-29'): 3,
    ('2019-10-12', '2019-10-13'): 4,
    ('2019-10-27'): 5,
    ('2019-11-17'): 6,
    ('2019-11-24'): 7,
    ('2019-11-29', '2019-12-01', '2019-12-04'): 8,
    ('2019-12-08'): 9,
    ('2019-12-15'): 10,
    ('2020-01-05'): 11,
    ('2020-01-11', '2020-01-12'): 12,
    ('2020-01-19'): 13,
    ('2020-02-02'): 14,
    ('2020-02-12', '2020-02-13'): 16,
    ('2020-02-23'): 17
}

# Function to map dates to weeks
def get_week_from_date(date):
    for date_group, week in date_to_week_mapping.items():
        if date in date_group:
            return week
    return None  # Return None if no match is found

# Apply the function to rows where Wk is NaN
all_seasons_df.loc[all_seasons_df['Wk'].isna(), 'Wk'] = all_seasons_df['Date'].apply(get_week_from_date)

In [7]:
all_seasons_df['Date'] = pd.to_datetime(all_seasons_df['Date'], format='%Y-%m-%d')

# Remove rows where 'Score' is not in the format of 'X–Y', ensuring it matches typical score formats
all_seasons_df = all_seasons_df[all_seasons_df['Score'].str.contains('^\d+–\d+$', regex=True)]

# Split the 'Score' column into two new columns: 'home_goals' and 'away_goals' using the '–' delimiter
all_seasons_df[['home_goals', 'away_goals']] = all_seasons_df['Score'].str.split('–', expand=True).astype(int)

# Define a function to determine the match result based on the home and away goals
def determine_result(row):
    if row['home_goals'] > row['away_goals']:
        return 'Home win'
    elif row['home_goals'] < row['away_goals']:
        return 'Away win'
    else:
        return 'Draw'

# Apply the function to each row to create a new 'result' column indicating the match outcome
all_seasons_df['result'] = all_seasons_df.apply(determine_result, axis=1)

# Convert the 'Date' column to datetime format
all_seasons_df['Date'] = pd.to_datetime(all_seasons_df['Date'])

# Extract the day of the week from the 'Date' column and store it in a new 'Day' column
all_seasons_df['Day'] = all_seasons_df['Date'].dt.day_name()

# Create a 'season_start' column to represent the starting year of the season
# If the match date is before August, it assumes the match belongs to the previous season.
all_seasons_df['season_start'] = all_seasons_df['Date'].apply(lambda x: x.year - 1 if x.month < 8 else x.year)

# Convert the 'Day' column into dummy variables (one-hot encoding) for each day of the week
all_seasons_df = pd.get_dummies(all_seasons_df, columns=['Day'])

# Reset the index of the DataFrame to maintain a clean sequence of indices after filtering/cleaning
all_seasons_df.reset_index(drop=True, inplace=True)

# Update the rolling average goals for the home team in the main dataset
# For each unique team, filter its matches, sort by date, and calculate rolling averages
for x in all_seasons_df.Home.unique():
    # Filter all matches involving the team either as home or away
    temp_df = all_seasons_df[(all_seasons_df['Home'] == x) | (all_seasons_df['Away'] == x)]
    temp_df = temp_df.sort_values(['Date'])  # Sort the matches by date
    
    # Calculate the relevant goals for rolling averages (home goals if the team is home, away goals otherwise)
    temp_df['goal_value_to_calculate'] = temp_df.apply(lambda y: y['home_goals'] if y['Home'] == x else y['away_goals'], axis=1)
    
    # Calculate the rolling average of goals over a 5-match window, excluding the current match
    temp_df['rolling_avg_goals'] = temp_df['goal_value_to_calculate'].rolling(window=5, closed="left", min_periods=1).mean()
    
    # Update the main DataFrame with the rolling averages for home or away teams
    for index, row in temp_df.iterrows():
        if row['Home'] == x:
            all_seasons_df.at[index, 'home_rolling_avg_goals'] = row['rolling_avg_goals']
        else:
            all_seasons_df.at[index, 'away_rolling_avg_goals'] = row['rolling_avg_goals']

# Do the same process for expected goals (xG)
for x in all_seasons_df.Home.unique():
    # Filter all matches involving the team either as home or away
    temp_df = all_seasons_df[(all_seasons_df['Home'] == x) | (all_seasons_df['Away'] == x)]
    temp_df = temp_df.sort_values(['Date'])  # Sort the matches by date
    
    # Calculate the relevant xG for rolling averages (home xG if the team is home, away xG otherwise)
    temp_df['xG_value_to_calculate'] = temp_df.apply(lambda y: y['xG'] if y['Home'] == x else y['xG.1'], axis=1)
    
    # Calculate the rolling average of xG over a 5-match window, excluding the current match
    temp_df['rolling_avg_xG'] = temp_df['xG_value_to_calculate'].rolling(window=5, closed="left", min_periods=1).mean()
    
    # Update the main DataFrame with the rolling averages for home or away teams
    for index, row in temp_df.iterrows():
        if row['Home'] == x:
            all_seasons_df.at[index, 'home_rolling_avg_xG'] = row['rolling_avg_xG']
        else:
            all_seasons_df.at[index, 'away_rolling_avg_xG'] = row['rolling_avg_xG']

# Drop rows where any of the rolling average columns are NaN (to avoid incomplete data)
all_seasons_df = all_seasons_df.dropna(subset=['home_rolling_avg_goals', 'away_rolling_avg_goals', 'home_rolling_avg_xG', 'away_rolling_avg_xG'])

# Display the first few rows 
all_seasons_df.head()


Unnamed: 0,Wk,Date,Home,xG,Score,xG.1,Away,Venue,Referee,home_goals,...,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG
0,1,2023-10-01,Aston Villa,0.9,1–2,2.5,Manchester Utd,Villa Park,Rebecca Welch,1,...,False,False,True,False,False,False,3.0,2.0,1.36,1.48
1,1,2023-10-01,Everton,1.6,1–2,1.5,Brighton,Walton Hall Park,Kirsty Dowle,1,...,False,False,True,False,False,False,1.6,0.8,1.1,1.14
2,1,2023-10-01,Bristol City,1.2,2–4,1.8,Leicester City,Ashton Gate Stadium,Elizabeth Simms,2,...,False,False,True,False,False,False,0.8,1.2,0.74,1.26
3,1,2023-10-01,Arsenal,1.7,0–1,1.0,Liverpool,Emirates Stadium,Emily Heaslip,0,...,False,False,True,False,False,False,1.8,1.2,2.06,0.84
4,1,2023-10-01,West Ham,0.2,0–2,1.8,Manchester City,Chigwell Construction Stadium,Abigail Byrne,0,...,False,False,True,False,False,False,1.2,3.0,0.8,2.38


### Create df for first week fixtures to predict the results

In [8]:
import pandas as pd

# Fixtures for the first week
fixtures = [
    {'Wk': 1, 'Home': 'Chelsea', 'Away': 'Aston Villa', 'Venue': 'Cherry Red Records Fans Stadium', 'Referee': 'Kirsty Dowle',
     'home_rolling_avg_goals': 3.0, 'away_rolling_avg_goals': 1.2, 'home_rolling_avg_xG': 2.48, 'away_rolling_avg_xG': 0.70,
     'Day_Friday': 1, 'Home_Chelsea': 1, 'Away_Aston Villa': 1, 'Referee_Kirsty Dowle': 1, 'Venue_Cherry Red Records Fans Stadium': 1},
    
    {'Wk': 1, 'Home': 'Manchester Utd', 'Away': 'West Ham', 'Venue': 'Old Trafford', 'Referee': 'Cheryl Foster',
     'home_rolling_avg_goals': 1.6, 'away_rolling_avg_goals': 0.4, 'home_rolling_avg_xG': 1.50, 'away_rolling_avg_xG': 0.84,
     'Day_Saturday': 1, 'Home_Manchester Utd': 1, 'Away_West Ham': 1, 'Referee_Cheryl Foster': 1, 'Venue_Old Trafford': 1},
    
    {'Wk': 1, 'Home': 'Brighton', 'Away': 'Everton', 'Venue': 'Broadfield Stadium', 'Referee': 'Lauren Impey',
     'home_rolling_avg_goals': 1.2, 'away_rolling_avg_goals': 1.2, 'home_rolling_avg_xG': 1.00, 'away_rolling_avg_xG': 0.52,
     'Day_Saturday': 1, 'Home_Brighton': 1, 'Away_Everton': 1, 'Referee_Lauren Impey': 1, 'Venue_Broadfield Stadium': 1},
    
    {'Wk': 1, 'Home': 'Arsenal', 'Away': 'Manchester City', 'Venue': 'Emirates Stadium', 'Referee': 'Abi Byrne',
     'home_rolling_avg_goals': 2.8, 'away_rolling_avg_goals': 3.4, 'home_rolling_avg_xG': 2.92, 'away_rolling_avg_xG': 2.24,
     'Day_Sunday': 1, 'Home_Arsenal': 1, 'Away_Manchester City': 1, 'Referee_Abigail Byrne': 1, 'Venue_Emirates Stadium': 1},
    
    {'Wk': 1, 'Home': 'Liverpool', 'Away': 'Leicester City', 'Venue': 'Totally Wicked Stadium', 'Referee': 'Ade Soneye',
     'home_rolling_avg_goals': 1.8, 'away_rolling_avg_goals': 0.8, 'home_rolling_avg_xG': 1.02, 'away_rolling_avg_xG': 0.98,
     'Day_Sunday': 1, 'Home_Liverpool': 1, 'Away_Leicester City': 1, 'Referee_Adewunmi Soneye': 1, 'Venue_Totally Wicked Stadium': 1},
    
    {'Wk': 1, 'Home': 'Tottenham', 'Away': 'Crystal Palace', 'Venue': 'Gaughan Group Stadium', 'Referee': 'Amy Fearn',
     'home_rolling_avg_goals': 1.2, 'away_rolling_avg_goals': None, 'home_rolling_avg_xG': 1.82, 'away_rolling_avg_xG': None,
     'Day_Sunday': 1, 'Home_Tottenham': 1, 'Away_Crystal Palace': 1, 'Referee_Amy Fearn': 1, 'Venue_Gaughan Group Stadium': 1}
]

# Create a DataFrame from the fixtures
twenty_four_df = pd.DataFrame(fixtures)


In [9]:
# Calculate league median for xG and goals
median_xG = twenty_four_df['away_rolling_avg_xG'].median()
median_goals = twenty_four_df['away_rolling_avg_goals'].median()

# Identify rows related to Crystal Palace
crystal_palace_mask = twenty_four_df['Away_Crystal Palace'] == 1

# Fill missing values for Crystal Palace with league medians
twenty_four_df.loc[crystal_palace_mask, 'away_rolling_avg_xG'] = median_xG
twenty_four_df.loc[crystal_palace_mask, 'away_rolling_avg_goals'] = median_goals


In [None]:
twenty_four_df.columns

In [10]:
# Replace NaNs with 0 for categorical columns
categorical_columns = [
    'Day_Friday', 'Day_Saturday', 'Day_Sunday',
    'Home_Arsenal', 'Home_Brighton', 'Home_Chelsea', 'Home_Liverpool',
    'Home_Manchester Utd', 'Home_Tottenham',
    'Away_Aston Villa', 'Away_Everton', 'Away_Leicester City',
    'Away_Manchester City', 'Away_West Ham', 'Away_Crystal Palace',
    'Referee_Amy Fearn', 'Referee_Cheryl Foster', 'Referee_Kirsty Dowle',
    'Referee_Lauren Impey', 'Referee_Adewunmi Soneye', 'Referee_Abigail Byrne',
    'Venue_Broadfield Stadium', 'Venue_Emirates Stadium', 'Venue_Cherry Red Records Fans Stadium',
    'Venue_Gaughan Group Stadium', 'Venue_Old Trafford', 'Venue_Totally Wicked Stadium'
]

twenty_four_df[categorical_columns] = twenty_four_df[categorical_columns].fillna(0)

# Check the DataFrame to confirm NaNs are replaced
print(twenty_four_df.head())


   Wk            Home             Away                            Venue  \
0   1         Chelsea      Aston Villa  Cherry Red Records Fans Stadium   
1   1  Manchester Utd         West Ham                     Old Trafford   
2   1        Brighton          Everton               Broadfield Stadium   
3   1         Arsenal  Manchester City                 Emirates Stadium   
4   1       Liverpool   Leicester City           Totally Wicked Stadium   

         Referee  home_rolling_avg_goals  away_rolling_avg_goals  \
0   Kirsty Dowle                     3.0                     1.2   
1  Cheryl Foster                     1.6                     0.4   
2   Lauren Impey                     1.2                     1.2   
3      Abi Byrne                     2.8                     3.4   
4     Ade Soneye                     1.8                     0.8   

   home_rolling_avg_xG  away_rolling_avg_xG  Day_Friday  ...  \
0                 2.48                 0.70         1.0  ...   
1           

# Train the classifier model

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Filter data to include only the last 3 seasons
recent_seasons_df = all_seasons_df[all_seasons_df['season_start'] >= 2020]

# Define the features, dropping non-numeric and non-relevant columns
features = [column for column in recent_seasons_df.drop(columns=['Date', 'xG', 'xG.1', 'Home', 'Away', 'Referee', 'Venue',
                                                  'Score', 'result', 'home_goals', 'away_goals', 'season_start'])]

# Split the data into training and test sets based on the season
train_data = recent_seasons_df[recent_seasons_df['season_start'] < 2023]  # Training on all seasons before 2023
test_data = recent_seasons_df[recent_seasons_df['season_start'] == 2023]  # Testing on the 2023 season


# Define the features and target variable
X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']

# Initialize and train the RandomForestClassifier model with class weights
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate model performance
print("Predictions:", y_pred)


Predictions: ['Home win' 'Home win' 'Away win' 'Home win' 'Away win' 'Home win'
 'Home win' 'Home win' 'Home win' 'Home win' 'Home win' 'Away win'
 'Home win' 'Home win' 'Home win' 'Away win' 'Away win' 'Home win'
 'Away win' 'Away win' 'Away win' 'Home win' 'Away win' 'Away win'
 'Away win' 'Away win' 'Home win' 'Away win' 'Away win' 'Away win'
 'Home win' 'Home win' 'Home win' 'Away win' 'Away win' 'Away win'
 'Home win' 'Away win' 'Away win' 'Home win' 'Away win' 'Home win'
 'Away win' 'Home win' 'Home win' 'Home win' 'Home win' 'Home win'
 'Home win' 'Home win' 'Home win' 'Home win' 'Home win' 'Away win'
 'Away win' 'Home win' 'Away win' 'Away win' 'Home win' 'Home win'
 'Home win' 'Home win' 'Home win' 'Home win' 'Home win' 'Home win'
 'Away win' 'Home win' 'Home win' 'Away win' 'Away win' 'Away win'
 'Home win' 'Home win' 'Away win' 'Home win' 'Home win' 'Home win'
 'Home win' 'Home win' 'Draw' 'Away win' 'Home win' 'Home win' 'Home win'
 'Home win' 'Home win' 'Home win' 'Home wi

In [12]:
# Add features ommitted earlier and see if they improve the model further
recent_seasons_df = pd.get_dummies(recent_seasons_df, columns=['Home', 'Away', 'Referee', 'Venue'])
recent_seasons_df.head()

Unnamed: 0,Wk,Date,xG,Score,xG.1,home_goals,away_goals,result,season_start,Day_Friday,...,Venue_Stamford Bridge,Venue_The American Express Community Stadium,Venue_The Bankss Stadium,Venue_The Breyer Group Stadium,Venue_The Hive Stadium,Venue_The People’s Pension Stadium,Venue_Tottenham Hotspur Stadium,Venue_Twerton Park Stadium,Venue_Villa Park,Venue_Walton Hall Park
0,1,2023-10-01,0.9,1–2,2.5,1,2,Away win,2023,False,...,False,False,False,False,False,False,False,False,True,False
1,1,2023-10-01,1.6,1–2,1.5,1,2,Away win,2023,False,...,False,False,False,False,False,False,False,False,False,True
2,1,2023-10-01,1.2,2–4,1.8,2,4,Away win,2023,False,...,False,False,False,False,False,False,False,False,False,False
3,1,2023-10-01,1.7,0–1,1.0,0,1,Away win,2023,False,...,False,False,False,False,False,False,False,False,False,False
4,1,2023-10-01,0.2,0–2,1.8,0,2,Away win,2023,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


# Our baseline model has an accuracy of
recent_seasons_df['result'].value_counts(normalize=True)

# Let's first try to tune the hyperparameters
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, 20]
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

clf = RandomForestClassifier(random_state=1, max_depth= 5, n_estimators= 50)
clf.fit(X_train, y_train)
                             
                             
# Make predictions
predictions = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

Accuracy: 55.30%
{'max_depth': 5, 'n_estimators': 50}


0.5454545454545454

In [14]:
# Define the features and target variable
features = [column for column in recent_seasons_df.drop(columns=['Date', 'xG', 'xG.1', 'Score', 'result', 'home_goals', 'away_goals', 'season_start']).columns]

# Split the data into train and test sets
train_data = recent_seasons_df[recent_seasons_df['season_start'] < 2023]
test_data = recent_seasons_df[recent_seasons_df['season_start'] == 2023]

X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']

# Find the best hyperparameters for the Random Forest model
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

clf = RandomForestClassifier(random_state=1, max_depth= 5, n_estimators= 100)
clf.fit(X_train, y_train)
                             
                             
# Make predictions
predictions = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

{'max_depth': 5, 'n_estimators': 100}


0.5833333333333334

In [455]:
importances = best_model.feature_importances_
feature_names = X_train.columns
sorted_indices = importances.argsort()
for index in sorted_indices:
    print(f"{feature_names[index]}: {importances[index]}")


Referee_Edward Duckworth: 0.0
Referee_Constantine Hatzidakis: 0.0
Referee_Elizabeth Simms: 0.0
Referee_Elliot Swallow: 0.0
Referee_Grace Lowe: 0.0
Referee_Keith Stroud: 0.0
Referee_Martin Woods: 0.0
Referee_Megan Wilson: 0.0
Referee_Melissa Burgin: 0.0
Referee_Neil Hair: 0.0
Referee_Andy Madley: 0.0
Referee_Phoebe Cross: 0.0
Referee_Steven Hughes: 0.0
Referee_Sunny Gill: 0.0
Referee_Tim Robinson: 0.0
Referee_Tom Nield: 0.0
Venue_Anfield: 0.0
Venue_Ashton Gate Stadium: 0.0
Venue_Broadfield Stadium: 0.0
Venue_Joie Stadium: 0.0
Venue_Goodison Park: 0.0
Referee_Stephen Parkinson: 0.0
Referee_Adewunmi Soneye: 0.0
Venue_Gaughan Group Stadium: 0.0
Venue_Stamford Bridge: 0.0
Referee_Farai Hallam: 1.695984440431723e-05
Referee_Scott Jackson: 2.5314014140234345e-05
Referee_Adam Herczeg: 2.956435549423796e-05
Referee_Anthony Backhouse: 4.653726899687864e-05
Day_Thursday: 7.85072967173631e-05
Referee_Ryan Atkin: 8.032645407802952e-05
Referee_James Bell: 8.161968865867241e-05
Referee_Yoshimi Yamash

# Run predictions for week 1

In [15]:
# Ensure the DataFrame has all feature columns, filling missing columns with 0
missing_cols = set(X_train.columns) - set(twenty_four_df.columns)
for col in missing_cols:
    twenty_four_df[col] = 0

# Reorder the columns to match the model's feature columns
twenty_four_df = twenty_four_df[X_train.columns]


  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0
  twenty_four_df[col] = 0


In [16]:
# Make predictions for the fixtures
predictions = clf.predict(twenty_four_df)

print(predictions)

['Home win' 'Home win' 'Home win' 'Home win' 'Home win' 'Home win']
