In [None]:
#Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# Load the combined EPL match table; the CSV's first column is used as the index.
matches = pd.read_csv("Resources/epl_matches_combined.csv", index_col=0) 
# Show the first rows as a quick check of what was loaded.
matches.head()


In [None]:
# Random forest classifier: feature columns, in the order they are selected
# from the prediction frame (`combined_df[predictors]`) further down.
predictors = [
    # fixture context
    "venue_code",
    "opp_code",
    "hour",
    "day_code",
    "team_code",
    # last_3_results for the team and for the opponent ("_2" suffix)
    "last_3_results",
    "last_3_results_2",
    # xg / xga columns for the team and for the opponent ("_2" suffix)
    "xg",
    "xga",
    "xg_2",
    "xga_2",
]
# Candidate features that are currently NOT part of `predictors`:
#   formation_code, formation_code_2,
#   last_3_gf, last_3_ga, last_3_avg_poss, last_3_avg_sot, last_3_gd,
#   last_3_gf_2, last_3_ga_2, last_3_avg_poss_2, last_3_avg_sot_2, opp_last_3_gd,
#   last_3_gd_diff, last_3_avg_poss_diff, last_3_avg_sot_diff, last_3_results_diff

# Prediction Based on User Selection

In [None]:
# Column labels that process_user_input applies (positionally) to the user input.
user_input_columns = ["venue_code", "opp_code", "hour", "day_code", 'team_code']

# Load the user's selection as a DataFrame.
# The path is relative, like the match data loaded from "Resources/" above: the
# previous absolute path ("/c:/Users/crcla/...") only existed on one machine.
# NOTE(review): assumes the notebook is run from the Presentation/ folder, which
# is where the old absolute path pointed — confirm.
user_input = pd.read_csv("Resources/user_input.csv")
user_input


In [None]:
# Columns pulled from `matches` for the Team_1 side of the fixture.
team_1_columns = [
    "Team_1",
    "formation_code",
    "last_3_results",
    "last_3_gf",
    "last_3_ga",
    "last_3_avg_poss",
    "last_3_avg_sot",
    "date",
]

# Columns pulled from `matches` for the Team_2 side; the stat columns carry a "_2" suffix.
team_2_columns = [
    "Team_2",
    "formation_code_2",
    "last_3_results_2",
    "last_3_gf_2",
    "last_3_ga_2",
    "last_3_avg_poss_2",
    "last_3_avg_sot_2",
    "date",
]



In [None]:
# Team names in code order: a team's code is its 1-based position in this list
# (process_user_input looks a name up with `teams_list[code - 1]`).
teams_list = [
    "Arsenal",
    "Aston Villa",
    "Bournemouth",
    "Brentford",
    "Brighton and Hove Albion",
    "Burnley",
    "Cardiff City",
    "Chelsea",
    "Crystal Palace",
    "Everton",
    "Fulham",
    "Huddersfield Town",
    "Leeds United",
    "Leicester City",
    "Liverpool",
    "Luton Town",
    "Manchester City",
    "Manchester United",
    "Newcastle United",
    "Norwich City",
    "Nottingham Forest",
    "Sheffield United",
    "Southampton",
    "Tottenham Hotspur",
    "Watford",
    "West Bromwich Albion",
    "West Ham United",
    "Wolverhampton Wanderers",
]

In [None]:
def _team_name_from_code(teams_list, code, field):
    """Translate a 1-based team code into its name, rejecting out-of-range codes."""
    # Without this guard a code of 0 (or a negative code) would silently wrap
    # around through Python's negative indexing and select the wrong team.
    if code < 1 or code > len(teams_list):
        raise IndexError(
            f"{field}={code!r} is outside the valid team-code range 1..{len(teams_list)}"
        )
    return teams_list[code - 1]


def _first_row_for_team(matches, team_column, team_name, columns):
    """Return the first `matches` row where `team_column == team_name` as a one-row DataFrame."""
    team_rows = matches.loc[matches[team_column] == team_name, columns]
    if team_rows.empty:
        raise IndexError(f"no rows in matches where {team_column} == {team_name!r}")
    return pd.DataFrame(team_rows.iloc[0].values.reshape(1, -1), columns=columns)


def process_user_input(user_input, teams_list, matches, team_1_columns, team_2_columns, opponent_column_headings, user_input_columns):
    """
    Process user input and create dataframes for team 1, team 2, and user input.

    The team names are looked up from the 1-based codes in the first row of
    `user_input` ("team_code" and "opp_code"). For each team, the FIRST row of
    `matches` for that team is taken.

    NOTE(review): the returned names say "last values", but the row used is
    `iloc[0]`. That is only the most recent match if `matches` is ordered
    newest-first — confirm the ordering of the dataset.

    Args:
        user_input (DataFrame): User input data; must contain "team_code" and "opp_code".
        teams_list (list): List of team names; code N maps to teams_list[N - 1].
        matches (DataFrame): Dataframe containing match data, with 'Team_1' and 'Team_2' columns.
        team_1_columns (list): Columns of `matches` to return for team 1.
        team_2_columns (list): Columns of `matches` to return for team 2.
        opponent_column_headings (list): Unused; kept so existing calls keep working.
        user_input_columns (list): Column labels applied, by position, to the user input values.

    Returns:
        team_1_last_values (DataFrame): One row with the `team_1_columns` values for team 1.
        team_2_last_values (DataFrame): One row with the `team_2_columns` values for team 2.
        user_input_df (DataFrame): One row with the user input, labelled with `user_input_columns`.

    Raises:
        IndexError: If a team code is outside 1..len(teams_list), or if `matches`
            has no row for the selected team.
    """
    # Resolve both team names from the codes supplied by the user.
    team_1_name = _team_name_from_code(teams_list, user_input["team_code"].values[0], "team_code")
    team_2_name = _team_name_from_code(teams_list, user_input["opp_code"].values[0], "opp_code")

    # Team 1 is matched on the 'Team_1' column, team 2 on the 'Team_2' column.
    team_1_last_values = _first_row_for_team(matches, "Team_1", team_1_name, team_1_columns)
    team_2_last_values = _first_row_for_team(matches, "Team_2", team_2_name, team_2_columns)

    # Flatten the user input into a single row; labels are assigned by position,
    # so `user_input_columns` must be in the same order as the input's columns.
    user_input_df = pd.DataFrame(np.array(user_input).reshape(1, -1), columns=user_input_columns)

    return team_1_last_values, team_2_last_values, user_input_df


In [None]:
def concatenate_dataframes(user_input_df, team_1_last_values, team_2_last_values):
    """
    Place the user input, team 1 values and team 2 values side by side in one DataFrame.

    Parameters:
    user_input_df (pandas.DataFrame): The user input data.
    team_1_last_values (pandas.DataFrame): The values selected for team 1.
    team_2_last_values (pandas.DataFrame): The values selected for team 2.

    Returns:
    pandas.DataFrame: The three frames joined column-wise, in the order given
    (rows are aligned on the index).

    """
    frames_in_order = [user_input_df, team_1_last_values, team_2_last_values]
    return pd.concat(frames_in_order, axis="columns")

# Build the three input frames from the user's selection first. Without this
# call the names used below are never assigned at notebook level, so a fresh
# kernel (Restart & Run All) fails with NameError.
team_1_last_values, team_2_last_values, user_input_df = process_user_input(
    user_input,
    teams_list,
    matches,
    team_1_columns,
    team_2_columns,
    team_2_columns,  # opponent_column_headings: required positionally but not used by the function
    user_input_columns,
)

# Example usage:
combined_df = concatenate_dataframes(user_input_df, team_1_last_values, team_2_last_values)
combined_df


In [None]:
# Calculate the comparison stats and add them to the dataframe
def calculate_differentials(combined_df, window):
    """
    Add goal-difference and team-vs-opponent differential columns to `combined_df`.

    Parameters:
    combined_df (DataFrame): Frame holding the "last_{window}_*" columns for the
        team and the matching "*_2" columns for the opponent. It is modified in place.
    window (int): Value substituted into the "last_{window}_..." column names.

    Returns:
    DataFrame: The same `combined_df` object, with six columns added:
        last_{window}_gd, opp_last_{window}_gd, last_{window}_gd_diff,
        last_{window}_avg_poss_diff, last_{window}_avg_sot_diff,
        last_{window}_results_diff.

    """
    prefix = f"last_{window}"
    team_gd = f"{prefix}_gd"
    opp_gd = f"opp_{prefix}_gd"

    # Goal difference (goals for minus goals against) for each side, then the gap between them.
    combined_df[team_gd] = combined_df[f"{prefix}_gf"] - combined_df[f"{prefix}_ga"]
    combined_df[opp_gd] = combined_df[f"{prefix}_gf_2"] - combined_df[f"{prefix}_ga_2"]
    combined_df[f"{team_gd}_diff"] = combined_df[team_gd] - combined_df[opp_gd]

    # Remaining stats: team value minus the opponent's "_2" value.
    for stat in ("avg_poss", "avg_sot", "results"):
        team_col = f"{prefix}_{stat}"
        combined_df[f"{team_col}_diff"] = combined_df[team_col] - combined_df[f"{team_col}_2"]

    return combined_df


# Add the differential columns; window=3 selects the "last_3_*" columns.
combined_df = calculate_differentials(combined_df, 3)
# Show the first rows of the result.
combined_df.head()

In [None]:
# Keep only the model's feature columns, in the order listed in `predictors`.
# NOTE(review): `predictors` includes 'xg', 'xga', 'xg_2' and 'xga_2', but none of
# the cells shown in this notebook add those columns to combined_df, so this
# selection raises KeyError unless they are supplied elsewhere — confirm.
combined_df = combined_df[predictors]

In [None]:
# Run the fitted model on the single-row feature frame and show the prediction.
# NOTE(review): rf_model is not defined in any cell visible in this notebook; it
# has to be trained or loaded before this cell runs — confirm where it comes from.
preds = rf_model.predict(combined_df)
preds