In [1]:
import os

import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [132]:
# Location of the raw results CSV. Set the EPL_RESULTS_CSV environment variable to
# point at your own copy of the file; when it is not set, the original author's
# local path is used so existing runs behave exactly as before.
RESULTS_CSV_PATH = os.environ.get(
    "EPL_RESULTS_CSV",
    "/Users/eden/Desktop/Uni/Side projects/EPL_prediction/epl_prediction/data/results.csv",
)
results = pd.read_csv(RESULTS_CSV_PATH, encoding='ISO-8859-1')

In [None]:
# Leave the raw frame untouched: work on a separate copy, discard every row
# that has at least one missing value, then summarise what remains.
epl_results = results.copy()
epl_results = epl_results.dropna()
epl_results.info()

### Let's inspect the data and check whether it is consistent throughout the dataset

In [None]:
# League-wide per-match averages for each season (home + away combined):
# goals, shots, shots on target, corners, fouls, yellow cards and red cards.

def _per_match_average_by_season(matches, columns):
    """Return, for each season, the summed total of `columns` divided by the number of matches.

    Parameters
    ----------
    matches : pandas.DataFrame
        Match rows containing a "Season" column and every column in `columns`.
    columns : list of str
        The home/away column pair to add together (e.g. ["HS", "AS"]).

    Returns
    -------
    pandas.Series
        Indexed by season; one combined per-match average per season.
    """
    by_season = matches.groupby("Season")
    return by_season[columns].sum().sum(axis=1) / by_season.size()


ftg_by_season = _per_match_average_by_season(epl_results, ["FTHG", "FTAG"])
# NOTE: summed over the whole league, goals conceded are the same goals as goals
# scored, so this series is identical to ftg_by_season. It is kept so the
# variable and its panel still exist, but it carries no extra information.
ftgc_by_season = _per_match_average_by_season(epl_results, ["FTHG", "FTAG"])
fts_by_season = _per_match_average_by_season(epl_results, ["HS", "AS"])
ftst_by_season = _per_match_average_by_season(epl_results, ["HST", "AST"])
fc_by_season = _per_match_average_by_season(epl_results, ["HC", "AC"])
ff_by_season = _per_match_average_by_season(epl_results, ["HF", "AF"])
fhy_by_season = _per_match_average_by_season(epl_results, ["HY", "AY"])
fhr_by_season = _per_match_average_by_season(epl_results, ["HR", "AR"])

# Panels in row-major order: (row 1, col 1), (row 1, col 2), (row 2, col 1), ...
season_panels = [
    ("Full time goals", ftg_by_season),
    ("Full time goals conceded", ftgc_by_season),
    ("Full time shots", fts_by_season),
    ("Full time shots on target", ftst_by_season),
    ("Full time corners", fc_by_season),
    ("Full time fouls", ff_by_season),
    ("Full time yellow cards", fhy_by_season),
    ("Full time red cards", fhr_by_season),
]

fig = make_subplots(
    rows=4,
    cols=2,
    shared_xaxes=True,
    subplot_titles=[panel_title for panel_title, _ in season_panels],
)
for position, (panel_title, series) in enumerate(season_panels):
    row_offset, col_offset = divmod(position, 2)
    fig.add_trace(
        go.Scatter(x=series.index.astype(str), y=series.values, name=panel_title),
        row=row_offset + 1,
        col=col_offset + 1,
    )

fig.update_layout(title="Season-long stats by season")
fig.update_xaxes(type="category")
# Layout-level xaxis_title / yaxis_title only label the first subplot, so the
# axis titles are set per subplot instead.
for col in (1, 2):
    fig.update_xaxes(title_text="Season", row=4, col=col)
for row in range(1, 5):
    fig.update_yaxes(title_text="Average per match", row=row, col=1)
fig.show()



All follow a rough consistent trend barring shots on target which sees a significant drop between the 2012-13 and 2013-14 seasons. It is likely that the way in which shots on target are counted has changed, so I won't use this feature, but apart from that it's probably fine.

In [None]:
# Number of matches for each full-time result code in "FTR".
full_time_result = epl_results["FTR"]
ftr_h_count = len(epl_results.loc[full_time_result.eq("H")])
ftr_a_count = len(epl_results.loc[full_time_result.eq("A")])
ftr_d_count = len(epl_results.loc[full_time_result.eq("D")])

# One bar per outcome, in the order home wins / away wins / draws.
result_bars = go.Bar(
    x=["Home wins", "Away wins", "Draws"],
    y=[ftr_h_count, ftr_a_count, ftr_d_count],
    name="Full time results",
)
fig = go.Figure()
fig.add_trace(result_bars)
fig.update_layout(title="Full time results", xaxis_title="Result", yaxis_title="Count")
fig.show()

It's clear that playing at home or away makes a significant difference, so we will separate the data into home and away stats, as well as overall stats.

- 'Season' - needed for season-long stats, but probably not useful for individual game prediction?
- 'DateTime' - not useful
- 'HomeTeam' - needed 
- 'AwayTeam' - needed
- 'FTHG' - probably useful
- 'FTAG' - probably useful
- 'FTR' - result of the game, target variable
- 'HTHG' - probably not useful
- 'HTAG' - probably not useful
- 'HTR' - probably not useful
- 'Referee' - Probably useful, but referees are not consistent throughout the years, 
            so we probably could not reliably use this feature.
- 'HS' - probably useful
- 'AS' - probably useful
- 'HST' - not useful, as mentioned previously
- 'AST' - not useful, as mentioned previously
- 'HC' - maybe useful
- 'AC' - maybe useful
- 'HF' - maybe useful
- 'AF' - maybe useful
- 'HY' - maybe useful
- 'AY' - maybe useful
- 'HR' - maybe useful
- 'AR' - maybe useful

# Feature selection

Unfortunately, we will be unable to predict any results based on the stats for a single game, as we will only have previous data.
Therefore, we will need to aggregate data to a season level. I'm going to try and predict the outcome of a game based on a team's historical performances. Specific information against the opponent may also be useful.

I'm going to start by just picking a ton of different stats, and then we will evaluate their significance later, and trim down the number of features.

Features:
- Per season stats:
    - Points per game (home, away, total)
    - Goals scored per game (home, away, total)
    - Goals conceded per game (home, away, total)
    - Goal difference per game (home, away, total)
    - Shots per game (home, away, total)
    - Fouls per game (home, away, total)
    - Corners per game (home, away, total)
    - Yellow cards per game (home, away, total)
    - Red cards per game (home, away, total)
    - Finishing position

- Per last 5 games against opponent:
    - Points against opponent per game (home, away, total)
    - Goals scored against opponent per game (home, away, total)
    - Shots against opponent per game (home, away, total)
    - Fouls against opponent per game (home, away, total)
    - Corners against opponent per game (home, away, total)
    - Yellow cards against opponent per game (home, away, total)
    - Red cards against opponent per game (home, away, total)

# Per-Season stats

In [4]:
# Matches of the 2019-20 season only, taken as a copy of the selected rows.
season_2019_20 = epl_results.loc[epl_results["Season"].eq("2019-20")].copy()

In [5]:
"""
Calculate points per game for each team
- Home team points per game at home
- Away team points per game on away
- Overall points per game for both teams
"""

def calculate_points_for_season(season):
    """Attach each team's points per game to every match row.

    Points are 3 for a win, 1 for a draw and 0 otherwise, read from "FTR"
    ("H" = home side won, "A" = away side won, "D" = draw).

    Parameters
    ----------
    season : pandas.DataFrame
        Matches to aggregate over (the notebook passes one season at a time).
        Must contain "HomeTeam", "AwayTeam" and "FTR".

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus, in this order:
        HomeTeamPointsPGAtHome, HomeTeamPointsPGOverall,
        AwayTeamPointsPGOnAway, AwayTeamPointsPGOverall.

    Notes
    -----
    The averages cover every row of `season`, including the row they are
    attached to. "Overall" is total points divided by total games, so it stays
    correct when a team has an unequal number of home and away matches.
    """
    results = season["FTR"]
    # Unrecognised result codes earn no points but still count as a game played.
    host_points = results.map({"H": 3, "D": 1, "A": 0}).fillna(0)
    visitor_points = results.map({"A": 3, "D": 1, "H": 0}).fillna(0)

    points_at_home = host_points.groupby(season["HomeTeam"]).mean()
    points_on_away = visitor_points.groupby(season["AwayTeam"]).mean()

    # One row per (team, match) so the overall figure is a true per-game mean.
    team_matches = pd.concat([
        pd.DataFrame({"Team": season["HomeTeam"], "Points": host_points}),
        pd.DataFrame({"Team": season["AwayTeam"], "Points": visitor_points}),
    ])
    points_overall = team_matches.groupby("Team")["Points"].mean()

    return season.assign(
        HomeTeamPointsPGAtHome=season["HomeTeam"].map(points_at_home),
        HomeTeamPointsPGOverall=season["HomeTeam"].map(points_overall),
        AwayTeamPointsPGOnAway=season["AwayTeam"].map(points_on_away),
        AwayTeamPointsPGOverall=season["AwayTeam"].map(points_overall),
    )


In [6]:
"""
Goals scored per game for each team
- Home team goals scored per game at home
- Away team goals scored per game on away
- Overall goals scored per game for both teams
"""
def calculate_goals_scored_for_season(season):
    """Attach each team's goals scored per game to every match row.

    Parameters
    ----------
    season : pandas.DataFrame
        Matches to aggregate over (the notebook passes one season at a time).
        Must contain "HomeTeam", "AwayTeam", "FTHG" and "FTAG".

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus, in this order:
        HomeTeamGoalsScoredPGAtHome, HomeTeamGoalsScoredPGOverall,
        AwayTeamGoalsScoredPGOnAway, AwayTeamGoalsScoredPGOverall.

    Notes
    -----
    The averages cover every row of `season`, including the row they are
    attached to. "Overall" is the mean over all of a team's matches, so it
    stays correct when home and away match counts differ.
    """
    scored_at_home = season.groupby("HomeTeam")["FTHG"].mean()
    scored_on_away = season.groupby("AwayTeam")["FTAG"].mean()

    # One row per (team, match) so the overall figure is a true per-game mean.
    team_matches = pd.concat([
        pd.DataFrame({"Team": season["HomeTeam"], "Value": season["FTHG"]}),
        pd.DataFrame({"Team": season["AwayTeam"], "Value": season["FTAG"]}),
    ])
    scored_overall = team_matches.groupby("Team")["Value"].mean()

    return season.assign(
        HomeTeamGoalsScoredPGAtHome=season["HomeTeam"].map(scored_at_home),
        HomeTeamGoalsScoredPGOverall=season["HomeTeam"].map(scored_overall),
        AwayTeamGoalsScoredPGOnAway=season["AwayTeam"].map(scored_on_away),
        AwayTeamGoalsScoredPGOverall=season["AwayTeam"].map(scored_overall),
    )

In [7]:
"""
Goals conceded per game for each team
- Home team goals conceded per game at home
- Away team goals conceded per game on away
- Overall goals conceded per game for both teams
"""
def calculate_goals_conceded_for_season(season):
    """Attach each team's goals conceded per game to every match row.

    A home side concedes "FTAG"; an away side concedes "FTHG".

    Parameters
    ----------
    season : pandas.DataFrame
        Matches to aggregate over (the notebook passes one season at a time).
        Must contain "HomeTeam", "AwayTeam", "FTHG" and "FTAG".

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus, in this order:
        HomeTeamGoalsConcededPGAtHome, HomeTeamGoalsConcededPGOverall,
        AwayTeamGoalsConcededPGOnAway, AwayTeamGoalsConcededPGOverall.

    Notes
    -----
    The averages cover every row of `season`, including the row they are
    attached to. "Overall" is the mean over all of a team's matches, so it
    stays correct when home and away match counts differ.
    """
    conceded_at_home = season.groupby("HomeTeam")["FTAG"].mean()
    conceded_on_away = season.groupby("AwayTeam")["FTHG"].mean()

    # One row per (team, match) so the overall figure is a true per-game mean.
    team_matches = pd.concat([
        pd.DataFrame({"Team": season["HomeTeam"], "Value": season["FTAG"]}),
        pd.DataFrame({"Team": season["AwayTeam"], "Value": season["FTHG"]}),
    ])
    conceded_overall = team_matches.groupby("Team")["Value"].mean()

    return season.assign(
        HomeTeamGoalsConcededPGAtHome=season["HomeTeam"].map(conceded_at_home),
        HomeTeamGoalsConcededPGOverall=season["HomeTeam"].map(conceded_overall),
        AwayTeamGoalsConcededPGOnAway=season["AwayTeam"].map(conceded_on_away),
        AwayTeamGoalsConcededPGOverall=season["AwayTeam"].map(conceded_overall),
    )

In [8]:
"""
Goal difference per game for each team
- Home team goal difference per game at home
- Away team goal difference per game on away
- Overall goal difference per game for both teams

This feature may either be redundant, or render goals scored and goals conceded redundant. We will see.
"""
def calculate_goal_difference_for_season(season):
    """Add goal-difference-per-game columns (goals scored minus goals conceded).

    Parameters
    ----------
    season : pandas.DataFrame
        Must already contain the goals-scored and goals-conceded per-game
        columns for the home team (AtHome, Overall) and the away team
        (OnAway, Overall), i.e. the columns read below.

    Returns
    -------
    pandas.DataFrame
        A new frame with four extra columns: HomeTeamGoalDifferencePGAtHome,
        AwayTeamGoalDifferencePGOnAway, HomeTeamGoalDifferencePGOverall and
        AwayTeamGoalDifferencePGOverall. The input frame is left unmodified.

    Raises
    ------
    KeyError
        If any of the required scored/conceded columns is missing.
    """
    # assign() builds a new frame rather than writing into the caller's frame,
    # which may be a slice of a larger one.
    return season.assign(
        HomeTeamGoalDifferencePGAtHome=(
            season["HomeTeamGoalsScoredPGAtHome"] - season["HomeTeamGoalsConcededPGAtHome"]
        ),
        AwayTeamGoalDifferencePGOnAway=(
            season["AwayTeamGoalsScoredPGOnAway"] - season["AwayTeamGoalsConcededPGOnAway"]
        ),
        HomeTeamGoalDifferencePGOverall=(
            season["HomeTeamGoalsScoredPGOverall"] - season["HomeTeamGoalsConcededPGOverall"]
        ),
        AwayTeamGoalDifferencePGOverall=(
            season["AwayTeamGoalsScoredPGOverall"] - season["AwayTeamGoalsConcededPGOverall"]
        ),
    )

In [9]:
"""
Shots per game for each team
- Home team shots per game at home
- Away team shots per game on away
- Overall shots per game for both teams

- Shots on target has a massive change from the 2013-14 season onward, hence I believe the way it was counted was changed.
- Hence, I will not be using it as it's values are not consistent over the whole dataset.
"""
def calculate_shots_for_season(season):
    """Attach each team's shots per game to every match row.

    Only total shots ("HS"/"AS") are used; shots on target are left out on
    purpose because their values shift sharply from the 2013-14 season onward.

    Parameters
    ----------
    season : pandas.DataFrame
        Matches to aggregate over (the notebook passes one season at a time).
        Must contain "HomeTeam", "AwayTeam", "HS" and "AS".

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus, in this order:
        HomeTeamShotsPGAtHome, HomeTeamShotsPGOverall,
        AwayTeamShotsPGOnAway, AwayTeamShotsPGOverall.

    Notes
    -----
    The averages cover every row of `season`, including the row they are
    attached to. "Overall" is the mean over all of a team's matches, so it
    stays correct when home and away match counts differ.
    """
    shots_at_home = season.groupby("HomeTeam")["HS"].mean()
    shots_on_away = season.groupby("AwayTeam")["AS"].mean()

    # One row per (team, match) so the overall figure is a true per-game mean.
    team_matches = pd.concat([
        pd.DataFrame({"Team": season["HomeTeam"], "Value": season["HS"]}),
        pd.DataFrame({"Team": season["AwayTeam"], "Value": season["AS"]}),
    ])
    shots_overall = team_matches.groupby("Team")["Value"].mean()

    return season.assign(
        HomeTeamShotsPGAtHome=season["HomeTeam"].map(shots_at_home),
        HomeTeamShotsPGOverall=season["HomeTeam"].map(shots_overall),
        AwayTeamShotsPGOnAway=season["AwayTeam"].map(shots_on_away),
        AwayTeamShotsPGOverall=season["AwayTeam"].map(shots_overall),
    )


In [10]:
"""
Fouls per game for each team
- Home team fouls per game at home
- Away team fouls per game on away
- Overall fouls per game for both teams
"""
def calculate_fouls_for_season(season):
    """Attach each team's fouls per game to every match row.

    Parameters
    ----------
    season : pandas.DataFrame
        Matches to aggregate over (the notebook passes one season at a time).
        Must contain "HomeTeam", "AwayTeam", "HF" and "AF".

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus, in this order:
        HomeTeamFoulsPGAtHome, HomeTeamFoulsPGOverall,
        AwayTeamFoulsPGOnAway, AwayTeamFoulsPGOverall.

    Notes
    -----
    The averages cover every row of `season`, including the row they are
    attached to. "Overall" is the mean over all of a team's matches, so it
    stays correct when home and away match counts differ.
    """
    fouls_at_home = season.groupby("HomeTeam")["HF"].mean()
    fouls_on_away = season.groupby("AwayTeam")["AF"].mean()

    # One row per (team, match) so the overall figure is a true per-game mean.
    team_matches = pd.concat([
        pd.DataFrame({"Team": season["HomeTeam"], "Value": season["HF"]}),
        pd.DataFrame({"Team": season["AwayTeam"], "Value": season["AF"]}),
    ])
    fouls_overall = team_matches.groupby("Team")["Value"].mean()

    return season.assign(
        HomeTeamFoulsPGAtHome=season["HomeTeam"].map(fouls_at_home),
        HomeTeamFoulsPGOverall=season["HomeTeam"].map(fouls_overall),
        AwayTeamFoulsPGOnAway=season["AwayTeam"].map(fouls_on_away),
        AwayTeamFoulsPGOverall=season["AwayTeam"].map(fouls_overall),
    )

In [11]:
"""
Corners per game for each team
- Home team corners per game at home
- Away team corners per game on away
- Overall corners per game for both teams
"""
def calculate_corners_for_season(season):
    """Attach each team's corners per game to every match row.

    Parameters
    ----------
    season : pandas.DataFrame
        Matches to aggregate over (the notebook passes one season at a time).
        Must contain "HomeTeam", "AwayTeam", "HC" and "AC".

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus, in this order:
        HomeTeamCornersPGAtHome, HomeTeamCornersPGOverall,
        AwayTeamCornersPGOnAway, AwayTeamCornersPGOverall.

    Notes
    -----
    The averages cover every row of `season`, including the row they are
    attached to. "Overall" is the mean over all of a team's matches, so it
    stays correct when home and away match counts differ.
    """
    corners_at_home = season.groupby("HomeTeam")["HC"].mean()
    corners_on_away = season.groupby("AwayTeam")["AC"].mean()

    # One row per (team, match) so the overall figure is a true per-game mean.
    team_matches = pd.concat([
        pd.DataFrame({"Team": season["HomeTeam"], "Value": season["HC"]}),
        pd.DataFrame({"Team": season["AwayTeam"], "Value": season["AC"]}),
    ])
    corners_overall = team_matches.groupby("Team")["Value"].mean()

    return season.assign(
        HomeTeamCornersPGAtHome=season["HomeTeam"].map(corners_at_home),
        HomeTeamCornersPGOverall=season["HomeTeam"].map(corners_overall),
        AwayTeamCornersPGOnAway=season["AwayTeam"].map(corners_on_away),
        AwayTeamCornersPGOverall=season["AwayTeam"].map(corners_overall),
    )

In [12]:
"""
Yellow cards per game for each team
- Home team yellow cards per game at home
- Away team yellow cards per game on away
- Overall yellow cards per game for both teams
"""
def calculate_yellow_cards_for_season(season):
    """Attach each team's yellow cards per game to every match row.

    Parameters
    ----------
    season : pandas.DataFrame
        Matches to aggregate over (the notebook passes one season at a time).
        Must contain "HomeTeam", "AwayTeam", "HY" and "AY".

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus, in this order:
        HomeTeamYellowCardsPGAtHome, HomeTeamYellowCardsPGOverall,
        AwayTeamYellowCardsPGOnAway, AwayTeamYellowCardsPGOverall.

    Notes
    -----
    The averages cover every row of `season`, including the row they are
    attached to. "Overall" is the mean over all of a team's matches, so it
    stays correct when home and away match counts differ.
    """
    yellows_at_home = season.groupby("HomeTeam")["HY"].mean()
    yellows_on_away = season.groupby("AwayTeam")["AY"].mean()

    # One row per (team, match) so the overall figure is a true per-game mean.
    team_matches = pd.concat([
        pd.DataFrame({"Team": season["HomeTeam"], "Value": season["HY"]}),
        pd.DataFrame({"Team": season["AwayTeam"], "Value": season["AY"]}),
    ])
    yellows_overall = team_matches.groupby("Team")["Value"].mean()

    return season.assign(
        HomeTeamYellowCardsPGAtHome=season["HomeTeam"].map(yellows_at_home),
        HomeTeamYellowCardsPGOverall=season["HomeTeam"].map(yellows_overall),
        AwayTeamYellowCardsPGOnAway=season["AwayTeam"].map(yellows_on_away),
        AwayTeamYellowCardsPGOverall=season["AwayTeam"].map(yellows_overall),
    )

In [13]:
"""
Red cards per game for each team
- Home team red cards per game at home
- Away team red cards per game on away
- Overall red cards per game for both teams
"""
def calculate_red_cards_for_season(season):
    """Attach each team's red cards per game to every match row.

    Parameters
    ----------
    season : pandas.DataFrame
        Matches to aggregate over (the notebook passes one season at a time).
        Must contain "HomeTeam", "AwayTeam", "HR" and "AR".

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus, in this order:
        HomeTeamRedCardsPGAtHome, HomeTeamRedCardsPGOverall,
        AwayTeamRedCardsPGOnAway, AwayTeamRedCardsPGOverall.

    Notes
    -----
    The averages cover every row of `season`, including the row they are
    attached to. "Overall" is the mean over all of a team's matches, so it
    stays correct when home and away match counts differ.
    """
    reds_at_home = season.groupby("HomeTeam")["HR"].mean()
    reds_on_away = season.groupby("AwayTeam")["AR"].mean()

    # One row per (team, match) so the overall figure is a true per-game mean.
    team_matches = pd.concat([
        pd.DataFrame({"Team": season["HomeTeam"], "Value": season["HR"]}),
        pd.DataFrame({"Team": season["AwayTeam"], "Value": season["AR"]}),
    ])
    reds_overall = team_matches.groupby("Team")["Value"].mean()

    return season.assign(
        HomeTeamRedCardsPGAtHome=season["HomeTeam"].map(reds_at_home),
        HomeTeamRedCardsPGOverall=season["HomeTeam"].map(reds_overall),
        AwayTeamRedCardsPGOnAway=season["AwayTeam"].map(reds_on_away),
        AwayTeamRedCardsPGOverall=season["AwayTeam"].map(reds_overall),
    )


---

In [None]:
# Build the season-level feature table. Each step adds per-team aggregates for
# one statistic. Goal difference must run after goals scored and goals conceded
# because it is derived from the columns those two steps add.
SEASON_FEATURE_STEPS = [
    calculate_points_for_season,
    calculate_goals_scored_for_season,
    calculate_goals_conceded_for_season,
    calculate_goal_difference_for_season,
    calculate_shots_for_season,
    calculate_fouls_for_season,
    calculate_corners_for_season,
    calculate_yellow_cards_for_season,
    calculate_red_cards_for_season,
]


def _add_season_features(season_matches):
    """Run every season-level feature step, in order, on one season's matches."""
    for step in SEASON_FEATURE_STEPS:
        season_matches = step(season_matches)
    return season_matches


# The seasons are iterated explicitly instead of chaining
# groupby("Season").apply(...): pandas has deprecated passing the grouping
# column to the applied function, and once "Season" is missing from the result
# the next groupby("Season") fails. Iterating always yields the full sub-frame.
# Each season is copied so the steps never write into epl_results.
test = pd.concat(
    [
        _add_season_features(season_matches.copy())
        for _, season_matches in epl_results.groupby("Season")
    ],
    ignore_index=True,
)

---

# Last 5 games between two teams

Unfortunately, this will likely need to be done fixture by fixture.

In [None]:
def get_last_5_games_stats(data, homeTeam, awayTeam, dateTime, n_games=5):
    """Return the most recent earlier meetings between two teams, at either venue.

    Parameters
    ----------
    data : pandas.DataFrame
        Match rows with "HomeTeam", "AwayTeam" and "DateTime" columns.
    homeTeam, awayTeam : str
        The two teams; meetings are matched regardless of which one hosted.
    dateTime :
        Only matches with "DateTime" strictly before this value are kept. It
        must be comparable with the values stored in the "DateTime" column.
    n_games : int, default 5
        Maximum number of meetings to return.

    Returns
    -------
    pandas.DataFrame
        Up to `n_games` rows of `data`, sorted by "DateTime" ascending (oldest
        first). Empty when the teams have no earlier meeting.
    """
    home_fixture = (data["HomeTeam"] == homeTeam) & (data["AwayTeam"] == awayTeam)
    reverse_fixture = (data["HomeTeam"] == awayTeam) & (data["AwayTeam"] == homeTeam)
    played_before = data["DateTime"] < dateTime

    earlier_meetings = data.loc[(home_fixture | reverse_fixture) & played_before]
    return earlier_meetings.sort_values(by="DateTime", ascending=True).tail(n_games)


# Spot check: earlier Liverpool / Man United meetings before a given kick-off.
get_last_5_games_stats(
    data=test,
    homeTeam="Liverpool",
    awayTeam="Man United",
    dateTime="2019-05-12 17:30:00",
)

In [16]:
"""Get points per game for last 5 games between two teams"""
def get_points_per_game_for_last_5_games(fixture, data):
    """Add points-per-game features from the two teams' most recent meetings.

    Points are 3 for a win, 1 for a draw and 0 otherwise, read from "FTR".

    Parameters
    ----------
    fixture : row-like (e.g. pandas.Series)
        Must provide "HomeTeam", "AwayTeam" and "DateTime". The four feature
        entries are written onto this object.
    data : pandas.DataFrame
        Match history handed to get_last_5_games_stats; needs an "FTR" column.

    Returns
    -------
    The same `fixture`, with these entries set:
    - HomeTeamPointsPGAtHomeAgainstOpponent: home team's points per game in the
      earlier meetings it hosted (NaN when it hosted none of them)
    - AwayTeamPointsPGOnAwayAgainstOpponent: away team's points per game in
      those same meetings
    - HomeTeamPointsPGOverallAgainstOpponent / AwayTeamPointsPGOverallAgainstOpponent:
      each team's points per game over all the earlier meetings
    All four are 0 when the teams have no earlier meeting.
    """
    homeTeam = fixture["HomeTeam"]
    awayTeam = fixture["AwayTeam"]
    dateTime = fixture["DateTime"]

    last_5_games = get_last_5_games_stats(data, homeTeam, awayTeam, dateTime)

    if last_5_games.empty:
        # No history between the teams: keep the established fallback of 0.
        at_home = on_away = home_overall = away_overall = 0
    else:
        results = last_5_games["FTR"]
        # Unrecognised result codes earn no points but still count as a game.
        host_points = results.map({"H": 3, "D": 1, "A": 0}).fillna(0)
        visitor_points = results.map({"A": 3, "D": 1, "H": 0}).fillna(0)

        # True where today's home team also hosted the earlier meeting.
        same_venue = last_5_games["HomeTeam"] == homeTeam
        # Re-express every earlier meeting from each of today's teams' side.
        home_team_points = host_points.where(same_venue, visitor_points)
        away_team_points = visitor_points.where(same_venue, host_points)

        at_home = home_team_points[same_venue].mean()
        on_away = away_team_points[same_venue].mean()
        # Mean over all meetings, so an uneven home/away split is weighted correctly.
        home_overall = home_team_points.mean()
        away_overall = away_team_points.mean()

    fixture["HomeTeamPointsPGAtHomeAgainstOpponent"] = at_home
    fixture["AwayTeamPointsPGOnAwayAgainstOpponent"] = on_away
    fixture["HomeTeamPointsPGOverallAgainstOpponent"] = home_overall
    fixture["AwayTeamPointsPGOverallAgainstOpponent"] = away_overall
    return fixture


In [17]:
def get_goals_scored_per_game_for_last_5_games(fixture, data):
    """Add goals-scored-per-game features from the two teams' most recent meetings.

    Parameters
    ----------
    fixture : row-like (e.g. pandas.Series)
        Must provide "HomeTeam", "AwayTeam" and "DateTime". The four feature
        entries are written onto this object.
    data : pandas.DataFrame
        Match history handed to get_last_5_games_stats; needs "FTHG" and "FTAG".

    Returns
    -------
    The same `fixture`, with these entries set:
    - HomeTeamGoalsScoredPGAtHomeAgainstOpponent: home team's goals per game in
      the earlier meetings it hosted (NaN when it hosted none of them)
    - AwayTeamGoalsScoredPGOnAwayAgainstOpponent: away team's goals per game in
      those same meetings
    - HomeTeamGoalsScoredPGOverallAgainstOpponent /
      AwayTeamGoalsScoredPGOverallAgainstOpponent: each team's goals per game
      over all the earlier meetings
    All four are 0 when the teams have no earlier meeting.
    """
    homeTeam = fixture["HomeTeam"]
    awayTeam = fixture["AwayTeam"]
    dateTime = fixture["DateTime"]

    last_5_games = get_last_5_games_stats(data, homeTeam, awayTeam, dateTime)

    if last_5_games.empty:
        # No history between the teams: keep the established fallback of 0.
        at_home = on_away = home_overall = away_overall = 0
    else:
        # True where today's home team also hosted the earlier meeting.
        same_venue = last_5_games["HomeTeam"] == homeTeam
        # Re-express every earlier meeting from each of today's teams' side.
        home_team_goals = last_5_games["FTHG"].where(same_venue, last_5_games["FTAG"])
        away_team_goals = last_5_games["FTAG"].where(same_venue, last_5_games["FTHG"])

        at_home = home_team_goals[same_venue].mean()
        on_away = away_team_goals[same_venue].mean()
        # Mean over all meetings, so an uneven home/away split is weighted correctly.
        home_overall = home_team_goals.mean()
        away_overall = away_team_goals.mean()

    fixture["HomeTeamGoalsScoredPGAtHomeAgainstOpponent"] = at_home
    fixture["AwayTeamGoalsScoredPGOnAwayAgainstOpponent"] = on_away
    fixture["HomeTeamGoalsScoredPGOverallAgainstOpponent"] = home_overall
    fixture["AwayTeamGoalsScoredPGOverallAgainstOpponent"] = away_overall
    return fixture


In this case goals conceded per game is just the opponent's goals scored per game, so this won't give us any new information.

In [18]:
def get_shots_per_game_for_last_5_games(fixture, data):
    """Add shots-per-game features from the two teams' most recent meetings.

    Parameters
    ----------
    fixture : row-like (e.g. pandas.Series)
        Must provide "HomeTeam", "AwayTeam" and "DateTime". The four feature
        entries are written onto this object.
    data : pandas.DataFrame
        Match history handed to get_last_5_games_stats; needs "HS" and "AS".

    Returns
    -------
    The same `fixture`, with these entries set:
    - HomeTeamShotsPGAtHomeAgainstOpponent: home team's shots per game in the
      earlier meetings it hosted (NaN when it hosted none of them)
    - AwayTeamShotsPGOnAwayAgainstOpponent: away team's shots per game in those
      same meetings
    - HomeTeamShotsPGOverallAgainstOpponent / AwayTeamShotsPGOverallAgainstOpponent:
      each team's shots per game over all the earlier meetings
    All four are 0 when the teams have no earlier meeting.
    """
    homeTeam = fixture["HomeTeam"]
    awayTeam = fixture["AwayTeam"]
    dateTime = fixture["DateTime"]

    last_5_games = get_last_5_games_stats(data, homeTeam, awayTeam, dateTime)

    if last_5_games.empty:
        # No history between the teams: keep the established fallback of 0.
        at_home = on_away = home_overall = away_overall = 0
    else:
        # True where today's home team also hosted the earlier meeting.
        same_venue = last_5_games["HomeTeam"] == homeTeam
        # Re-express every earlier meeting from each of today's teams' side.
        home_team_shots = last_5_games["HS"].where(same_venue, last_5_games["AS"])
        away_team_shots = last_5_games["AS"].where(same_venue, last_5_games["HS"])

        at_home = home_team_shots[same_venue].mean()
        on_away = away_team_shots[same_venue].mean()
        # Mean over all meetings, so an uneven home/away split is weighted correctly.
        home_overall = home_team_shots.mean()
        away_overall = away_team_shots.mean()

    fixture["HomeTeamShotsPGAtHomeAgainstOpponent"] = at_home
    fixture["AwayTeamShotsPGOnAwayAgainstOpponent"] = on_away
    fixture["HomeTeamShotsPGOverallAgainstOpponent"] = home_overall
    fixture["AwayTeamShotsPGOverallAgainstOpponent"] = away_overall
    return fixture


In [19]:
def get_fouls_per_game_for_last_5_games(fixture, data):
    """Add fouls-per-game features from the two teams' most recent meetings.

    Parameters
    ----------
    fixture : row-like (e.g. pandas.Series)
        Must provide "HomeTeam", "AwayTeam" and "DateTime". The four feature
        entries are written onto this object.
    data : pandas.DataFrame
        Match history handed to get_last_5_games_stats; needs "HF" and "AF".

    Returns
    -------
    The same `fixture`, with these entries set:
    - HomeTeamFoulsPGAtHomeAgainstOpponent: home team's fouls per game in the
      earlier meetings it hosted (NaN when it hosted none of them)
    - AwayTeamFoulsPGOnAwayAgainstOpponent: away team's fouls per game in those
      same meetings
    - HomeTeamFoulsPGOverallAgainstOpponent / AwayTeamFoulsPGOverallAgainstOpponent:
      each team's fouls per game over all the earlier meetings
    All four are 0 when the teams have no earlier meeting.
    """
    homeTeam = fixture["HomeTeam"]
    awayTeam = fixture["AwayTeam"]
    dateTime = fixture["DateTime"]

    last_5_games = get_last_5_games_stats(data, homeTeam, awayTeam, dateTime)

    if last_5_games.empty:
        # No history between the teams: keep the established fallback of 0.
        at_home = on_away = home_overall = away_overall = 0
    else:
        # True where today's home team also hosted the earlier meeting.
        same_venue = last_5_games["HomeTeam"] == homeTeam
        # Re-express every earlier meeting from each of today's teams' side.
        home_team_fouls = last_5_games["HF"].where(same_venue, last_5_games["AF"])
        away_team_fouls = last_5_games["AF"].where(same_venue, last_5_games["HF"])

        at_home = home_team_fouls[same_venue].mean()
        on_away = away_team_fouls[same_venue].mean()
        # Mean over all meetings, so an uneven home/away split is weighted correctly.
        home_overall = home_team_fouls.mean()
        away_overall = away_team_fouls.mean()

    fixture["HomeTeamFoulsPGAtHomeAgainstOpponent"] = at_home
    fixture["AwayTeamFoulsPGOnAwayAgainstOpponent"] = on_away
    fixture["HomeTeamFoulsPGOverallAgainstOpponent"] = home_overall
    fixture["AwayTeamFoulsPGOverallAgainstOpponent"] = away_overall
    return fixture

In [20]:
def get_corners_per_game_for_last_5_games(fixture, data):
    """Add corners-per-game features from the two teams' most recent meetings.

    Parameters
    ----------
    fixture : row-like (e.g. pandas.Series)
        Must provide "HomeTeam", "AwayTeam" and "DateTime". The four feature
        entries are written onto this object.
    data : pandas.DataFrame
        Match history handed to get_last_5_games_stats; needs "HC" and "AC".

    Returns
    -------
    The same `fixture`, with these entries set:
    - HomeTeamCornersPGAtHomeAgainstOpponent: home team's corners per game in
      the earlier meetings it hosted (NaN when it hosted none of them)
    - AwayTeamCornersPGOnAwayAgainstOpponent: away team's corners per game in
      those same meetings
    - HomeTeamCornersPGOverallAgainstOpponent / AwayTeamCornersPGOverallAgainstOpponent:
      each team's corners per game over all the earlier meetings
    All four are 0 when the teams have no earlier meeting.
    """
    homeTeam = fixture["HomeTeam"]
    awayTeam = fixture["AwayTeam"]
    dateTime = fixture["DateTime"]

    last_5_games = get_last_5_games_stats(data, homeTeam, awayTeam, dateTime)

    if last_5_games.empty:
        # No history between the teams: keep the established fallback of 0.
        at_home = on_away = home_overall = away_overall = 0
    else:
        # True where today's home team also hosted the earlier meeting.
        same_venue = last_5_games["HomeTeam"] == homeTeam
        # Re-express every earlier meeting from each of today's teams' side.
        home_team_corners = last_5_games["HC"].where(same_venue, last_5_games["AC"])
        away_team_corners = last_5_games["AC"].where(same_venue, last_5_games["HC"])

        at_home = home_team_corners[same_venue].mean()
        on_away = away_team_corners[same_venue].mean()
        # Mean over all meetings, so an uneven home/away split is weighted correctly.
        home_overall = home_team_corners.mean()
        away_overall = away_team_corners.mean()

    fixture["HomeTeamCornersPGAtHomeAgainstOpponent"] = at_home
    fixture["AwayTeamCornersPGOnAwayAgainstOpponent"] = on_away
    fixture["HomeTeamCornersPGOverallAgainstOpponent"] = home_overall
    fixture["AwayTeamCornersPGOverallAgainstOpponent"] = away_overall
    return fixture

In [21]:
def get_yellow_cards_per_game_for_last_5_games(fixture, data):
    """Add yellow-cards-per-game features from the two teams' most recent meetings.

    Parameters
    ----------
    fixture : row-like (e.g. pandas.Series)
        Must provide "HomeTeam", "AwayTeam" and "DateTime". The four feature
        entries are written onto this object.
    data : pandas.DataFrame
        Match history handed to get_last_5_games_stats; needs "HY" and "AY".

    Returns
    -------
    The same `fixture`, with these entries set:
    - HomeTeamYellowCardsPGAtHomeAgainstOpponent: home team's yellow cards per
      game in the earlier meetings it hosted (NaN when it hosted none of them)
    - AwayTeamYellowCardsPGOnAwayAgainstOpponent: away team's yellow cards per
      game in those same meetings
    - HomeTeamYellowCardsPGOverallAgainstOpponent /
      AwayTeamYellowCardsPGOverallAgainstOpponent: each team's yellow cards per
      game over all the earlier meetings
    All four are 0 when the teams have no earlier meeting.
    """
    homeTeam = fixture["HomeTeam"]
    awayTeam = fixture["AwayTeam"]
    dateTime = fixture["DateTime"]

    last_5_games = get_last_5_games_stats(data, homeTeam, awayTeam, dateTime)

    if last_5_games.empty:
        # No history between the teams: keep the established fallback of 0.
        at_home = on_away = home_overall = away_overall = 0
    else:
        # True where today's home team also hosted the earlier meeting.
        same_venue = last_5_games["HomeTeam"] == homeTeam
        # Re-express every earlier meeting from each of today's teams' side.
        home_team_yellows = last_5_games["HY"].where(same_venue, last_5_games["AY"])
        away_team_yellows = last_5_games["AY"].where(same_venue, last_5_games["HY"])

        at_home = home_team_yellows[same_venue].mean()
        on_away = away_team_yellows[same_venue].mean()
        # Mean over all meetings, so an uneven home/away split is weighted correctly.
        home_overall = home_team_yellows.mean()
        away_overall = away_team_yellows.mean()

    fixture["HomeTeamYellowCardsPGAtHomeAgainstOpponent"] = at_home
    fixture["AwayTeamYellowCardsPGOnAwayAgainstOpponent"] = on_away
    fixture["HomeTeamYellowCardsPGOverallAgainstOpponent"] = home_overall
    fixture["AwayTeamYellowCardsPGOverallAgainstOpponent"] = away_overall
    return fixture

In [22]:
def get_red_cards_per_game_for_last_5_games(fixture, data):
    """
    Add head-to-head red-card averages to a fixture row.

    The recent meetings of the fixture's two teams are fetched with
    `get_last_5_games_stats`, and the red cards shown to each side in those
    meetings are averaged per venue.

    Parameters
    ----------
    fixture : pd.Series
        One match row. Must provide "HomeTeam", "AwayTeam" and "DateTime".
        It is modified in place.
    data : pd.DataFrame
        Match history that is handed on to `get_last_5_games_stats`. The
        games that helper returns must have the columns "HomeTeam",
        "AwayTeam", "HR" and "AR".

    Returns
    -------
    pd.Series
        `fixture` with these entries set:
        - HomeTeamRedCardsPGAtHomeAgainstOpponent
        - AwayTeamRedCardsPGOnAwayAgainstOpponent
        - HomeTeamRedCardsPGOverallAgainstOpponent
        - AwayTeamRedCardsPGOverallAgainstOpponent
        All four are 0 when there are no previous meetings. An "Overall"
        value is the plain average of that team's home mean and away mean,
        so it is NaN when the team has only hosted or only visited.
    """
    homeTeam = fixture["HomeTeam"]
    awayTeam = fixture["AwayTeam"]
    dateTime = fixture["DateTime"]

    last_5_games = get_last_5_games_stats(data, homeTeam, awayTeam, dateTime)

    # No previous meetings: default every feature to 0. This is checked
    # explicitly instead of waiting for a column-count ValueError later on.
    if last_5_games.empty:
        fixture["HomeTeamRedCardsPGAtHomeAgainstOpponent"] = 0
        fixture["AwayTeamRedCardsPGOnAwayAgainstOpponent"] = 0
        fixture["HomeTeamRedCardsPGOverallAgainstOpponent"] = 0
        fixture["AwayTeamRedCardsPGOverallAgainstOpponent"] = 0
        return fixture

    # Mean cards per team, split by the venue the team played at.
    home_red_cards = last_5_games.groupby("HomeTeam")["HR"].mean()
    away_red_cards = last_5_games.groupby("AwayTeam")["AR"].mean()

    # One row per team; a team missing from one side gets NaN in that column.
    all_red_cards = pd.concat(
        [home_red_cards, away_red_cards],
        axis=1,
        keys=["HomeTeamRedCardsPG", "AwayTeamRedCardsPG"],
    )
    all_red_cards["OverallRedCardsPG"] = (
        all_red_cards["HomeTeamRedCardsPG"] + all_red_cards["AwayTeamRedCardsPG"]
    ) / 2

    fixture["HomeTeamRedCardsPGAtHomeAgainstOpponent"] = all_red_cards.loc[homeTeam, "HomeTeamRedCardsPG"]
    fixture["AwayTeamRedCardsPGOnAwayAgainstOpponent"] = all_red_cards.loc[awayTeam, "AwayTeamRedCardsPG"]
    fixture["HomeTeamRedCardsPGOverallAgainstOpponent"] = all_red_cards.loc[homeTeam, "OverallRedCardsPG"]
    fixture["AwayTeamRedCardsPGOverallAgainstOpponent"] = all_red_cards.loc[awayTeam, "OverallRedCardsPG"]
    return fixture

In [None]:
# Head-to-head ("...AgainstOpponent") features.
# Every metric gets four columns, created in this order: home side at home,
# away side on away, home side overall, away side overall.
h2h_metrics = ["Points", "GoalsScored", "Shots", "Fouls", "Corners", "YellowCards", "RedCards"]
h2h_column_templates = [
    "HomeTeam{}PGAtHomeAgainstOpponent",
    "AwayTeam{}PGOnAwayAgainstOpponent",
    "HomeTeam{}PGOverallAgainstOpponent",
    "AwayTeam{}PGOverallAgainstOpponent",
]

test2 = test.copy()
for metric in h2h_metrics:
    for template in h2h_column_templates:
        # Placeholder value; the feature builders below overwrite it row by row.
        test2.loc[:, template.format(metric)] = 0

# Yuck... this is the slowest part of the code as every builder loops through every row.
# To be revisited in the future, even some basic optimisation should definitely help.
h2h_feature_builders = [
    get_points_per_game_for_last_5_games,
    get_goals_scored_per_game_for_last_5_games,
    get_shots_per_game_for_last_5_games,
    get_fouls_per_game_for_last_5_games,
    get_corners_per_game_for_last_5_games,
    get_yellow_cards_per_game_for_last_5_games,
    get_red_cards_per_game_for_last_5_games,
]
for add_h2h_features in h2h_feature_builders:
    # Inside the lambda `test2` is still the frame from before this pass;
    # the name is only rebound once apply() has finished.
    test2 = test2.apply(lambda row: add_h2h_features(row, test2), axis = 1)

In [None]:
# Display the frame after the head-to-head feature columns have been added.
test2

In [None]:
# Column dtypes and non-null counts of the feature frame.
test2.info()

In [None]:
# Number of missing values in each column.
test2.isna().sum()

In [None]:
# Save the feature table to the path the next cell reads it back from, so a
# fresh run reloads what was just computed rather than an older copy.
test2.to_csv("../data/data_investigation.csv", index = False)


In [2]:
# Load the saved feature table from disk and rebind `test2` to it.
test2 = pd.read_csv("../data/data_investigation.csv")

# Season finish

In [None]:
"""
For each season, rank teams on PointsPGOverall

Start by creating a datafame with each season and each team and their finishing position for the season.

Columns:
- Season
- Team
- PointsPGOverall
- Finishing Position
"""

team_finishing_position = test2.groupby(["Season", "HomeTeam"])[["HomeTeamPointsPGOverall", "HomeTeamGoalDifferencePGOverall"]].max().reset_index()
team_finishing_position = team_finishing_position.rename(columns = {"HomeTeam" : "Team"})
team_finishing_position = team_finishing_position.sort_values(by = ["Season", "HomeTeamPointsPGOverall", "HomeTeamGoalDifferencePGOverall"], ascending = False)
team_finishing_position = team_finishing_position.groupby("Season").apply(lambda x: x.assign(FinishingPosition=x.sort_values(["HomeTeamPointsPGOverall", "HomeTeamGoalDifferencePGOverall"], ascending=[False, False]).reset_index(drop=True).index + 1)).reset_index(drop=True)

# Merge for HomeTeam
test3 = test2.merge(team_finishing_position.drop(columns=["HomeTeamGoalDifferencePGOverall", 'HomeTeamPointsPGOverall']), left_on=["Season", "HomeTeam"], right_on=["Season", "Team"], how="left")
test3 = test3.rename(columns={"FinishingPosition": "HomeTeam_finish"})

# Merge for AwayTeam
test3 = test3.merge(team_finishing_position.drop(columns=["HomeTeamGoalDifferencePGOverall", 'HomeTeamPointsPGOverall']), left_on=["Season", "AwayTeam"], right_on=["Season", "Team"], how="left")
test3 = test3.rename(columns={"FinishingPosition": "AwayTeam_finish"})

test3 = test3.drop(columns=["Team_x", "Team_y"])
test3


In [None]:
# Spot check: the ranked table for a single season.
team_finishing_position.loc[team_finishing_position["Season"].eq("2000-01")]


In [None]:
# List the columns available after the finishing positions were merged in.
test3.columns

# Handling promoted teams


We have a set of issues with promoted teams. 

1. They have no historical data.
2. Any "stats from previous season", "stats against opponent" will be NaN, as the data doesn't exist.

We will need to either:

1. Assume that we have no information on them, and default values to 0.
2. Use the mean of the column for the teams that have been promoted in the past.
3. Use other teams that have been promoted in the past as a proxy.

However, all of those will likely result in each promoted team going straight to the relegation zone, which is clearly not always the case.

We could potentially use the mean of the bottom n teams in the league. This will in some cases make a promoted team look too strong, and in others too weak, but it may be good enough.

# Promoted teams and their finishing positions over time

In [6]:
def get_promoted_teams(team_finishing_position, survivors_per_season=17):
    """
    List the teams that were promoted into each season.

    A team counts as promoted into a season when it plays in that season but
    was not among the `survivors_per_season` best-placed teams of the season
    before. The earliest season is left out because it has no predecessor.

    Parameters
    ----------
    team_finishing_position : pd.DataFrame
        One row per team per season, with at least the columns "Season",
        "Team" and "FinishingPosition" (1 = best). Season labels must sort
        into chronological order.
    survivors_per_season : int, default 17
        Number of best-placed teams per season that are treated as staying
        up. The default matches a 20-team league that relegates 3 teams.

    Returns
    -------
    pd.DataFrame
        One row per promoted team with "Season" and "Team", followed by the
        remaining columns of `team_finishing_position` for that team and
        season. Teams are listed alphabetically within a season.
    """
    # Best-placed teams of every season. Sorting and taking head() keeps the
    # "Season" column in the result, so the groupby below can rely on it.
    survivors = (
        team_finishing_position
        .sort_values(["Season", "FinishingPosition"])
        .groupby("Season")
        .head(survivors_per_season)
    )

    teams_series = team_finishing_position.groupby("Season")["Team"].apply(list)
    # shift(1) lines each season up with the survivors of the season before it.
    last_season_teams = survivors.groupby("Season")["Team"].apply(list).shift(1)

    # The earliest season has nothing to compare against.
    teams_series = teams_series.iloc[1:]
    last_season_teams = last_season_teams.iloc[1:]

    # Subtract last season's survivors from the current season's teams.
    # sorted() makes the order reproducible between runs.
    promoted_teams = teams_series.combine(
        last_season_teams,
        lambda current, previous: sorted(set(current) - set(previous)),
    )

    promoted_teams_df = pd.DataFrame({"Season" : promoted_teams.index, "Team" : promoted_teams.values})
    promoted_teams_df = promoted_teams_df.explode("Team")

    # Attach the finishing position and the other per-season columns.
    promoted_teams_df = promoted_teams_df.merge(team_finishing_position, on = ["Season", "Team"], how = "left")

    return promoted_teams_df
    


In [None]:
# Promoted sides per season, and how high they finished on average.
promoted_teams = get_promoted_teams(team_finishing_position)
mean_finishing_positions = (
    promoted_teams
    .groupby("Season")["FinishingPosition"]
    .mean()
)

In [None]:
# Columns carried by the promoted-teams table.
promoted_teams.columns

In [None]:
# Average finishing position of the promoted teams, one point per season,
# with the all-season average drawn as a dashed reference line.
overall_mean = mean_finishing_positions.mean()

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=mean_finishing_positions.index.astype(str),
        y=mean_finishing_positions.values,
        name="Mean Finishing Position",
    )
)
fig.add_hline(
    y=overall_mean,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Overall Mean: {overall_mean:.2f}",
    annotation_position="top right",
)

fig.update_xaxes(type='category')
# The y range runs from 20 down to 1 so that first place is at the top.
y_axis_settings = dict(
    range=[20, 1],
    autorange=False,
    tickmode='linear',
    tick0=1,
    dtick=1
)
fig.update_layout(
    title="Mean Finishing Position of Promoted Teams Over Time",
    xaxis_title="Season",
    yaxis_title="Mean Finishing Position",
    yaxis=y_axis_settings
)
fig.show()


In [None]:
# Note: the spread printed on the second line is the standard deviation of the
# per-season mean positions, not of the individual teams' finishing positions.
print(f"Mean finishing position of promoted teams:                 {overall_mean}")
print(f"Standard deviation of finishing position of promoted teams {mean_finishing_positions.std()}")

So, we now know the mean and standard deviation of the finishing position of promoted teams.
Let's also get the rest of the data for them.

Since a team has no previous-season data in its first season after promotion, we will use the mean of each column across the promoted teams in the data, possibly also randomising according to the standard deviation.

We need to add the columns (home, away, overall):
- TeamPointsPG
- TeamGoalsScoredPG
- TeamGoalsConcededPG
- TeamGoalDifferencePG
- TeamShotsPG
- TeamFoulsPG
- TeamCornersPG
- TeamYellowCardsPG
- TeamRedCardsPG

In [11]:
# Attach every promoted team's own per-season stats (home side and away side)
# and strip the "HomeTeam"/"AwayTeam" prefix so each column reads "Team...".

# We don't include PointsPGOverall or GoalDifferencePGOverall because we already have them in the finishing position.
home_stat_columns = [
    'HomeTeamPointsPGAtHome',
    'HomeTeamGoalsScoredPGOverall',
    'HomeTeamGoalsScoredPGAtHome',
    'HomeTeamGoalsConcededPGOverall',
    'HomeTeamGoalsConcededPGAtHome',
    'HomeTeamGoalDifferencePGAtHome',
    'HomeTeamShotsPGOverall',
    'HomeTeamShotsPGAtHome',
    'HomeTeamFoulsPGOverall',
    'HomeTeamFoulsPGAtHome',
    'HomeTeamCornersPGOverall',
    'HomeTeamCornersPGAtHome',
    'HomeTeamYellowCardsPGOverall',
    'HomeTeamYellowCardsPGAtHome',
    'HomeTeamRedCardsPGOverall',
    'HomeTeamRedCardsPGAtHome',
]
away_stat_columns = [
    'AwayTeamPointsPGOnAway',
    'AwayTeamGoalsScoredPGOnAway',
    'AwayTeamGoalsConcededPGOnAway',
    'AwayTeamGoalDifferencePGOnAway',
    'AwayTeamShotsPGOnAway',
    'AwayTeamFoulsPGOnAway',
    'AwayTeamCornersPGOnAway',
    'AwayTeamYellowCardsPGOnAway',
    'AwayTeamRedCardsPGOnAway',
]

# 'first' takes the first non-null value of each column per team and season.
home_stats = test3.groupby(['Season', 'HomeTeam']).agg(dict.fromkeys(home_stat_columns, 'first')).reset_index()
away_stats = test3.groupby(['Season', 'AwayTeam']).agg(dict.fromkeys(away_stat_columns, 'first')).reset_index()

# "HomeTeamX" -> "TeamX". The two Overall columns that `promoted_teams`
# already carries from the finishing-position table are renamed as well.
columns_to_rename = {
    name: name.replace('HomeTeam', 'Team', 1)
    for name in ['HomeTeamPointsPGOverall', 'HomeTeamGoalDifferencePGOverall'] + home_stat_columns
}
# "AwayTeamX" -> "TeamX".
away_columns_to_rename = {
    name: name.replace('AwayTeam', 'Team', 1)
    for name in away_stat_columns
}

promoted_teams_2 = (
    promoted_teams
    .merge(home_stats, left_on=['Season', 'Team'], right_on=['Season', 'HomeTeam'], how='left')
    .drop(columns=['HomeTeam'])
    .rename(columns=columns_to_rename)
    .merge(away_stats, left_on=['Season', 'Team'], right_on=['Season', 'AwayTeam'], how='left')
    .drop(columns=['AwayTeam'])
    .rename(columns=away_columns_to_rename)
)

In [None]:
# Columns of the promoted-teams table after the home and away stats were merged in.
promoted_teams_2.columns

So, for each promoted team and season we will use a random value between the mean +/- the standard deviation of said column for all the promoted teams in that season to fill in the missing values.

First, though, we should check whether there are any patterns over time, e.g. whether promoted teams 20 years ago scored less than they do now.

In [None]:
# One subplot per stat. Each subplot shows the season-by-season average over
# the promoted teams for the home, away and overall variant of the stat, plus
# a dashed line at the all-season average of the overall variant.
promoted_stat_names = [
    "PointsPG",
    "GoalsScoredPG",
    "GoalsConcededPG",
    "GoalDifferencePG",
    "ShotsPG",
    "FoulsPG",
    "CornersPG",
    "YellowCardsPG",
    "RedCardsPG",
]
promoted_by_season = promoted_teams_2.groupby("Season")

fig = make_subplots(len(promoted_stat_names), 1, shared_xaxes=True, subplot_titles=promoted_stat_names)
for subplot_row, stat_name in enumerate(promoted_stat_names, start=1):
    for venue in ("AtHome", "OnAway", "Overall"):
        # Compute the per-season mean once and use it for both x and y.
        season_means = promoted_by_season[f"Team{stat_name}{venue}"].mean()
        fig.add_trace(
            go.Scatter(x = season_means.index.astype(str), y = season_means.values, name = f"{stat_name}{venue}"),
            row = subplot_row, col = 1,
        )

    overall_season_means = promoted_by_season[f"Team{stat_name}Overall"].mean()
    fig.add_hline(y = overall_season_means.values.mean(), line_dash="dash", line_color="red", row = subplot_row, col = 1)

fig.update_layout(height = 900, width = 800, title_text = "Promoted Teams Stats Over Time")
fig.update_xaxes(title_text = "Season", type='category')
fig.show()

Over time I feel like the majority of the metrics (maybe barring fouls) have not had any significant trend. I will probably just leave it as it is rather than trying to do it per time period.

In [14]:
# Summary numbers for a promoted team. For every stat and venue:
#   "Mean" = per-season mean over the promoted teams, averaged over seasons
#   "SD"   = per-season standard deviation over the promoted teams, averaged
#            over seasons
promoted_by_season = promoted_teams_2.groupby("Season")

promoted_teams_stats = {}
for stat_name in (
    "Points",
    "GoalsScored",
    "GoalsConceded",
    "GoalDifference",
    "Shots",
    "Fouls",
    "Corners",
    "YellowCards",
    "RedCards",
):
    for venue in ("AtHome", "OnAway", "Overall"):
        season_values = promoted_by_season[f"Team{stat_name}PG{venue}"]
        # Series.mean() skips NaN, so a season without a usable value no
        # longer turns the whole entry into NaN.
        promoted_teams_stats[f"{stat_name}PG{venue}"] = {
            "Mean" : season_values.mean().mean(),
            "SD"   : season_values.std().mean()
        }

# Moving Stats to last-season's data

Currently stats are for a given season. We need "historical data" (or data for last season). We therefore need to move all the per season stats up by one season.

In [15]:
# Per-season stats of every team, taken once from its home rows and once from
# its away rows. Both frames are indexed by (Season, team).
home_team_stat_columns = [
    'HomeTeamPointsPGAtHome',
    'HomeTeamPointsPGOverall',
    'HomeTeamGoalsScoredPGAtHome',
    'HomeTeamGoalsScoredPGOverall',
    'HomeTeamGoalsConcededPGAtHome',
    'HomeTeamGoalsConcededPGOverall',
    'HomeTeamGoalDifferencePGAtHome',
    'HomeTeamGoalDifferencePGOverall',
    'HomeTeamShotsPGAtHome',
    'HomeTeamShotsPGOverall',
    'HomeTeamFoulsPGAtHome',
    'HomeTeamFoulsPGOverall',
    'HomeTeamCornersPGAtHome',
    'HomeTeamCornersPGOverall',
    'HomeTeamYellowCardsPGAtHome',
    'HomeTeamYellowCardsPGOverall',
    'HomeTeamRedCardsPGAtHome',
    'HomeTeamRedCardsPGOverall',
    'HomeTeam_finish',
]
away_team_stat_columns = [
    'AwayTeamPointsPGOnAway',
    'AwayTeamPointsPGOverall',
    'AwayTeamGoalsScoredPGOnAway',
    'AwayTeamGoalsScoredPGOverall',
    'AwayTeamGoalsConcededPGOnAway',
    'AwayTeamGoalsConcededPGOverall',
    'AwayTeamGoalDifferencePGOnAway',
    'AwayTeamGoalDifferencePGOverall',
    'AwayTeamShotsPGOnAway',
    'AwayTeamShotsPGOverall',
    'AwayTeamFoulsPGOnAway',
    'AwayTeamFoulsPGOverall',
    'AwayTeamCornersPGOnAway',
    'AwayTeamCornersPGOverall',
    'AwayTeamYellowCardsPGOnAway',
    'AwayTeamYellowCardsPGOverall',
    'AwayTeamRedCardsPGOnAway',
    'AwayTeamRedCardsPGOverall',
    'AwayTeam_finish',
]

# 'first' keeps the first non-null value of each column per group.
home_team_stats = test3.groupby(['Season', 'HomeTeam'])[home_team_stat_columns].agg('first')

away_team_stats = test3.groupby(['Season', 'AwayTeam'])[away_team_stat_columns].agg('first')

In [16]:
# Model table: match identifiers, the result, the head-to-head features and
# each side's stats from the season before.
h2h_feature_columns = [
    column_template.format(stat_name)
    for stat_name in ("Points", "GoalsScored", "Shots", "Fouls", "Corners", "YellowCards", "RedCards")
    for column_template in (
        "HomeTeam{}PGAtHomeAgainstOpponent",
        "AwayTeam{}PGOnAwayAgainstOpponent",
        "HomeTeam{}PGOverallAgainstOpponent",
        "AwayTeam{}PGOverallAgainstOpponent",
    )
]
final_team_dataframe = test3[['Season', 'HomeTeam', 'AwayTeam', 'FTR'] + h2h_feature_columns]

# Season labels in order, and the label that follows each of them.
season_order = sorted(test3["Season"].dropna().unique())
following_season = dict(zip(season_order, season_order[1:]))


def _as_previous_season_stats(season_stats):
    """
    Move per-season stats forward by exactly one season.

    `season_stats` is indexed by (Season, team). The result has the same
    index; the row for (season, team) holds that team's stats from the season
    directly before, or NaN when the team has no row for that season.
    A per-team shift(1) would instead hand a club returning after relegation
    the stats of its last top-flight season, however long ago that was.
    """
    index_names = list(season_stats.index.names)
    relabelled = season_stats.reset_index()
    relabelled["Season"] = relabelled["Season"].map(following_season)
    # The latest season has no following season to be attached to.
    relabelled = relabelled.dropna(subset=["Season"]).set_index(index_names)
    return relabelled.reindex(season_stats.index)


home_team_previous_season_stats = _as_previous_season_stats(home_team_stats)
away_team_previous_season_stats = _as_previous_season_stats(away_team_stats)

final_team_dataframe = final_team_dataframe.merge(home_team_previous_season_stats, left_on=["Season", "HomeTeam"], right_on=["Season", "HomeTeam"], how="left")
final_team_dataframe = final_team_dataframe.merge(away_team_previous_season_stats, left_on=["Season", "AwayTeam"], right_on=["Season", "AwayTeam"], how="left")
# The first season in the data has no previous season stats at all.
final_team_dataframe = final_team_dataframe[final_team_dataframe["Season"] != season_order[0]]

Finally, we need to fill in the missing values for promoted teams.

For goals for and against specific opponents, we will fill them in with 0. For previous-season stats, we will use a value within one standard deviation of the promoted teams' mean (mean ± 1 std).

In [17]:
# Head-to-head ("AgainstOpponent") feature columns, grouped by statistic.
# Each group lists the venue-specific pair first, then the overall pair.
against_opponent_columns = [
    # Points
    'HomeTeamPointsPGAtHomeAgainstOpponent', 'AwayTeamPointsPGOnAwayAgainstOpponent',
    'HomeTeamPointsPGOverallAgainstOpponent', 'AwayTeamPointsPGOverallAgainstOpponent',
    # Goals scored
    'HomeTeamGoalsScoredPGAtHomeAgainstOpponent', 'AwayTeamGoalsScoredPGOnAwayAgainstOpponent',
    'HomeTeamGoalsScoredPGOverallAgainstOpponent', 'AwayTeamGoalsScoredPGOverallAgainstOpponent',
    # Shots
    'HomeTeamShotsPGAtHomeAgainstOpponent', 'AwayTeamShotsPGOnAwayAgainstOpponent',
    'HomeTeamShotsPGOverallAgainstOpponent', 'AwayTeamShotsPGOverallAgainstOpponent',
    # Fouls
    'HomeTeamFoulsPGAtHomeAgainstOpponent', 'AwayTeamFoulsPGOnAwayAgainstOpponent',
    'HomeTeamFoulsPGOverallAgainstOpponent', 'AwayTeamFoulsPGOverallAgainstOpponent',
    # Corners
    'HomeTeamCornersPGAtHomeAgainstOpponent', 'AwayTeamCornersPGOnAwayAgainstOpponent',
    'HomeTeamCornersPGOverallAgainstOpponent', 'AwayTeamCornersPGOverallAgainstOpponent',
    # Yellow cards
    'HomeTeamYellowCardsPGAtHomeAgainstOpponent', 'AwayTeamYellowCardsPGOnAwayAgainstOpponent',
    'HomeTeamYellowCardsPGOverallAgainstOpponent', 'AwayTeamYellowCardsPGOverallAgainstOpponent',
    # Red cards
    'HomeTeamRedCardsPGAtHomeAgainstOpponent', 'AwayTeamRedCardsPGOnAwayAgainstOpponent',
    'HomeTeamRedCardsPGOverallAgainstOpponent', 'AwayTeamRedCardsPGOverallAgainstOpponent',
]

# Missing head-to-head values are replaced with 0 in every listed column.
opponent_history_filled = final_team_dataframe[against_opponent_columns].fillna(0)
final_team_dataframe[against_opponent_columns] = opponent_history_filled

In [None]:
# Display the promoted-team statistics; the imputation loop in the following
# cell reads a "Mean" and an "SD" value for each entry of this object.
promoted_teams_stats

In [None]:
# Impute previous-season stats for promoted teams.
#
# For every statistic in `promoted_teams_stats`, each promoted (Season, Team)
# pair receives one value drawn uniformly from [Mean - SD, Mean + SD]:
#   * entries containing "Home" fill the "HomeTeam<entry>" column on the
#     team's home rows,
#   * entries containing "Away" fill the "AwayTeam<entry>" column on the
#     team's away rows,
#   * all other entries fill both columns with the SAME draw, so a team's
#     stat does not differ between its home rows and its away rows.
final_team_dataframe_2 = final_team_dataframe.copy()

# Fixed seed so the imputed values (and the dataset exported from them) are
# reproducible across Restart & Run All.
IMPUTATION_SEED = 42
imputation_rng = np.random.default_rng(IMPUTATION_SEED)

# The row masks depend only on (Season, Team), never on the statistic, so they
# are built once here instead of once per statistic inside the loop. The loop
# below only writes stat columns, so the masks stay valid throughout.
promoted_row_masks = [
    (
        (final_team_dataframe_2["Season"] == row["Season"]) & (final_team_dataframe_2["HomeTeam"] == row["Team"]),
        (final_team_dataframe_2["Season"] == row["Season"]) & (final_team_dataframe_2["AwayTeam"] == row["Team"]),
    )
    for _, row in promoted_teams.iterrows()
]

for entry in promoted_teams_stats:
    min_val = promoted_teams_stats[entry]["Mean"] - promoted_teams_stats[entry]["SD"]
    max_val = promoted_teams_stats[entry]["Mean"] + promoted_teams_stats[entry]["SD"]
    for home_rows, away_rows in promoted_row_masks:
        # One draw per (statistic, promoted team-season).
        imputed_value = imputation_rng.uniform(min_val, max_val)
        if "Home" in entry:
            final_team_dataframe_2.loc[home_rows, "HomeTeam" + entry] = imputed_value
        elif "Away" in entry:
            final_team_dataframe_2.loc[away_rows, "AwayTeam" + entry] = imputed_value
        else:
            # Venue-independent stat: reuse the single draw for both sides.
            final_team_dataframe_2.loc[home_rows, "HomeTeam" + entry] = imputed_value
            final_team_dataframe_2.loc[away_rows, "AwayTeam" + entry] = imputed_value
final_team_dataframe_2.info()

In [20]:
# Mean finishing position of promoted teams: used wherever a side has no
# recorded finishing position.
for finish_column in ("HomeTeam_finish", "AwayTeam_finish"):
    final_team_dataframe_2[finish_column] = final_team_dataframe_2[finish_column].fillna(15)

In [21]:
# Write the feature table to disk without the DataFrame index column.
final_team_dataframe_2.to_csv(
    "../data/extracted_features_dataset.csv",
    index=False,
)

In [27]:
# Keep only the (Season, Team) pairs of promoted sides and write them to disk
# without the DataFrame index column.
promoted_team_names = promoted_teams[["Season", "Team"]]
promoted_team_names.to_csv(
    "../data/promoted_teams.csv",
    index=False,
)