# Datasets Generation

Here we create the datasets we need. For each season we create two of them: one with the matches and their betting odds, and a simpler one containing the season leaderboard.

In [1]:
import pandas as pd
from pathlib import Path

# Raw CSVs are read from DATA; OUTPUT is the sibling "processed" directory.
DATA = Path("data/raw")
OUTPUT = DATA.parent / "processed"

# Columns kept from every raw file.
columns_of_interest = [
    "Date",
    "HomeTeam",
    "AwayTeam",
    "FTR",    # full-time result
    "B365H",  # Bet365 odds: home win
    "B365D",  # Bet365 odds: draw
    "B365A",  # Bet365 odds: away win
]

# One dataframe per ".csv" entry found directly inside DATA, trimmed to the
# columns listed above.
datasets = [
    pd.read_csv(csv_path)[columns_of_interest]
    for csv_path in DATA.iterdir()
    if csv_path.suffix == ".csv"
]


In [None]:
def extract_leaderboard(df):
    """
    Build the leaderboard from a dataframe of match results.

    The winning side of a match ("H" = home, "A" = away in the "FTR" column)
    gets 3 points; a draw ("D") gives both sides 1 point. Any other "FTR"
    value awards no points, but both teams still appear on the leaderboard.
    Rows contribute nothing for a side whose team name is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match, with "HomeTeam", "AwayTeam" and "FTR" columns.

    Returns
    -------
    pandas.DataFrame
        Columns "Team" and "Points", ordered by points (highest first) with
        ties broken alphabetically by team name so the result is
        reproducible between runs. The index is named "Position" and
        starts at 1.
    """
    results = df["FTR"]
    is_draw = results.eq("D")
    home_points = results.eq("H") * 3 + is_draw * 1
    away_points = results.eq("A") * 3 + is_draw * 1

    # Long format: one row per (team, points earned in a single match).
    # to_numpy() discards the original index so both halves stack cleanly.
    per_match = pd.concat(
        [
            pd.DataFrame({
                "Team": df["HomeTeam"].to_numpy(),
                "Points": home_points.to_numpy(),
            }),
            pd.DataFrame({
                "Team": df["AwayTeam"].to_numpy(),
                "Points": away_points.to_numpy(),
            }),
        ],
        ignore_index=True,
    )

    # groupby leaves out missing team names, so they never reach the table.
    leaderboard = per_match.groupby("Team", as_index=False)["Points"].sum()
    leaderboard["Points"] = leaderboard["Points"].astype("int64")

    # Secondary sort key makes the order of tied teams deterministic.
    leaderboard = leaderboard.sort_values(
        by=["Points", "Team"], ascending=[False, True]
    ).reset_index(drop=True)
    leaderboard.index = pd.RangeIndex(1, len(leaderboard) + 1, name="Position")
    return leaderboard
    


In [10]:

# Create the output directory up front so the to_csv calls below do not fail
# on a fresh checkout.
OUTPUT.mkdir(parents=True, exist_ok=True)

for df in datasets:
    # The season's starting year is the last "/"-separated field of the first
    # row's date; a two-digit year is expanded with a "20" prefix.
    # NOTE(review): assumes the first row falls in the season's starting
    # calendar year - confirm against the raw files.
    filename = df.iloc[0]["Date"].split("/")[-1]
    filename = "20" + filename if len(filename) == 2 else filename
    # Season label "<start year>-<two-digit following year>". The modulo keeps
    # a year ending in 99 rolling over to "00" instead of "100".
    filename = f"{filename}-{(int(filename[-2:]) + 1) % 100:02}"
    df.to_csv(OUTPUT / f"season_{filename}.csv", index=False)
    leaderboard = extract_leaderboard(df)
    # index=False: the 1-based "Position" index is not written; the row order
    # in the file carries the ranking.
    leaderboard.to_csv(OUTPUT / f"leaderboard_{filename}.csv", index=False)
