In [58]:
import random

import pandas as pd
from pathlib import Path

# Directory holding the raw CSV exports read by this notebook.
EVENTS_DIR = Path("events")

# One row per event; the printed shape is a quick sanity check of the load.
events_csv_path = EVENTS_DIR / "events.csv"
events_df = pd.read_csv(events_csv_path)
print(events_df.shape)

(941009, 22)


In [59]:
# Per-game metadata, read from the same directory as the events file.
game_info_csv_path = EVENTS_DIR / "ginf.csv"
game_info_df = pd.read_csv(game_info_csv_path)
print(game_info_df.shape)

(10112, 18)


In [60]:
# Players whose events are kept; matched against the lower-case "player" column and title-cased afterwards.
PLAYERS = ["lionel messi", "cristiano ronaldo", "erling haaland", "zlatan ibrahimovic", "luis suarez"]
EVENT_TYPES = [1, 3, 4, 5, 9]  # Attempt, foul, yellow card, second yellow card, offside
# Columns carried forward into the exported data set.
COLUMNS = ["event_type", "event_team", "player", "is_goal", "location", "season"]

# Attach the per-game columns (including "season") to every event via the shared game id.
full_df = pd.merge(events_df, game_info_df, how="left", on="id_odsp")

# Filter to only get useful rows. Remove 2017 season since it's incomplete and add it manually later.
# NOTE(review): event_type2 == 15 and location == 19 are excluded; presumably "own goal" and
# "not recorded" in the dataset dictionary - confirm. Location 19 also has no coordinate mapping
# in the coordinate cell further down.
keep_rows = (
    full_df["player"].isin(PLAYERS)
    & full_df["event_type"].isin(EVENT_TYPES)
    & (full_df["event_type2"] != 15)
    & full_df["location"].notna()
    & (full_df["location"] != 19)
    & (full_df["season"] != 2017)
)

# Each transform targets exactly one column. A frame-wide DataFrame.replace would also rewrite
# any cell in another column that happens to equal a replaced value (e.g. a season number).
filtered_df = full_df.loc[keep_rows, COLUMNS].assign(
    # Capitalize first letter in first and last names
    player=lambda frame: frame["player"].str.title(),
    # Change spelling of clubs
    event_team=lambda frame: frame["event_team"].replace(
        {"Barcelona": "FC Barcelona", "Manchester Utd": "Manchester United"}
    ),
    # Set season to be the first year of the season
    season=lambda frame: frame["season"] - 1,
)
print(filtered_df.shape)

(2961, 6)


In [61]:
# Pools of location codes taken from the filtered events, split by the is_goal flag.
# Each pool keeps one entry per row, so duplicates are preserved.
is_goal_flag = filtered_df["is_goal"]
goal_locations = filtered_df.loc[is_goal_flag == 1, "location"].tolist()
miss_locations = filtered_df.loc[is_goal_flag == 0, "location"].tolist()

# Multiplier used to estimate a season's misses from its goal count when no miss count is listed.
miss_per_goal_factor = 5.5


def _estimated_misses(goals_basis):
    """Return the estimated number of misses for a season: goals_basis * miss_per_goal_factor."""
    return goals_basis * miss_per_goal_factor


# Hand-entered season rows per player: (season, goals, misses, team).
# Seasons up to 2010 (and Erling Haaland's 2017-2018) use an estimated miss count; the remaining
# rows carry explicit integer counts. Seasons 2011-2015 are absent here (presumably covered by
# the events data - confirm).
_SEASON_ROWS = {
    "Cristiano Ronaldo": [
        ("2002", 3, _estimated_misses(3), "Sporting CP"),
        ("2003", 4, _estimated_misses(4), "Manchester United"),
        ("2004", 5, _estimated_misses(5), "Manchester United"),
        ("2005", 9, _estimated_misses(9), "Manchester United"),
        ("2006", 17, _estimated_misses(17), "Manchester United"),
        ("2007", 31, _estimated_misses(31), "Manchester United"),
        ("2008", 18, _estimated_misses(18), "Manchester United"),
        ("2009", 26, _estimated_misses(26), "Real Madrid"),
        ("2010", 40, _estimated_misses(40), "Real Madrid"),
        ("2016", 25, 162, "Real Madrid"),
        ("2017", 26, 174, "Real Madrid"),
        ("2018", 21, 171, "Juventus"),
        ("2019", 31, 195, "Juventus"),
        ("2020", 29, 160, "Juventus"),
        ("2021", 0, 1, "Juventus"),
        ("2022", 1, 25, "Al-Nassr"),
        ("2023", 14, 80, "Al-Nassr"),
    ],
    "Lionel Messi": [
        ("2004", 1, _estimated_misses(1), "FC Barcelona"),
        ("2005", 6, _estimated_misses(6), "FC Barcelona"),
        ("2006", 14, _estimated_misses(14), "FC Barcelona"),
        ("2007", 10, _estimated_misses(10), "FC Barcelona"),
        ("2008", 23, _estimated_misses(23), "FC Barcelona"),
        ("2009", 34, _estimated_misses(34), "FC Barcelona"),
        ("2010", 31, _estimated_misses(31), "FC Barcelona"),
        ("2016", 37, 179, "FC Barcelona"),
        ("2017", 34, 193, "FC Barcelona"),
        ("2018", 36, 165, "FC Barcelona"),
        ("2019", 25, 154, "FC Barcelona"),
        ("2020", 30, 191, "FC Barcelona"),
        ("2021", 6, 89, "Paris Saint-Germain"),
        ("2022", 16, 129, "Paris Saint-Germain"),
        ("2023", 1, 20, "Inter Miami"),
    ],
    "Zlatan Ibrahimovic": [
        # NOTE(review): goals is 3 but the miss estimate is based on 6, unlike every other
        # estimated season where both numbers match - confirm which value is intended.
        ("2001", 3, _estimated_misses(6), "Ajax"),
        ("2002", 13, _estimated_misses(13), "Ajax"),
        ("2003", 13, _estimated_misses(13), "Ajax"),
        ("2004", 3, _estimated_misses(3), "Ajax"),
        ("2005", 7, _estimated_misses(7), "Juventus"),
        ("2006", 15, _estimated_misses(15), "FC Internazionale"),
        ("2007", 17, _estimated_misses(17), "FC Internazionale"),
        ("2008", 25, _estimated_misses(25), "FC Internazionale"),
        ("2009", 16, _estimated_misses(16), "FC Barcelona"),
        ("2010", 14, _estimated_misses(14), "AC Milan"),
        ("2016", 17, 116, "Manchester United"),
        ("2017", 0, 4, "Manchester United"),
        ("2018", 22, 112, "LA Galaxy"),
        ("2019", 30, 144, "LA Galaxy"),
        ("2020", 15, 75, "AC Milan"),
        ("2021", 8, 50, "AC Milan"),
        ("2022", 1, 5, "AC Milan"),
    ],
    "Erling Haaland": [
        ("2017", 2, _estimated_misses(2), "Molde"),
        ("2018", 12, _estimated_misses(12), "Molde"),
        ("2019", 16, 53, "RB Salzburg"),
        ("2020", 27, 89, "Borussia Dortmund"),
        ("2021", 22, 74, "Borussia Dortmund"),
        ("2022", 36, 116, "Manchester City"),
        ("2023", 25, 105, "Manchester City"),
    ],
}

# Nested lookup: player name -> season (string key) -> {"goals", "misses", "team"}.
STATS = {
    player_name: {
        season: {"goals": goals, "misses": misses, "team": team}
        for season, goals, misses, team in season_rows
    }
    for player_name, season_rows in _SEASON_ROWS.items()
}

# Fixed seed and a dedicated generator so the sampled locations are the same on every run
# and the global `random` state is left untouched.
SYNTHETIC_SEED = 42
synthetic_rng = random.Random(SYNTHETIC_SEED)

# Expand the hand-entered season totals in STATS into one synthetic attempt row per goal / miss.
# Locations are not known for these seasons, so each row draws one from the matching pool.
synthetic_data = {"event_type": [], "event_team": [], "player": [], "is_goal": [], "location": [], "season": []}
for player, years in STATS.items():
    for year, yearly_stats in years.items():
        # Goals first, then misses. Estimated miss counts are floats, hence the round().
        outcomes = (
            (1, yearly_stats["goals"], goal_locations),
            (0, round(yearly_stats["misses"]), miss_locations),
        )
        for is_goal, attempt_count, location_pool in outcomes:
            for _ in range(attempt_count):
                synthetic_data["event_type"].append(1)  # every synthetic row is an attempt
                synthetic_data["event_team"].append(yearly_stats["team"])
                synthetic_data["player"].append(player)
                synthetic_data["is_goal"].append(is_goal)
                synthetic_data["location"].append(synthetic_rng.choice(location_pool))
                # STATS keys are strings; store a number so the column does not end up with
                # a mix of numeric seasons (from the events) and string seasons.
                synthetic_data["season"].append(int(year))

synthetic_data_df = pd.DataFrame(synthetic_data)
# ignore_index=True already yields a fresh 0..n-1 index, so no separate reset_index() is needed.
# Re-running this cell appends the synthetic rows again; re-run the filtering cell first.
filtered_df = pd.concat([filtered_df, synthetic_data_df], ignore_index=True)
print(synthetic_data_df.shape)
print(filtered_df.shape)

(6277, 6)
(9238, 6)


The pitch dimensions used to map locations to x and y coordinates are taken from the FIFA stadium guidelines: https://publications.fifa.com/en/football-stadiums-guidelines/technical-guideline/stadium-guidelines/pitch-dimensions-and-surrounding-areas/.

In the mapping, the pitch is viewed with the short sides on the left and right. The origin is in the lower-left corner.

First, the mapping is done in yards, where X = [0, 115] and Y = [0, 74]; then the coordinates are scaled to the unit square, where X = [0, 1] and Y = [0, 1].

In [62]:
coordinates = {"x": [], "y": []}
for index, row in filtered_df.iterrows():
    location = row["location"]
    match location:
        case 1:  # Attacking half
            x = random.randrange(58, 115)
            y = random.randrange(0, 74)
        case 2:  # Defensive half
            x = random.randrange(0, 57)
            y = random.randrange(0, 74)
        case 3:  # Centre of the box
            x = random.randrange(97, 109)
            y = random.randrange(27, 47)
        case 4:  # Left wing
            x = random.randrange(58, 115)
            y = random.randrange(59, 74)
        case 5:  # Right wing
            x = random.randrange(58, 115)
            y = random.randrange(0, 15)
        case 6:  # Difficult angle and long range
            x = random.randrange(58, 97)
            y = random.choice([random.randrange(0, 15), random.randrange(59, 74)])
        case 7:  # Difficult angle on the left
            x = random.randrange(70, 97)
            y = random.randrange(59, 74)
        case 8:  # Difficult angle on the right
            x = random.randrange(70, 97)
            y = random.randrange(0, 15)
        case 9:  # Left side of the box
            x = random.randrange(97, 115)
            y = random.randrange(59, 74)
        case 10:  # Left side of the six yard box
            x = random.randrange(109, 115)
            y = random.randrange(53, 59)
        case 11:  # Right side of the box
            x = random.randrange(97, 115)
            y = random.randrange(0, 15)
        case 12:  # Right side of the six yard box
            x = random.randrange(109, 115)
            y = random.randrange(15, 21)
        case 13:  # Very close range
            x = random.randrange(112, 115)
            y = random.randrange(30, 44)
        case 14:  # Penalty shot
            x = 105
            y = 37
        case 15:  # Outside the box
            x = random.randrange(86, 97)
            y = random.randrange(15, 59)
        case 16:  # Long range
            x = random.randrange(80, 86)
            y = random.randrange(15, 59)
        case 17:  # More than 35 yards
            x = random.randrange(75, 80)
            y = random.randrange(15, 59)
        case 18:  # More than 40 yards
            x = random.randrange(58, 75)
            y = random.randrange(15, 59)
    x_scaled = x / 115
    y_scaled = y / 74
    coordinates["x"].append(x_scaled)
    coordinates["y"].append(y_scaled)
final_df = filtered_df.assign(**coordinates)
print(final_df.shape)

(9238, 8)


In [63]:
# Export the final frame as a JSON array with one object per row.
final_df.to_json(
    path_or_buf="events.json",
    indent=2,
    orient="records",
)