<a href="https://colab.research.google.com/github/gabbosanti/machine-learning-assignment/blob/main/assignement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Football predictor

Download kaggle dataset

In [1]:
import kagglehub
import os, shutil
from pathlib import Path

path = kagglehub.dataset_download("secareanualin/football-events")
print("Kaggle dataset path:", path)

work_dir = Path("data")
work_dir.mkdir(exist_ok=True)

for fname in ["events.csv", "ginf.csv", "dictionary.txt"]:
    shutil.copy(Path(path) / fname, work_dir / fname)

events_path = work_dir / "events.csv"
ginf_path   = work_dir / "ginf.csv"


Downloading from https://www.kaggle.com/api/v1/datasets/download/secareanualin/football-events?dataset_version_number=1...


100%|██████████| 21.1M/21.1M [00:00<00:00, 50.9MB/s]

Extracting files...





Kaggle dataset path: /root/.cache/kagglehub/datasets/secareanualin/football-events/versions/1


Print out the data


In [2]:
import pandas as pd

events_df = pd.read_csv(events_path)
ginf_df   = pd.read_csv(ginf_path)


# Feature engeniering



In [12]:
def create_dataset():

    # Reduce events_df dataset, filtering only first-half events
    first_half_events = events_df[events_df['time'] <= 45].copy()
    # Identify home team for every match
    match_info = ginf_df[['id_odsp', 'ht', 'at', 'fthg', 'ftag']].copy()
    #List that will contain only relevant featur for every match
    matches_features = []

    for match_id in ginf_df['id_odsp'].unique(): #iterating for every match (based on his unique id)

        # Consider only the event of that game
        match_events = first_half_events[first_half_events['id_odsp'] == match_id]

        # Identify between home and away events, further dividing the dataset
        # Recall --> side: 1 = home, 2 = away
        home_events = match_events[match_events['side'] == 1]
        away_events = match_events[match_events['side'] == 2]

        #For every match we will consider both home/away significant feature

        ## Attacking (relevant) feature
        # On-target shots
        shots_on_target_home = len(home_events[
            (home_events['event_type'] == 1) &
            (home_events['shot_outcome'] == 1)
        ])
        shots_on_target_away = len(away_events[
            (away_events['event_type'] == 1) &
            (away_events['shot_outcome'] == 1)
        ])

        # Off-target shots
        shots_off_target_home = len(home_events[
            (home_events['event_type'] == 1) &
            (home_events['shot_outcome'] == 2)
        ])
        shots_off_target_away = len(away_events[
            (away_events['event_type'] == 1) &
            (away_events['shot_outcome'] == 2)
        ])

        # Corner
        corners_home = len(home_events[home_events['event_type'] == 2])
        corners_away = len(away_events[away_events['event_type'] == 2])

        # Free kicks gaines
        free_kicks_won_home = len(home_events[home_events['event_type'] == 8])
        free_kicks_won_away = len(away_events[away_events['event_type'] == 8])

        # Offsides gained
        offsides_home = len(home_events[home_events['event_type'] == 6])
        offsides_away = len(away_events[away_events['event_type'] == 6])

        ## Discipline feature
        # Fouls
        fouls_home = len(home_events[home_events['event_type'] == 3])
        fouls_away = len(away_events[away_events['event_type'] == 3])

        # Yellow cards
        yellow_cards_home = len(home_events[home_events['event_type'] == 7])
        yellow_cards_away = len(away_events[away_events['event_type'] == 7])

        # First half's team goal
        home_goals_first_half = len(home_events[home_events['is_goal'] == 1])
        away_goals_first_half = len(away_events[away_events['is_goal'] == 1])

        # TARGET: home goals in the second half --> fthg - hthg

        match_data = ginf_df[ginf_df['id_odsp'] == match_id]
        total_home_goals = match_data['fthg'].iloc[0]
        home_goals_first_half = len(home_events[home_events['is_goal'] == 1])
        home_goals_second_half = total_home_goals - home_goals_first_half
        goal_home_second_half = 1 if home_goals_second_half > 0 else 0 # this is the target

        #Let's append to the empty list all the relevant features
        matches_features.append({
            'id_odsp': match_id,
            'shots_on_target_home': shots_on_target_home,
            'shots_on_target_away': shots_on_target_away,
            'shots_off_target_home': shots_off_target_home,
            'shots_off_target_away': shots_off_target_away,
            'corners_home': corners_home,
            'corners_away': corners_away,
            'free_kicks_won_home': free_kicks_won_home,
            'free_kicks_won_away': free_kicks_won_away,
            'offsides_home': offsides_home,
            'offsides_away': offsides_away,
            'fouls_home': fouls_home,
            'fouls_away': fouls_away,
            'yellow_cards_home': yellow_cards_home,
            'yellow_cards_away': yellow_cards_away,
            'home_goals_first_half': home_goals_first_half,
            'away_goals_first_half': away_goals_first_half,
            'goal_home_second_half': goal_home_second_half
        })

    return pd.DataFrame(matches_features)


In [13]:
dataset = create_dataset()
dataset.head()

KeyboardInterrupt: 