<a href="https://colab.research.google.com/github/gabbosanti/machine-learning-assignment/blob/main/assignement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Football predictor

Download the Kaggle dataset

In [None]:
import kagglehub
import os, shutil
from pathlib import Path

# Fetch the dataset via kagglehub; the call returns the directory holding the files.
path = kagglehub.dataset_download("secareanualin/football-events")
print("Kaggle dataset path:", path)

# Local working directory that receives a copy of the files used below.
work_dir = Path("data")
work_dir.mkdir(exist_ok=True)

# Locations of the two CSV files once they have been copied.
events_path = work_dir / "events.csv"
ginf_path = work_dir / "ginf.csv"

# Copy the two CSVs plus the code dictionary next to the notebook.
for fname in ("events.csv", "ginf.csv", "dictionary.txt"):
    shutil.copy(Path(path) / fname, work_dir / fname)


Downloading from https://www.kaggle.com/api/v1/datasets/download/secareanualin/football-events?dataset_version_number=1...


100%|██████████| 21.1M/21.1M [00:00<00:00, 50.9MB/s]

Extracting files...





Kaggle dataset path: /root/.cache/kagglehub/datasets/secareanualin/football-events/versions/1


Load the data into pandas DataFrames


In [None]:
import pandas as pd

# Read both CSV files (event-level rows and match-level rows) in one pass.
events_df, ginf_df = (
    pd.read_csv(csv_path) for csv_path in (events_path, ginf_path)
)


# Feature engineering



In [28]:
def create_dataset(events=None, ginf=None, max_minute=45):
    """Build a one-row-per-match table of first-half event counts plus the target.

    Parameters
    ----------
    events : pandas.DataFrame, optional
        Event-level rows. Must provide the columns ``id_odsp``, ``side``,
        ``time``, ``event_type``, ``shot_outcome`` and ``is_goal``.
        Defaults to the notebook-level ``events_df``.
    ginf : pandas.DataFrame, optional
        Match-level rows. Must provide ``id_odsp`` and ``fthg`` (used here as
        the home side's goal total for the whole match).
        Defaults to the notebook-level ``ginf_df``.
    max_minute : int, optional
        Last minute (inclusive) whose events are counted. Defaults to 45.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``ginf`` containing ``id_odsp``, eight ``*_home``
        counts (rows with ``side == 1``), eight ``*_away`` counts (rows with
        ``side == 2``) and ``home_scored_second_half``, which is 1 when
        ``fthg`` exceeds the home goals counted up to ``max_minute`` and 0
        otherwise.

    Notes
    -----
    A match that has event rows but none for one side within the window gets
    zero counts for that side. Without this, the NaN produced by the left
    merge would make ``fthg - goal_home`` NaN and the target would silently
    become 0.

    A match whose ``id_odsp`` never appears in ``events`` keeps NaN counts;
    its target is 0 only because a comparison with NaN is False, so that
    label is not backed by event data. Callers will normally want to drop
    such rows (for example with ``dropna``) before training.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if events is None:
        events = events_df
    if ginf is None:
        ginf = ginf_df

    # Reduce the events to the first-half window (copy: columns are added below).
    first_half_events = events[events['time'] <= max_minute].copy()

    event_type = first_half_events['event_type']
    shot_outcome = first_half_events['shot_outcome']
    is_shot = event_type == 1

    # Indicator columns (0/1) so that a per-group sum yields a count.
    first_half_events['shot_on_target'] = (is_shot & (shot_outcome == 1)).astype(int)
    first_half_events['shot_off_target'] = (is_shot & (shot_outcome == 2)).astype(int)
    for feature, code in (
        ('corner', 2),
        ('free_kick', 8),
        ('offside', 9),
        ('foul', 3),
        ('yellow_card', 4),
    ):
        first_half_events[feature] = (event_type == code).astype(int)
    first_half_events['goal'] = first_half_events['is_goal'].astype(int)

    count_cols = [
        'shot_on_target', 'shot_off_target', 'corner', 'free_kick',
        'offside', 'foul', 'yellow_card', 'goal',
    ]

    # Aggregation: number of occurrences of every event per match and side.
    matches_features = (
        first_half_events
        .groupby(['id_odsp', 'side'], as_index=False)[count_cols]
        .sum()
    )

    def _side_counts(side_code, suffix):
        # Rows of one side, with the count columns suffixed and the key restored.
        side_rows = matches_features[matches_features['side'] == side_code]
        suffixed = side_rows.drop(columns='side').add_suffix(suffix)
        return suffixed.rename(columns={'id_odsp' + suffix: 'id_odsp'})

    home = _side_counts(1, '_home')
    away = _side_counts(2, '_away')

    # Left merges keep every match listed in ginf, even those without events.
    dataset = (
        ginf[['id_odsp', 'fthg']]
        .merge(home, on='id_odsp', how='left')
        .merge(away, on='id_odsp', how='left')
    )

    # A missing count means "no such event" only when the match has event data
    # at all; matches absent from `events` stay NaN (unknown, not zero).
    feature_cols = [c for c in dataset.columns if c not in ('id_odsp', 'fthg')]
    has_events = dataset['id_odsp'].isin(events['id_odsp'])
    dataset.loc[has_events, feature_cols] = (
        dataset.loc[has_events, feature_cols].fillna(0)
    )

    # Target: did the home side score after the counted window?
    dataset['home_scored_second_half'] = (
        (dataset['fthg'] - dataset['goal_home']) > 0
    ).astype(int)

    return dataset.drop(columns=['fthg'])


In [29]:
# Build the per-match feature table; the bare name on the last line makes the notebook render it.
dataset = create_dataset()
dataset

Unnamed: 0,id_odsp,shot_on_target_home,shot_off_target_home,corner_home,free_kick_home,offside_home,foul_home,yellow_card_home,goal_home,shot_on_target_away,shot_off_target_away,corner_away,free_kick_away,offside_away,foul_away,yellow_card_away,goal_away,home_scored_second_half
0,UFot0hit/,3.0,5.0,4.0,5.0,1.0,5.0,0.0,2.0,0.0,2.0,2.0,5.0,0.0,6.0,1.0,0.0,1
1,Aw5DflLH/,3.0,3.0,2.0,15.0,0.0,8.0,2.0,0.0,1.0,1.0,2.0,9.0,0.0,15.0,1.0,0.0,1
2,bkjpaC6n/,2.0,2.0,2.0,15.0,3.0,5.0,0.0,0.0,1.0,0.0,2.0,7.0,4.0,15.0,1.0,0.0,1
3,CzPV312a/,0.0,3.0,2.0,5.0,3.0,10.0,1.0,0.0,2.0,1.0,2.0,11.0,4.0,4.0,1.0,1.0,0
4,GUOdmtII/,1.0,5.0,1.0,6.0,1.0,6.0,0.0,1.0,4.0,3.0,5.0,6.0,1.0,6.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10107,xAkY8l6R/,1.0,0.0,2.0,3.0,3.0,9.0,0.0,1.0,2.0,2.0,5.0,9.0,1.0,3.0,1.0,0.0,1
10108,xSU9scI9/,2.0,1.0,7.0,7.0,1.0,6.0,1.0,1.0,2.0,1.0,2.0,6.0,0.0,7.0,2.0,0.0,1
10109,xY7uZwOI/,6.0,5.0,3.0,7.0,2.0,4.0,0.0,3.0,0.0,0.0,1.0,4.0,0.0,7.0,1.0,0.0,1
10110,YyeGxMX8/,3.0,4.0,6.0,6.0,4.0,4.0,0.0,0.0,1.0,2.0,4.0,4.0,1.0,5.0,1.0,0.0,0
