<a href="https://colab.research.google.com/github/dmnarula/football-ds-learning/blob/main/week4-modeling/Week4_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Work from /content and take a fresh clone of the course repository, which
# contains the `src` package imported further down.
%cd /content
# Drop any earlier checkout first so that re-running this cell does not make
# `git clone` fail on an already existing directory.
!rm -rf football-ds-learning
!git clone https://github.com/dmnarula/football-ds-learning.git

/content
Cloning into 'football-ds-learning'...
remote: Enumerating objects: 164, done.[K
remote: Counting objects: 100% (164/164), done.[K
remote: Compressing objects: 100% (154/154), done.[K
remote: Total 164 (delta 84), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (164/164), 1.24 MiB | 7.83 MiB/s, done.
Resolving deltas: 100% (84/84), done.


In [3]:
# Make the cloned repository the working directory and list its top-level
# entries (the `src` package should show up here).
%cd football-ds-learning
!ls

/content/football-ds-learning
README.md    src		  week2-pandas	       week4-modeling
sample_data  week1-python-basics  week3-visualization


In [4]:
# All imports for the notebook live in this cell.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot  as plt
import os
import sys
# NOTE(review): this relative entry is resolved against the working directory
# in effect at import time. With the cwd printed by the previous cell
# (/content/football-ds-learning) "../.." points at the filesystem root, not at
# the repository that holds `src/`; the import below presumably succeeds because
# the cwd itself is importable. Confirm whether this line is still needed.
sys.path.append("../..")
from src.merging import build_match_df
from sklearn.linear_model import LinearRegression

# Plot defaults applied to every figure drawn after this cell:
# seaborn "whitegrid" style and a 12x8 default figure size.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)

In [21]:
# Root folder that holds one "GW<n>" sub-folder of CSV files per gameweek.
base_gw_url = "https://raw.githubusercontent.com/olbauday/FPL-Elo-Insights/refs/heads/main/data/2025-2026/By%20Tournament/Premier%20League"


def get_gw_urls(gw: int):
    """Build the four CSV URLs that belong to one gameweek.

    Parameters
    ----------
    gw : int
        Gameweek number; it is inserted into the path as ``GW<gw>``.

    Returns
    -------
    tuple of str
        ``(url_stats, url_players, url_teams, url_matches)``, in that order,
        all located under ``base_gw_url``.
    """
    gw_folder = "/".join([base_gw_url, f"GW{gw}"])
    # Order matters: callers unpack the result positionally.
    csv_files = ("playermatchstats.csv", "players.csv", "teams.csv", "matches.csv")
    return tuple(f"{gw_folder}/{csv_file}" for csv_file in csv_files)

In [23]:

# Season data loader
def build_season_df(start_gw, end_gw):
    """Load every gameweek in ``start_gw..end_gw`` and stack the results.

    For each gameweek the four CSV URLs are resolved with ``get_gw_urls`` and
    handed to ``build_match_df``. The frame that comes back is tagged with a
    ``gameweek`` column (overwriting one of that name if it already exists)
    and all frames are then concatenated row-wise.

    Parameters
    ----------
    start_gw : int
        First gameweek to load (inclusive).
    end_gw : int
        Last gameweek to load (inclusive).

    Returns
    -------
    pandas.DataFrame
        The per-gameweek frames stacked on top of each other with a fresh
        ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``start_gw`` is greater than ``end_gw``. Without this check the
        empty range would only fail later inside ``pd.concat`` with the much
        less helpful "No objects to concatenate".
    """
    if start_gw > end_gw:
        raise ValueError(
            f"start_gw ({start_gw}) must not be greater than end_gw ({end_gw})"
        )

    gw_frames = []
    for gw in range(start_gw, end_gw + 1):
        url_stats, url_players, url_teams, url_matches = get_gw_urls(gw)

        df_gw = build_match_df(
            url_stats=url_stats,
            url_players=url_players,
            url_teams=url_teams,
            url_matches=url_matches,
        )

        # assign() returns a new frame, so the object handed back by
        # build_match_df is never mutated in place.
        gw_frames.append(df_gw.assign(gameweek=gw))

    return pd.concat(gw_frames, ignore_index=True)

In [25]:
# Peek at the raw matches file of gameweek 15 (a gameweek outside the 1-14
# range loaded for the season frame in the next cell).
(url_stats, url_players, url_teams, url_matches) = get_gw_urls(gw=15)
df_matches = pd.read_csv(filepath_or_buffer=url_matches)
df_matches.head(n=100)

Unnamed: 0,gameweek,kickoff_time,home_team,home_team_elo,home_score,away_score,away_team,away_team_elo,finished,match_id,...,away_walking_distance,home_running_distance,away_running_distance,home_sprinting_distance,away_sprinting_distance,home_number_of_sprints,away_number_of_sprints,home_top_speed,away_top_speed,tournament
0,15.0,2025-12-06T17:30:00,2.0,1718.7,3.0,3.0,14.0,1917.61,True,25-26-prem-leeds-united-vs-liverpool,...,,,,,,,,,,prem
1,15.0,2025-12-07T16:30:00,54.0,1775.79,1.0,2.0,31.0,1837.82,True,25-26-prem-fulham-vs-crystal-palace,...,,,,,,,,,,prem
2,15.0,2025-12-06T12:30:00,7.0,1890.53,2.0,1.0,3.0,2038.72,True,25-26-prem-aston-villa-vs-arsenal,...,,,,,,,,,,prem
3,15.0,2025-12-06T15:00:00,11.0,1797.06,3.0,0.0,17.0,1773.69,True,25-26-prem-everton-vs-nottingham-forest,...,,,,,,,,,,prem
4,15.0,2025-12-06T15:00:00,91.0,1796.38,0.0,0.0,8.0,1896.87,True,25-26-prem-afc-bournemouth-vs-chelsea,...,,,,,,,,,,prem
5,15.0,2025-12-06T15:00:00,43.0,1963.13,3.0,0.0,56.0,1652.47,True,25-26-prem-manchester-city-vs-sunderland,...,,,,,,,,,,prem
6,15.0,2025-12-06T15:00:00,6.0,1802.07,2.0,0.0,94.0,1813.2,True,25-26-prem-tottenham-hotspur-vs-brentford,...,,,,,,,,,,prem
7,15.0,2025-12-06T15:00:00,4.0,1866.28,2.0,1.0,90.0,1696.85,True,25-26-prem-newcastle-united-vs-burnley,...,,,,,,,,,,prem
8,15.0,,36.0,1842.47,1.0,1.0,21.0,1726.95,False,25-26-prem-brighton-hove-albion-vs-west-ham-un...,...,,,,,,,,,,prem
9,15.0,,39.0,1653.75,,,1.0,1817.47,False,25-26-prem-wolverhampton-wanderers-vs-manchest...,...,,,,,,,,,,prem


In [42]:
# Load gameweeks 1 through 14, then keep every row whose position is not
# "Goalkeeper".
df_season = build_season_df(start_gw=1, end_gw=14)
df_season_nogk = df_season.loc[df_season["position"].ne("Goalkeeper")]

# Optional sanity checks (uncomment one at a time to inspect the load):
# df_season.shape
# df_season.head()
# df_season["gameweek"].value_counts().sort_index()
# df_season.isna().mean().sort_values(ascending=False).head(100)

In [73]:
# Columns carried into the cleaned frame, grouped by what they describe.
stats_cols = [
    "minutes_played",
    "goals",
    "assists",
    "total_shots",
    "xg",
    "xa",
    "shots_on_target",
    "final_third_passes",
    "successful_dribbles",
    "successful_dribbles_percent",
    "touches_opposition_box",
    "touches",
    "chances_created",
    "accurate_passes",
    "accurate_passes_percent",
    "accurate_crosses",
    "accurate_crosses_percent",
    "accurate_long_balls",
    "accurate_long_balls_percent",
]
player_cols = ["player_code", "player_id", "web_name", "team_code", "position"]
match_cols = [
    "gameweek",
    "home_team",
    "away_team",
    "match_id",
    "home_score",
    "away_score",
    "home_team_name",
    "home_team_short_name",
    "away_team_name",
    "away_team_short_name",
]
team_cols = ["team_name", "team_short_name"]

# Final column order: stats, then player, match and team identifiers.
cols = [*stats_cols, *player_cols, *match_cols, *team_cols]

# Take an explicit copy so later edits never write back into df_season_nogk.
df_season_clean = df_season_nogk.loc[:, cols].copy()
df_season_clean.head()

Unnamed: 0,minutes_played,goals,assists,total_shots,xg,xa,shots_on_target,final_third_passes,successful_dribbles,successful_dribbles_percent,...,away_team,match_id,home_score,away_score,home_team_name,home_team_short_name,away_team_name,away_team_short_name,team_name,team_short_name
0,80,0,0,0,0.0,0.02,0,6,0,0.0,...,3,25-26-prem-manchester-united-vs-arsenal,0.0,1.0,Man Utd,MUN,Arsenal,ARS,Man Utd,MUN
1,90,0,0,1,0.15,0.09,0,0,3,0.0,...,3,25-26-prem-manchester-united-vs-arsenal,0.0,1.0,Man Utd,MUN,Arsenal,ARS,Arsenal,ARS
2,79,2,0,2,0.92,0.01,2,2,1,0.0,...,94,25-26-prem-nottingham-forest-vs-brentford,3.0,1.0,Nott'm Forest,NFO,Brentford,BRE,Nott'm Forest,NFO
3,84,0,1,3,0.38,0.28,0,7,2,0.0,...,94,25-26-prem-nottingham-forest-vs-brentford,3.0,1.0,Nott'm Forest,NFO,Brentford,BRE,Nott'm Forest,NFO
4,90,0,1,3,0.39,0.28,1,12,0,0.0,...,94,25-26-prem-nottingham-forest-vs-brentford,3.0,1.0,Nott'm Forest,NFO,Brentford,BRE,Nott'm Forest,NFO


In [74]:
# Collapse df_season_clean to one row per (player_id, web_name,
# team_short_name) combination by summing the counting stats. The *_percent
# columns and shots_on_target are not part of the aggregation.
df_player_season = df_season_clean.groupby(
    ["player_id", "web_name", "team_short_name"], as_index=False
).agg(
    {
        stat: "sum"
        for stat in (
            "minutes_played",
            "xg",
            "xa",
            "goals",
            "assists",
            "total_shots",
            "final_third_passes",
            "successful_dribbles",
            "touches_opposition_box",
            "touches",
            "chances_created",
            "accurate_passes",
            "accurate_crosses",
            "accurate_long_balls",
        )
    }
)

df_player_season.head()

Unnamed: 0,player_id,web_name,team_short_name,minutes_played,xg,xa,goals,assists,total_shots,final_third_passes,successful_dribbles,touches_opposition_box,touches,chances_created,accurate_passes,accurate_crosses,accurate_long_balls
0,5,Gabriel,ARS,990,0.82,0.77,1,2,8,61,1,15,848,4.0,627,0,20
1,6,Saliba,ARS,815,0.26,0.41,0,0,2,53,2,6,863,1.0,729,1,21
2,7,Calafiori,ARS,1087,2.46,0.32,1,2,22,49,10,35,746,4.0,404,3,12
3,8,J.Timber,ARS,1109,2.88,0.78,2,2,15,76,8,46,816,13.0,447,2,8
4,10,Lewis-Skelly,ARS,132,0.0,0.02,0,0,0,6,2,3,107,0.0,71,0,1


In [75]:
# Derived season metrics (new columns are added to df_player_season in place)

# Combined expected (xg + xa) and actual (goals + assists) goal involvement
df_player_season["xgi"] = df_player_season["xg"].add(df_player_season["xa"])
df_player_season["gi"] = df_player_season["goals"].add(df_player_season["assists"])

# Share of the team's summed xgi; the team total only covers the players that
# are present in df_player_season
df_player_season["team_xgi"] = (
    df_player_season["xgi"]
    .groupby(df_player_season["team_short_name"])
    .transform("sum")
)
df_player_season["involvement_pct"] = (
    df_player_season["xgi"].div(df_player_season["team_xgi"]).round(2)
)

# Share of the team's summed touches, computed the same way
df_player_season["team_total_touches"] = (
    df_player_season["touches"]
    .groupby(df_player_season["team_short_name"])
    .transform("sum")
)
df_player_season["possession_contribution"] = (
    df_player_season["touches"].div(df_player_season["team_total_touches"]).round(2)
)

# Spot check on a single player
df_player_season.loc[df_player_season["web_name"].eq("Haaland")]

Unnamed: 0,player_id,web_name,team_short_name,minutes_played,xg,xa,goals,assists,total_shots,final_third_passes,...,chances_created,accurate_passes,accurate_crosses,accurate_long_balls,xgi,gi,team_xgi,involvement_pct,team_total_touches,possession_contribution
249,430,Haaland,MCI,1218,13.69,1.15,15,3,56,17,...,8.0,110,0,2,14.84,18,46.22,0.32,9708,0.03


In [82]:
# Per-90 versions of the season totals

# Number of "90 minute units" played; a zero is turned into NaN so that the
# division below yields NaN instead of dividing by zero.
minutes_factor = df_player_season["minutes_played"].div(90).replace(0, np.nan)

per90_cols = [
    "xg", "xa", "xgi",
    "goals", "assists", "gi",
    "total_shots",
    "final_third_passes",
    "successful_dribbles",
    "touches_opposition_box",
    "touches",
    "chances_created",
    "accurate_passes",
    "accurate_crosses",
    "accurate_long_balls",
]

# Divide all selected columns row-wise by minutes_factor in one step and store
# each result, rounded to two decimals, as "<column>_per90".
per90_values = df_player_season[per90_cols].div(minutes_factor, axis=0).round(2)
df_player_season[[f"{col}_per90" for col in per90_cols]] = per90_values

# Spot check on a single player
df_player_season.loc[df_player_season["web_name"].eq("Muñoz")]


Unnamed: 0,player_id,web_name,team_short_name,minutes_played,xg,xa,goals,assists,total_shots,final_third_passes,...,gi_per90,total_shots_per90,final_third_passes_per90,successful_dribbles_per90,touches_opposition_box_per90,touches_per90,chances_created_per90,accurate_passes_per90,accurate_crosses_per90,accurate_long_balls_per90
139,256,Muñoz,CRY,1259,1.78,2.0,3,2,15,45,...,0.36,1.07,3.22,0.07,3.0,49.54,0.5,20.59,0.5,0.93
