In [1]:
# =============================================================================
# STEP 0: Force reload modules (run this first after code changes!)
# =============================================================================
# A plain `import` does nothing for a module that is already present in
# sys.modules, so on its own it would not pick up edits made to the project
# source while the kernel is running. The modules are therefore imported (to
# make sure they exist) and then re-executed explicitly with importlib.reload().
import importlib
import src.data_loader
import src.features
import src.pipeline
import src.models.goals
import src.models.base

# Reload order: src.models.base before src.models.goals, and src.pipeline last,
# on the assumption that goals builds on base and that the pipeline imports the
# other modules -- TODO confirm against the package's actual import graph.
# Reloading dependencies first lets the dependents re-bind to fresh definitions.
importlib.reload(src.data_loader)
importlib.reload(src.features)
importlib.reload(src.models.base)
importlib.reload(src.models.goals)
importlib.reload(src.pipeline)

# Objects created before the reload keep referring to the old class
# definitions; re-run any cell that imports names from these modules or
# constructs objects from them.


In [2]:
# =============================================================================
# STEP 1: Update Data (optional - only if you need fresh gameweek data)
# =============================================================================
# Runs the scraper script as a shell command (IPython `!` syntax), passing the
# gameweek to fetch via --gameweek.
# NOTE(review): `python` here is whatever the shell's PATH resolves to, which
# may not be the interpreter/environment the notebook kernel runs in -- confirm.
# NOTE(review): the gameweek number is hard-coded; update it before each run.
!python scrape_update_data.py --gameweek 25
# Alternative invocation, left disabled.
# NOTE(review): --auto presumably lets the script decide which gameweeks to
# fetch -- confirm against scrape_update_data.py before relying on it.
# !python scrape_update_data.py --auto

FotMob Incremental Data Updater

Season: 2025/2026
Fetching fixture list...
Found 260 completed matches in 2025/2026
Checking existing data in: player_stats_8seasons_20260204_215756.csv
Found 2897 existing match IDs in data
Filtering to gameweeks [25]: 10 matches
  10 new matches to scrape
Matches per gameweek: {25: 10}
[1/10] GW25: Leeds United vs Nottingham Forest (30 players)
[2/10] GW25: Manchester United vs Tottenham Hotspur (32 players)
[3/10] GW25: AFC Bournemouth vs Aston Villa (31 players)
[4/10] GW25: Arsenal vs Sunderland (31 players)
[5/10] GW25: Burnley vs West Ham United (32 players)
[6/10] GW25: Fulham vs Everton (30 players)
[7/10] GW25: Wolverhampton Wanderers vs Chelsea (30 players)
[8/10] GW25: Newcastle United vs Brentford (30 players)
[9/10] GW25: Brighton & Hove Albion vs Crystal Palace (30 players)
[10/10] GW25: Liverpool vs Manchester City (27 players)

Saving data...
Appending to existing file: player_stats_8seasons_20260204_215756.csv
Saved: player_stats_8seas

In [3]:
# =============================================================================
# STEP 2: Run the Pipeline
# =============================================================================
from src.pipeline import FPLPipeline

# Run settings, gathered in one place so they are easy to find and adjust.
DATA_DIR = 'data'
TUNING_TRIALS = 100
TARGET_GAMEWEEK = 26
TARGET_SEASON = '2025/2026'

# Stage 1: build the pipeline on the data directory, load the data and compute
# the features.
pipeline = FPLPipeline(DATA_DIR)
pipeline.load_data()
pipeline.compute_features()

# Stage 2: hyperparameter tuning (with the subprocess option enabled), followed
# by training.
pipeline.tune(n_iter=TUNING_TRIALS, use_subprocess=True)
pipeline.train()

# Stage 3: predictions for the target gameweek of the target season.
predictions = pipeline.predict(gameweek=TARGET_GAMEWEEK, season=TARGET_SEASON)

LOADING DATA
Loading player stats from: player_stats_8seasons_20260214_121154.csv
  Loaded 82,238 player-match records
  Seasons: ['2018/2019', '2019/2020', '2020/2021', '2021/2022', '2022/2023', '2023/2024', '2024/2025', '2025/2026']
Loaded 2,921 fixtures
Filtered to seasons: ['2020/2021', '2021/2022', '2022/2023', '2023/2024', '2024/2025', '2025/2026']
Current season (2025/2026): 511 active players
Final dataset: 61,965 records

COMPUTING FEATURES
Computing rolling features...
  Computed 93 rolling/lifetime features

TUNING HYPERPARAMETERS WITH HOLDOUT TEST SET

Data split (temporal):
  Train: 49,562 samples
  Test:  12,390 samples (most recent 20%)
  Test set spans: ['2024/2025', '2025/2026']

------------------------------------------------------------
PHASE 1: Hyperparameter Tuning (5-fold CV on train set)
------------------------------------------------------------

Tuning GOALS (100 trials, 5-fold CV, MAE) in subprocess...
  Best CV MAE: 0.1899
  Features: 22/26 (87.9%)
  Params

  from .autonotebook import tqdm as notebook_tqdm
[I 2026-02-14 12:16:54,805] A new study created in memory with name: no-name-7eda4295-dd0f-402e-9efa-98ec46fc3809



Tuning GOALS_AGAINST (100 trials, 5-fold CV, Poisson Deviance, feature selection)...
  Team-matches: 3177, Avg conceded: 1.393, Total features: 20


Best trial: 0. Best value: 1.32469:   1%|          | 1/100 [00:02<03:37,  2.19s/it]

[I 2026-02-14 12:16:56,999] Trial 0 finished with value: 1.3246934799199732 and parameters: {'n_features_ratio': 0.6247240713084175, 'n_estimators': 288, 'max_depth': 7, 'learning_rate': 0.07661100707771368, 'min_child_weight': 2}. Best is trial 0 with value: 1.3246934799199732.


Best trial: 1. Best value: 1.24199:   2%|▏         | 2/100 [00:03<02:19,  1.43s/it]

[I 2026-02-14 12:16:57,890] Trial 1 finished with value: 1.2419912006761322 and parameters: {'n_features_ratio': 0.49359671220172163, 'n_estimators': 64, 'max_depth': 8, 'learning_rate': 0.07725378389307355, 'min_child_weight': 8}. Best is trial 1 with value: 1.2419912006761322.


Best trial: 2. Best value: 1.23724:   3%|▎         | 3/100 [00:04<02:07,  1.32s/it]

[I 2026-02-14 12:16:59,081] Trial 2 finished with value: 1.2372406851218878 and parameters: {'n_features_ratio': 0.41235069657748147, 'n_estimators': 293, 'max_depth': 7, 'learning_rate': 0.020589728197687916, 'min_child_weight': 2}. Best is trial 2 with value: 1.2372406851218878.


Best trial: 3. Best value: 1.20796:   4%|▍         | 4/100 [00:05<01:53,  1.18s/it]

[I 2026-02-14 12:17:00,050] Trial 3 finished with value: 1.2079562275650062 and parameters: {'n_features_ratio': 0.5100427059120604, 'n_estimators': 126, 'max_depth': 6, 'learning_rate': 0.04345454109729477, 'min_child_weight': 3}. Best is trial 3 with value: 1.2079562275650062.


Best trial: 4. Best value: 1.18085:   5%|▌         | 5/100 [00:06<01:43,  1.09s/it]

[I 2026-02-14 12:17:00,991] Trial 4 finished with value: 1.1808543867230785 and parameters: {'n_features_ratio': 0.7671117368334277, 'n_estimators': 85, 'max_depth': 4, 'learning_rate': 0.03476649150592621, 'min_child_weight': 5}. Best is trial 4 with value: 1.1808543867230785.


Best trial: 7. Best value: 1.17934:   9%|▉         | 9/100 [00:07<00:40,  2.24it/s]

[I 2026-02-14 12:17:01,874] Trial 5 finished with value: 1.217007033885426 and parameters: {'n_features_ratio': 0.8711055768358081, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.07500118950416987, 'min_child_weight': 1}. Best is trial 4 with value: 1.1808543867230785.
[I 2026-02-14 12:17:01,920] Trial 6 finished with value: 1.2010753345048868 and parameters: {'n_features_ratio': 0.764526911140863, 'n_estimators': 92, 'max_depth': 3, 'learning_rate': 0.2521267904777921, 'min_child_weight': 10}. Best is trial 4 with value: 1.1808543867230785.
[I 2026-02-14 12:17:01,974] Trial 7 finished with value: 1.1793371414516105 and parameters: {'n_features_ratio': 0.8850384088698766, 'n_estimators': 126, 'max_depth': 3, 'learning_rate': 0.1024932221692416, 'min_child_weight': 5}. Best is trial 7 with value: 1.1793371414516105.
[I 2026-02-14 12:17:02,027] Trial 8 finished with value: 1.2099323117444014 and parameters: {'n_features_ratio': 0.47322294090686734, 'n_estimators': 174, 'max_dept

Best trial: 7. Best value: 1.17934:  11%|█         | 11/100 [00:07<00:28,  3.07it/s]

[I 2026-02-14 12:17:02,183] Trial 9 finished with value: 1.2182840736186311 and parameters: {'n_features_ratio': 0.7975133706123891, 'n_estimators': 128, 'max_depth': 6, 'learning_rate': 0.06420330336297862, 'min_child_weight': 2}. Best is trial 7 with value: 1.1793371414516105.
[I 2026-02-14 12:17:02,327] Trial 10 finished with value: 1.1879386413249038 and parameters: {'n_features_ratio': 0.9858561899264302, 'n_estimators': 225, 'max_depth': 4, 'learning_rate': 0.010629965824084665, 'min_child_weight': 6}. Best is trial 7 with value: 1.1793371414516105.


Best trial: 7. Best value: 1.17934:  13%|█▎        | 13/100 [00:07<00:20,  4.33it/s]

[I 2026-02-14 12:17:02,442] Trial 11 finished with value: 1.1793601039075008 and parameters: {'n_features_ratio': 0.9223169824259325, 'n_estimators': 171, 'max_depth': 4, 'learning_rate': 0.031227201869348846, 'min_child_weight': 5}. Best is trial 7 with value: 1.1793371414516105.
[I 2026-02-14 12:17:02,559] Trial 12 finished with value: 1.2227336433771605 and parameters: {'n_features_ratio': 0.9962088134619758, 'n_estimators': 185, 'max_depth': 4, 'learning_rate': 0.14872014672673853, 'min_child_weight': 5}. Best is trial 7 with value: 1.1793371414516105.


Best trial: 7. Best value: 1.17934:  14%|█▍        | 14/100 [00:08<00:18,  4.70it/s]

[I 2026-02-14 12:17:02,725] Trial 13 finished with value: 1.1872667271002857 and parameters: {'n_features_ratio': 0.89690610608159, 'n_estimators': 172, 'max_depth': 5, 'learning_rate': 0.024795803650758556, 'min_child_weight': 7}. Best is trial 7 with value: 1.1793371414516105.
[I 2026-02-14 12:17:02,822] Trial 14 finished with value: 1.2076558794117347 and parameters: {'n_features_ratio': 0.8898657631926836, 'n_estimators': 220, 'max_depth': 3, 'learning_rate': 0.12967848934366005, 'min_child_weight': 4}. Best is trial 7 with value: 1.1793371414516105.


Best trial: 7. Best value: 1.17934:  17%|█▋        | 17/100 [00:08<00:13,  6.35it/s]

[I 2026-02-14 12:17:02,943] Trial 15 finished with value: 1.1940398912951424 and parameters: {'n_features_ratio': 0.653724300827545, 'n_estimators': 143, 'max_depth': 5, 'learning_rate': 0.015674975850890966, 'min_child_weight': 8}. Best is trial 7 with value: 1.1793371414516105.
[I 2026-02-14 12:17:03,070] Trial 16 finished with value: 1.2226621364916042 and parameters: {'n_features_ratio': 0.9343350905123895, 'n_estimators': 222, 'max_depth': 4, 'learning_rate': 0.1256126397869271, 'min_child_weight': 6}. Best is trial 7 with value: 1.1793371414516105.


Best trial: 18. Best value: 1.17346:  19%|█▉        | 19/100 [00:08<00:09,  8.12it/s]

[I 2026-02-14 12:17:03,141] Trial 17 finished with value: 1.1736826505113647 and parameters: {'n_features_ratio': 0.8262975127542996, 'n_estimators': 153, 'max_depth': 3, 'learning_rate': 0.04595859828253945, 'min_child_weight': 4}. Best is trial 17 with value: 1.1736826505113647.
[I 2026-02-14 12:17:03,217] Trial 18 finished with value: 1.17346021940978 and parameters: {'n_features_ratio': 0.8224051642788274, 'n_estimators': 149, 'max_depth': 3, 'learning_rate': 0.047852849994887724, 'min_child_weight': 4}. Best is trial 18 with value: 1.17346021940978.


Best trial: 18. Best value: 1.17346:  22%|██▏       | 22/100 [00:08<00:08,  8.76it/s]

[I 2026-02-14 12:17:03,381] Trial 19 finished with value: 1.2028714652973604 and parameters: {'n_features_ratio': 0.8188469260059139, 'n_estimators': 197, 'max_depth': 5, 'learning_rate': 0.04460205460691788, 'min_child_weight': 4}. Best is trial 18 with value: 1.17346021940978.
[I 2026-02-14 12:17:03,479] Trial 20 finished with value: 1.181470017623186 and parameters: {'n_features_ratio': 0.6813851662092139, 'n_estimators': 266, 'max_depth': 3, 'learning_rate': 0.04738180976312187, 'min_child_weight': 4}. Best is trial 18 with value: 1.17346021940978.
[I 2026-02-14 12:17:03,553] Trial 21 finished with value: 1.179821947405379 and parameters: {'n_features_ratio': 0.8453054337643152, 'n_estimators': 148, 'max_depth': 3, 'learning_rate': 0.09457692440469626, 'min_child_weight': 3}. Best is trial 18 with value: 1.17346021940978.


Best trial: 18. Best value: 1.17346:  24%|██▍       | 24/100 [00:08<00:07,  9.78it/s]

[I 2026-02-14 12:17:03,617] Trial 22 finished with value: 1.174766729737525 and parameters: {'n_features_ratio': 0.7286381280225896, 'n_estimators': 121, 'max_depth': 3, 'learning_rate': 0.05588221526137712, 'min_child_weight': 4}. Best is trial 18 with value: 1.17346021940978.
[I 2026-02-14 12:17:03,714] Trial 23 finished with value: 1.1784814089643603 and parameters: {'n_features_ratio': 0.737706040163319, 'n_estimators': 151, 'max_depth': 4, 'learning_rate': 0.05319534980587915, 'min_child_weight': 4}. Best is trial 18 with value: 1.17346021940978.
[I 2026-02-14 12:17:03,777] Trial 24 finished with value: 1.1787110053563032 and parameters: {'n_features_ratio': 0.7442136805033553, 'n_estimators': 111, 'max_depth': 3, 'learning_rate': 0.03281595413342121, 'min_child_weight': 3}. Best is trial 18 with value: 1.17346021940978.


Best trial: 18. Best value: 1.17346:  26%|██▌       | 26/100 [00:09<00:06, 11.64it/s]

[I 2026-02-14 12:17:03,822] Trial 25 finished with value: 1.2135149993534695 and parameters: {'n_features_ratio': 0.6140223818248427, 'n_estimators': 62, 'max_depth': 3, 'learning_rate': 0.024162232422222683, 'min_child_weight': 1}. Best is trial 18 with value: 1.17346021940978.
[I 2026-02-14 12:17:03,915] Trial 26 finished with value: 1.1847970512171169 and parameters: {'n_features_ratio': 0.5797654459738575, 'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05909049717014485, 'min_child_weight': 7}. Best is trial 18 with value: 1.17346021940978.


Best trial: 18. Best value: 1.17346:  28%|██▊       | 28/100 [00:09<00:06, 10.59it/s]

[I 2026-02-14 12:17:04,047] Trial 27 finished with value: 1.1945309989538084 and parameters: {'n_features_ratio': 0.6892146148207179, 'n_estimators': 162, 'max_depth': 5, 'learning_rate': 0.03909468834836676, 'min_child_weight': 6}. Best is trial 18 with value: 1.17346021940978.
[I 2026-02-14 12:17:04,108] Trial 28 finished with value: 1.208448436680032 and parameters: {'n_features_ratio': 0.8217320632202557, 'n_estimators': 109, 'max_depth': 3, 'learning_rate': 0.015446672129400749, 'min_child_weight': 4}. Best is trial 18 with value: 1.17346021940978.


Best trial: 18. Best value: 1.17346:  30%|███       | 30/100 [00:09<00:07,  8.82it/s]

[I 2026-02-14 12:17:04,354] Trial 29 finished with value: 1.2511497095531625 and parameters: {'n_features_ratio': 0.714833328362861, 'n_estimators': 77, 'max_depth': 8, 'learning_rate': 0.07450991039271992, 'min_child_weight': 2}. Best is trial 18 with value: 1.17346021940978.


Best trial: 18. Best value: 1.17346:  32%|███▏      | 32/100 [00:10<00:10,  6.52it/s]

[I 2026-02-14 12:17:04,742] Trial 30 finished with value: 1.4799822879989903 and parameters: {'n_features_ratio': 0.7810259776053492, 'n_estimators': 249, 'max_depth': 7, 'learning_rate': 0.17970543731335023, 'min_child_weight': 3}. Best is trial 18 with value: 1.17346021940978.
[I 2026-02-14 12:17:04,838] Trial 31 finished with value: 1.1794254727388158 and parameters: {'n_features_ratio': 0.7241626191944243, 'n_estimators': 151, 'max_depth': 4, 'learning_rate': 0.055130229684462184, 'min_child_weight': 4}. Best is trial 18 with value: 1.17346021940978.


Best trial: 34. Best value: 1.1723:  34%|███▍      | 34/100 [00:10<00:08,  7.62it/s] 

[I 2026-02-14 12:17:04,931] Trial 32 finished with value: 1.1793336149790385 and parameters: {'n_features_ratio': 0.8403683481927986, 'n_estimators': 140, 'max_depth': 4, 'learning_rate': 0.04833408892728903, 'min_child_weight': 4}. Best is trial 18 with value: 1.17346021940978.
[I 2026-02-14 12:17:05,000] Trial 33 finished with value: 1.1728030446233182 and parameters: {'n_features_ratio': 0.6162871879088683, 'n_estimators': 160, 'max_depth': 3, 'learning_rate': 0.06575384680141821, 'min_child_weight': 5}. Best is trial 33 with value: 1.1728030446233182.
[I 2026-02-14 12:17:05,054] Trial 34 finished with value: 1.172299395957604 and parameters: {'n_features_ratio': 0.5674597388247817, 'n_estimators': 116, 'max_depth': 3, 'learning_rate': 0.08603342840005666, 'min_child_weight': 5}. Best is trial 34 with value: 1.172299395957604.


Best trial: 38. Best value: 1.17168:  38%|███▊      | 38/100 [00:10<00:05, 10.81it/s]

[I 2026-02-14 12:17:05,120] Trial 35 finished with value: 1.183649040386191 and parameters: {'n_features_ratio': 0.5461060069607167, 'n_estimators': 190, 'max_depth': 3, 'learning_rate': 0.09721711024489317, 'min_child_weight': 7}. Best is trial 34 with value: 1.172299395957604.
[I 2026-02-14 12:17:05,189] Trial 36 finished with value: 1.1743451695264426 and parameters: {'n_features_ratio': 0.6074064315937937, 'n_estimators': 162, 'max_depth': 3, 'learning_rate': 0.06887979631661403, 'min_child_weight': 5}. Best is trial 34 with value: 1.172299395957604.
[I 2026-02-14 12:17:05,229] Trial 37 finished with value: 1.1754310154645427 and parameters: {'n_features_ratio': 0.44088349367465884, 'n_estimators': 52, 'max_depth': 4, 'learning_rate': 0.08458907646816527, 'min_child_weight': 6}. Best is trial 34 with value: 1.172299395957604.
[I 2026-02-14 12:17:05,285] Trial 38 finished with value: 1.1716788339568314 and parameters: {'n_features_ratio': 0.5440857937532909, 'n_estimators': 133, 'ma

Best trial: 38. Best value: 1.17168:  40%|████      | 40/100 [00:10<00:04, 12.28it/s]

[I 2026-02-14 12:17:05,340] Trial 39 finished with value: 1.1728151389047852 and parameters: {'n_features_ratio': 0.5359589219489047, 'n_estimators': 134, 'max_depth': 3, 'learning_rate': 0.03697323186775219, 'min_child_weight': 10}. Best is trial 38 with value: 1.1716788339568314.
[I 2026-02-14 12:17:05,470] Trial 40 finished with value: 1.2114415775772867 and parameters: {'n_features_ratio': 0.5317652393230622, 'n_estimators': 94, 'max_depth': 7, 'learning_rate': 0.02749715722156113, 'min_child_weight': 10}. Best is trial 38 with value: 1.1716788339568314.


Best trial: 38. Best value: 1.17168:  44%|████▍     | 44/100 [00:10<00:04, 12.84it/s]

[I 2026-02-14 12:17:05,531] Trial 41 finished with value: 1.1726566804810759 and parameters: {'n_features_ratio': 0.5695920174006838, 'n_estimators': 137, 'max_depth': 3, 'learning_rate': 0.037662520360682065, 'min_child_weight': 9}. Best is trial 38 with value: 1.1716788339568314.
[I 2026-02-14 12:17:05,597] Trial 42 finished with value: 1.173127560812485 and parameters: {'n_features_ratio': 0.560239457943978, 'n_estimators': 128, 'max_depth': 3, 'learning_rate': 0.0385842978580625, 'min_child_weight': 9}. Best is trial 38 with value: 1.1716788339568314.
[I 2026-02-14 12:17:05,651] Trial 43 finished with value: 1.1940307162329078 and parameters: {'n_features_ratio': 0.48630397686798593, 'n_estimators': 112, 'max_depth': 3, 'learning_rate': 0.019392410724441134, 'min_child_weight': 9}. Best is trial 38 with value: 1.1716788339568314.


Best trial: 46. Best value: 1.17164:  48%|████▊     | 48/100 [00:11<00:04, 11.47it/s]

[I 2026-02-14 12:17:05,905] Trial 44 finished with value: 1.2331837269055501 and parameters: {'n_features_ratio': 0.5146977645059487, 'n_estimators': 135, 'max_depth': 8, 'learning_rate': 0.036445438027268795, 'min_child_weight': 9}. Best is trial 38 with value: 1.1716788339568314.
[I 2026-02-14 12:17:05,969] Trial 45 finished with value: 1.179051642415194 and parameters: {'n_features_ratio': 0.583911790544949, 'n_estimators': 104, 'max_depth': 4, 'learning_rate': 0.028568046027951897, 'min_child_weight': 10}. Best is trial 38 with value: 1.1716788339568314.
[I 2026-02-14 12:17:06,026] Trial 46 finished with value: 1.171643464775696 and parameters: {'n_features_ratio': 0.643667771388664, 'n_estimators': 120, 'max_depth': 3, 'learning_rate': 0.0664819069667193, 'min_child_weight': 8}. Best is trial 46 with value: 1.171643464775696.
[I 2026-02-14 12:17:06,073] Trial 47 finished with value: 1.1719618232705185 and parameters: {'n_features_ratio': 0.647036236172697, 'n_estimators': 81, 'max

Best trial: 46. Best value: 1.17164:  50%|█████     | 50/100 [00:11<00:04, 11.89it/s]

[I 2026-02-14 12:17:06,130] Trial 48 finished with value: 1.1824163336591174 and parameters: {'n_features_ratio': 0.6464640146473034, 'n_estimators': 82, 'max_depth': 4, 'learning_rate': 0.11554407416295971, 'min_child_weight': 8}. Best is trial 46 with value: 1.171643464775696.
[I 2026-02-14 12:17:06,226] Trial 49 finished with value: 1.2144777384853116 and parameters: {'n_features_ratio': 0.6513923649780311, 'n_estimators': 75, 'max_depth': 6, 'learning_rate': 0.08344870627946453, 'min_child_weight': 8}. Best is trial 46 with value: 1.171643464775696.
[I 2026-02-14 12:17:06,274] Trial 50 finished with value: 1.1743428720543263 and parameters: {'n_features_ratio': 0.45421366580140454, 'n_estimators': 94, 'max_depth': 3, 'learning_rate': 0.1089397310626048, 'min_child_weight': 9}. Best is trial 46 with value: 1.171643464775696.


Best trial: 52. Best value: 1.17055:  53%|█████▎    | 53/100 [00:11<00:03, 14.00it/s]

[I 2026-02-14 12:17:06,327] Trial 51 finished with value: 1.1710689684033693 and parameters: {'n_features_ratio': 0.5829933438193209, 'n_estimators': 120, 'max_depth': 3, 'learning_rate': 0.06606952762296296, 'min_child_weight': 7}. Best is trial 51 with value: 1.1710689684033693.
[I 2026-02-14 12:17:06,382] Trial 52 finished with value: 1.1705460142125896 and parameters: {'n_features_ratio': 0.5865237323803713, 'n_estimators': 118, 'max_depth': 3, 'learning_rate': 0.08073855614393495, 'min_child_weight': 7}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:06,439] Trial 53 finished with value: 1.1723534038294263 and parameters: {'n_features_ratio': 0.6363588191608187, 'n_estimators': 118, 'max_depth': 3, 'learning_rate': 0.08319463896979079, 'min_child_weight': 7}. Best is trial 52 with value: 1.1705460142125896.


Best trial: 52. Best value: 1.17055:  57%|█████▋    | 57/100 [00:11<00:02, 15.59it/s]

[I 2026-02-14 12:17:06,502] Trial 54 finished with value: 1.1993002012390166 and parameters: {'n_features_ratio': 0.6711381517011082, 'n_estimators': 96, 'max_depth': 4, 'learning_rate': 0.13639363100172539, 'min_child_weight': 8}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:06,548] Trial 55 finished with value: 1.1734384493212076 and parameters: {'n_features_ratio': 0.5882932648017031, 'n_estimators': 67, 'max_depth': 3, 'learning_rate': 0.07186603656295161, 'min_child_weight': 7}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:06,609] Trial 56 finished with value: 1.1875773123990467 and parameters: {'n_features_ratio': 0.5983864192604815, 'n_estimators': 123, 'max_depth': 3, 'learning_rate': 0.15740178028448387, 'min_child_weight': 8}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:06,673] Trial 57 finished with value: 1.180672014448827 and parameters: {'n_features_ratio': 0.5116111797446703, 'n_estimators': 106, 'm

Best trial: 52. Best value: 1.17055:  61%|██████    | 61/100 [00:12<00:02, 15.88it/s]

[I 2026-02-14 12:17:06,726] Trial 58 finished with value: 1.1728794942567133 and parameters: {'n_features_ratio': 0.5565839424559649, 'n_estimators': 86, 'max_depth': 3, 'learning_rate': 0.06574777178750406, 'min_child_weight': 8}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:06,801] Trial 59 finished with value: 1.1752028782325434 and parameters: {'n_features_ratio': 0.6269466488757464, 'n_estimators': 103, 'max_depth': 4, 'learning_rate': 0.05967070916356803, 'min_child_weight': 6}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:06,854] Trial 60 finished with value: 1.1724404907191275 and parameters: {'n_features_ratio': 0.6649190374480896, 'n_estimators': 87, 'max_depth': 3, 'learning_rate': 0.08689916330831272, 'min_child_weight': 7}. Best is trial 52 with value: 1.1705460142125896.


Best trial: 52. Best value: 1.17055:  65%|██████▌   | 65/100 [00:12<00:02, 16.65it/s]

[I 2026-02-14 12:17:06,913] Trial 61 finished with value: 1.1719150283053783 and parameters: {'n_features_ratio': 0.6416419900776826, 'n_estimators': 128, 'max_depth': 3, 'learning_rate': 0.07897222252153419, 'min_child_weight': 7}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:06,973] Trial 62 finished with value: 1.1759958522568994 and parameters: {'n_features_ratio': 0.6960710295285316, 'n_estimators': 118, 'max_depth': 3, 'learning_rate': 0.07735734235970612, 'min_child_weight': 8}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:07,029] Trial 63 finished with value: 1.1764914337072319 and parameters: {'n_features_ratio': 0.6260366928815094, 'n_estimators': 112, 'max_depth': 3, 'learning_rate': 0.114909303945198, 'min_child_weight': 7}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:07,085] Trial 64 finished with value: 1.1719744843433668 and parameters: {'n_features_ratio': 0.5898081208390397, 'n_estimators': 128, '

Best trial: 67. Best value: 1.17042:  67%|██████▋   | 67/100 [00:12<00:02, 15.71it/s]

[I 2026-02-14 12:17:07,142] Trial 65 finished with value: 1.1720959721342492 and parameters: {'n_features_ratio': 0.598873107287235, 'n_estimators': 127, 'max_depth': 3, 'learning_rate': 0.05134757272458692, 'min_child_weight': 6}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:07,228] Trial 66 finished with value: 1.177090313765802 and parameters: {'n_features_ratio': 0.6668515611415666, 'n_estimators': 139, 'max_depth': 4, 'learning_rate': 0.04282353668464062, 'min_child_weight': 7}. Best is trial 52 with value: 1.1705460142125896.
[I 2026-02-14 12:17:07,286] Trial 67 finished with value: 1.1704165296479707 and parameters: {'n_features_ratio': 0.5249047283486858, 'n_estimators': 127, 'max_depth': 3, 'learning_rate': 0.06400898168822325, 'min_child_weight': 8}. Best is trial 67 with value: 1.1704165296479707.


Best trial: 67. Best value: 1.17042:  71%|███████   | 71/100 [00:12<00:02, 14.18it/s]

[I 2026-02-14 12:17:07,347] Trial 68 finished with value: 1.1720084894240206 and parameters: {'n_features_ratio': 0.4236532376923122, 'n_estimators': 178, 'max_depth': 3, 'learning_rate': 0.0593141066994975, 'min_child_weight': 8}. Best is trial 67 with value: 1.1704165296479707.
[I 2026-02-14 12:17:07,468] Trial 69 finished with value: 1.2336184880395535 and parameters: {'n_features_ratio': 0.5343710660655762, 'n_estimators': 144, 'max_depth': 6, 'learning_rate': 0.07585251803222465, 'min_child_weight': 8}. Best is trial 67 with value: 1.1704165296479707.
[I 2026-02-14 12:17:07,527] Trial 70 finished with value: 1.22849946349702 and parameters: {'n_features_ratio': 0.4917734878159764, 'n_estimators': 158, 'max_depth': 3, 'learning_rate': 0.2942418954462013, 'min_child_weight': 9}. Best is trial 67 with value: 1.1704165296479707.


Best trial: 72. Best value: 1.17027:  73%|███████▎  | 73/100 [00:12<00:01, 14.13it/s]

[I 2026-02-14 12:17:07,591] Trial 71 finished with value: 1.1703835978542592 and parameters: {'n_features_ratio': 0.6353115988791458, 'n_estimators': 129, 'max_depth': 3, 'learning_rate': 0.06294052440575605, 'min_child_weight': 7}. Best is trial 71 with value: 1.1703835978542592.
[I 2026-02-14 12:17:07,669] Trial 72 finished with value: 1.170270860688748 and parameters: {'n_features_ratio': 0.6373291308562673, 'n_estimators': 124, 'max_depth': 3, 'learning_rate': 0.06247961408669571, 'min_child_weight': 7}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:07,739] Trial 73 finished with value: 1.1738339141960723 and parameters: {'n_features_ratio': 0.7040368541976547, 'n_estimators': 132, 'max_depth': 3, 'learning_rate': 0.06267858026006948, 'min_child_weight': 7}. Best is trial 72 with value: 1.170270860688748.


Best trial: 72. Best value: 1.17027:  77%|███████▋  | 77/100 [00:13<00:01, 14.56it/s]

[I 2026-02-14 12:17:07,803] Trial 74 finished with value: 1.1718348813307233 and parameters: {'n_features_ratio': 0.6146166051904014, 'n_estimators': 120, 'max_depth': 3, 'learning_rate': 0.04241996111351691, 'min_child_weight': 7}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:07,859] Trial 75 finished with value: 1.173453632726218 and parameters: {'n_features_ratio': 0.5204772757066742, 'n_estimators': 99, 'max_depth': 3, 'learning_rate': 0.04250512951262096, 'min_child_weight': 6}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:07,935] Trial 76 finished with value: 1.1738520617458694 and parameters: {'n_features_ratio': 0.6132438168219201, 'n_estimators': 121, 'max_depth': 4, 'learning_rate': 0.056889458945317675, 'min_child_weight': 7}. Best is trial 72 with value: 1.170270860688748.


Best trial: 72. Best value: 1.17027:  79%|███████▉  | 79/100 [00:13<00:01, 12.74it/s]

[I 2026-02-14 12:17:08,029] Trial 77 finished with value: 1.172629622655259 and parameters: {'n_features_ratio': 0.548902645726179, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.03321145159987925, 'min_child_weight': 8}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,138] Trial 78 finished with value: 1.202714410372849 and parameters: {'n_features_ratio': 0.5674434346566171, 'n_estimators': 167, 'max_depth': 5, 'learning_rate': 0.0708690775428818, 'min_child_weight': 7}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,202] Trial 79 finished with value: 1.1721897238658288 and parameters: {'n_features_ratio': 0.5783843168399324, 'n_estimators': 144, 'max_depth': 3, 'learning_rate': 0.04125375101084637, 'min_child_weight': 6}. Best is trial 72 with value: 1.170270860688748.


Best trial: 72. Best value: 1.17027:  83%|████████▎ | 83/100 [00:13<00:01, 14.60it/s]

[I 2026-02-14 12:17:08,260] Trial 80 finished with value: 1.171431045659924 and parameters: {'n_features_ratio': 0.6278723782386306, 'n_estimators': 111, 'max_depth': 3, 'learning_rate': 0.04972851350713317, 'min_child_weight': 8}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,319] Trial 81 finished with value: 1.1736708955467192 and parameters: {'n_features_ratio': 0.6795113735648699, 'n_estimators': 114, 'max_depth': 3, 'learning_rate': 0.04574399164584361, 'min_child_weight': 8}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,376] Trial 82 finished with value: 1.1716084868509373 and parameters: {'n_features_ratio': 0.6237155323318142, 'n_estimators': 107, 'max_depth': 3, 'learning_rate': 0.049718566657066504, 'min_child_weight': 7}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,431] Trial 83 finished with value: 1.1714863499093457 and parameters: {'n_features_ratio': 0.6292446106134157, 'n_estimators': 108, 'm

Best trial: 72. Best value: 1.17027:  87%|████████▋ | 87/100 [00:13<00:00, 15.66it/s]

[I 2026-02-14 12:17:08,487] Trial 84 finished with value: 1.1714133999012737 and parameters: {'n_features_ratio': 0.6316629305735126, 'n_estimators': 104, 'max_depth': 3, 'learning_rate': 0.05267785586737971, 'min_child_weight': 9}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,545] Trial 85 finished with value: 1.1719797492095372 and parameters: {'n_features_ratio': 0.6283100180066622, 'n_estimators': 107, 'max_depth': 3, 'learning_rate': 0.05200238589199353, 'min_child_weight': 9}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,611] Trial 86 finished with value: 1.1721233113303222 and parameters: {'n_features_ratio': 0.6009094368247148, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.04932891398116924, 'min_child_weight': 9}. Best is trial 72 with value: 1.170270860688748.


Best trial: 72. Best value: 1.17027:  89%|████████▉ | 89/100 [00:14<00:00, 14.84it/s]

[I 2026-02-14 12:17:08,701] Trial 87 finished with value: 1.1783082229303834 and parameters: {'n_features_ratio': 0.6592277825762828, 'n_estimators': 108, 'max_depth': 4, 'learning_rate': 0.06146808299032755, 'min_child_weight': 10}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,761] Trial 88 finished with value: 1.1739583246234389 and parameters: {'n_features_ratio': 0.6788662626062594, 'n_estimators': 91, 'max_depth': 3, 'learning_rate': 0.05533222731895749, 'min_child_weight': 9}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,831] Trial 89 finished with value: 1.1719781169191914 and parameters: {'n_features_ratio': 0.62627293163289, 'n_estimators': 102, 'max_depth': 3, 'learning_rate': 0.0690586112924612, 'min_child_weight': 9}. Best is trial 72 with value: 1.170270860688748.


Best trial: 72. Best value: 1.17027:  93%|█████████▎| 93/100 [00:14<00:00, 15.06it/s]

[I 2026-02-14 12:17:08,891] Trial 90 finished with value: 1.174907346813089 and parameters: {'n_features_ratio': 0.5787599017591332, 'n_estimators': 91, 'max_depth': 3, 'learning_rate': 0.046827833847860004, 'min_child_weight': 6}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:08,959] Trial 91 finished with value: 1.1715009366386269 and parameters: {'n_features_ratio': 0.6420630099087888, 'n_estimators': 123, 'max_depth': 3, 'learning_rate': 0.0640615846530497, 'min_child_weight': 8}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:09,023] Trial 92 finished with value: 1.1720275257010044 and parameters: {'n_features_ratio': 0.6056969724548988, 'n_estimators': 113, 'max_depth': 3, 'learning_rate': 0.05400032524523387, 'min_child_weight': 8}. Best is trial 72 with value: 1.170270860688748.


Best trial: 72. Best value: 1.17027:  95%|█████████▌| 95/100 [00:14<00:00, 15.10it/s]

[I 2026-02-14 12:17:09,089] Trial 93 finished with value: 1.1728147202780446 and parameters: {'n_features_ratio': 0.7126306176869618, 'n_estimators': 124, 'max_depth': 3, 'learning_rate': 0.06345226489551294, 'min_child_weight': 7}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:09,155] Trial 94 finished with value: 1.1709616782957197 and parameters: {'n_features_ratio': 0.6337467927645641, 'n_estimators': 133, 'max_depth': 3, 'learning_rate': 0.057780103987659176, 'min_child_weight': 8}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:09,232] Trial 95 finished with value: 1.18651154675768 and parameters: {'n_features_ratio': 0.6573782466506346, 'n_estimators': 155, 'max_depth': 3, 'learning_rate': 0.09285352985368993, 'min_child_weight': 9}. Best is trial 72 with value: 1.170270860688748.


Best trial: 72. Best value: 1.17027:  97%|█████████▋| 97/100 [00:14<00:00, 14.51it/s]

[I 2026-02-14 12:17:09,306] Trial 96 finished with value: 1.1757759609053287 and parameters: {'n_features_ratio': 0.6868509730878652, 'n_estimators': 138, 'max_depth': 3, 'learning_rate': 0.07205175065302227, 'min_child_weight': 8}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:09,395] Trial 97 finished with value: 1.1816852426417104 and parameters: {'n_features_ratio': 0.6332781364849133, 'n_estimators': 148, 'max_depth': 4, 'learning_rate': 0.058690853214757435, 'min_child_weight': 10}. Best is trial 72 with value: 1.170270860688748.


Best trial: 72. Best value: 1.17027: 100%|██████████| 100/100 [00:14<00:00,  6.74it/s]


[I 2026-02-14 12:17:09,574] Trial 98 finished with value: 1.235088756406433 and parameters: {'n_features_ratio': 0.589898711682034, 'n_estimators': 131, 'max_depth': 7, 'learning_rate': 0.06649908896513794, 'min_child_weight': 9}. Best is trial 72 with value: 1.170270860688748.
[I 2026-02-14 12:17:09,644] Trial 99 finished with value: 1.17798479575969 and parameters: {'n_features_ratio': 0.7533468449733057, 'n_estimators': 116, 'max_depth': 3, 'learning_rate': 0.10441634096671741, 'min_child_weight': 8}. Best is trial 72 with value: 1.170270860688748.
  Best CV Poisson Deviance: 1.1703
  Features: 12/20 (63.7%)
  Params: {'n_estimators': 124, 'max_depth': 3, 'learning_rate': 0.06247961408669571, 'min_child_weight': 7}

------------------------------------------------------------
PHASE 2: Evaluation on Held-Out Test Set
------------------------------------------------------------

Model           Metric          Test Value   MAE         
-------------------------------------------------

  self.df['opponent_norm'] = self.df['opponent'].apply(normalize_name)


  Mapped pred_team_goals to 61,965/61,965 player rows
  Mean pred_team_goals: 1.365
Training GoalsModel on 61,952 samples (23 features, tuned selection)...
  Target mean: 0.1073
  MAE: 0.1897
Training AssistsModel on 61,952 samples (15 features, tuned selection)...
  Target mean: 0.0763
  MAE: 0.1450
Training DefconModel on 61,952 samples...
  Mean defcon/90: 9.79
  MAE: 4.16
Training BonusModel (Monte Carlo, 1000 sims)...
  Estimating baseline BPS from stats (no actual BPS data)
Training BaselineBPSModel on 44282 samples...
  Features used: 30
  Avg baseline BPS: 18.0
  MAE: 0.67
  Loaded FPL availability for 1603 players

PREDICTING GW26 (2025/2026)
Found 511 active players with historical data
  Filtered out 77 unavailable players (injured/suspended)
Matched 434 players to GW26 fixtures

Saved predictions to: data\predictions\gw26_2025-2026.csv
Total players: 434


In [4]:
# =============================================================================
# STEP 3: View Top Players
# =============================================================================
# Columns that make up the per-player prediction breakdown, in display order.
display_cols = [
    'player_name', 'team', 'fpl_position', 'opponent', 'is_home',
    'pred_minutes',
    'pred_exp_goals', 'pred_exp_assists',
    'pred_cs_prob', 'pred_2plus_conceded', 'pred_defcon_prob',
    'pred_bonus', 'exp_total_pts',
]
# Restrict to the columns this predictions frame actually carries, so a
# missing optional column does not raise a KeyError on selection.
available_cols = [name for name in display_cols if name in predictions.columns]

# The 20 highest expected-points rows, rounded for readability.
top_by_points = predictions.nlargest(20, 'exp_total_pts')
top_by_points[available_cols].round(2)

Unnamed: 0,player_name,team,fpl_position,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,pred_cs_prob,pred_2plus_conceded,pred_defcon_prob,pred_bonus,exp_total_pts
39,Mohamed Salah,Liverpool,MID,Sunderland,0,85.919998,0.42,0.28,0.25,0.41,0.0,0.76,5.95
144,Ismaila Sarr,Crystal Palace,MID,Burnley,1,87.300003,0.41,0.17,0.39,0.25,0.02,0.56,5.52
52,Bruno Fernandes,Manchester United,MID,West Ham,0,87.519997,0.32,0.21,0.29,0.36,0.21,0.55,5.47
130,Tammy Abraham,Aston Villa,FWD,Brighton,1,76.480003,0.56,0.03,0.29,0.35,0.0,1.1,5.43
292,Riccardo Calafiori,Arsenal,DEF,Brentford,0,77.199997,0.09,0.03,0.5,0.15,0.18,0.41,5.26
265,Yéremi Pino,Crystal Palace,MID,Burnley,1,83.620003,0.23,0.36,0.39,0.25,0.06,0.49,5.25
161,Gabriel,Arsenal,DEF,Brentford,0,88.0,0.08,0.02,0.5,0.15,0.33,0.13,5.17
231,Maxence Lacroix,Crystal Palace,DEF,Burnley,1,89.18,0.06,0.04,0.39,0.25,0.6,0.13,5.13
219,Bryan Mbeumo,Manchester United,MID,West Ham,0,88.0,0.4,0.14,0.29,0.36,0.01,0.38,5.11
166,Reece James,Chelsea,DEF,Leeds,1,81.769997,0.08,0.1,0.45,0.19,0.25,0.21,5.1


In [5]:
# =============================================================================
# Top 10 by Position
# =============================================================================
import pandas as pd


def _print_banner(title):
    """Print `title` framed above and below by a 70-character '=' rule."""
    print(f"\n{'='*70}")
    print(title)
    print(f"{'='*70}")


def _rows_for_position(position):
    """Return an independent copy of the prediction rows for one FPL position."""
    return predictions.loc[predictions['fpl_position'] == position].copy()


def _with_goal_involvement(frame):
    """Return `frame` with a 'g+a' column: expected goals plus expected assists."""
    return frame.assign(**{'g+a': frame['pred_exp_goals'] + frame['pred_exp_assists']})


# --- FWD & MID: attackers are ordered by expected goal involvement (G+A) ---
cols = ['player_name', 'team', 'opponent', 'is_home',
        'pred_minutes', 'pred_exp_goals', 'pred_exp_assists', 'g+a', 'exp_total_pts']
for pos in ['FWD', 'MID']:
    df_pos = _with_goal_involvement(_rows_for_position(pos))
    # Only show the columns that exist on this frame.
    available = [name for name in cols if name in df_pos.columns]
    _print_banner(f"  TOP 10 {pos} (ranked by expected G+A)")
    display(df_pos.nlargest(10, 'g+a')[available].round(2))

# --- DEF: ordered by expected total points; clean-sheet, defensive
# --- contribution and 2+ conceded columns are shown next to G+A ---
df_def = _with_goal_involvement(_rows_for_position('DEF'))
cols_def = ['player_name', 'team', 'opponent', 'is_home',
            'pred_minutes', 'pred_exp_goals', 'pred_exp_assists', 'g+a',
            'pred_cs_prob', 'pred_2plus_conceded', 'pred_defcon_prob', 'exp_total_pts']
available_def = [name for name in cols_def if name in df_def.columns]
_print_banner("  TOP 10 DEF (ranked by exp total pts)")
display(df_def.nlargest(10, 'exp_total_pts')[available_def].round(2))

# --- GK: ordered by expected total points; no G+A column for keepers ---
df_gk = _rows_for_position('GK')
cols_gk = ['player_name', 'team', 'opponent', 'is_home',
           'pred_minutes', 'pred_cs_prob', 'pred_2plus_conceded', 'pred_goals_against', 'exp_total_pts']
available_gk = [name for name in cols_gk if name in df_gk.columns]
_print_banner("  TOP 10 GK (ranked by exp total pts)")
display(df_gk.nlargest(10, 'exp_total_pts')[available_gk].round(2))


  TOP 10 FWD (ranked by expected G+A)


Unnamed: 0,player_name,team,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,g+a,exp_total_pts
130,Tammy Abraham,Aston Villa,Brighton,1,76.480003,0.56,0.03,0.59,5.43
126,Erling Haaland,Manchester City,Fulham,1,86.529999,0.44,0.1,0.54,4.89
72,Ollie Watkins,Aston Villa,Brighton,1,85.790001,0.42,0.07,0.5,4.73
331,Hugo Ekitike,Liverpool,Sunderland,0,76.690002,0.31,0.12,0.43,4.3
290,Evanilson,Bournemouth,Everton,0,82.470001,0.34,0.08,0.42,4.3
38,Raul Jiménez,Fulham,Man City,0,82.589996,0.29,0.08,0.37,4.0
66,Jarrod Bowen,West Ham United,Man Utd,1,88.0,0.25,0.11,0.36,3.93
366,Igor Thiago,Brentford,Arsenal,1,88.07,0.32,0.04,0.36,4.12
171,Jørgen Strand Larsen,Crystal Palace,Burnley,1,67.449997,0.3,0.06,0.36,4.02
258,Joao Pedro,Chelsea,Leeds,1,72.75,0.26,0.08,0.34,3.85



  TOP 10 MID (ranked by expected G+A)


Unnamed: 0,player_name,team,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,g+a,exp_total_pts
39,Mohamed Salah,Liverpool,Sunderland,0,85.919998,0.42,0.28,0.7,5.95
265,Yéremi Pino,Crystal Palace,Burnley,1,83.620003,0.23,0.36,0.59,5.25
144,Ismaila Sarr,Crystal Palace,Burnley,1,87.300003,0.41,0.17,0.58,5.52
219,Bryan Mbeumo,Manchester United,West Ham,0,88.0,0.4,0.14,0.54,5.11
52,Bruno Fernandes,Manchester United,West Ham,0,87.519997,0.32,0.21,0.53,5.47
244,Bukayo Saka,Arsenal,Brentford,0,81.059998,0.29,0.21,0.5,5.01
315,Florian Wirtz,Liverpool,Sunderland,0,84.620003,0.21,0.25,0.47,4.4
288,Cole Palmer,Chelsea,Leeds,1,78.370003,0.36,0.08,0.44,4.98
194,Matheus Cunha,Manchester United,West Ham,0,79.360001,0.34,0.1,0.44,4.73
165,Cody Gakpo,Liverpool,Sunderland,0,83.610001,0.3,0.14,0.44,4.53



  TOP 10 DEF (ranked by exp total pts)


Unnamed: 0,player_name,team,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,g+a,pred_cs_prob,pred_2plus_conceded,pred_defcon_prob,exp_total_pts
292,Riccardo Calafiori,Arsenal,Brentford,0,77.199997,0.09,0.03,0.13,0.5,0.15,0.18,5.26
161,Gabriel,Arsenal,Brentford,0,88.0,0.08,0.02,0.1,0.5,0.15,0.33,5.17
231,Maxence Lacroix,Crystal Palace,Burnley,1,89.18,0.06,0.04,0.1,0.39,0.25,0.6,5.13
166,Reece James,Chelsea,Leeds,1,81.769997,0.08,0.1,0.18,0.45,0.19,0.25,5.1
160,Trevoh Chalobah,Chelsea,Leeds,1,88.220001,0.04,0.02,0.06,0.45,0.19,0.54,5.08
304,Tyrick Mitchell,Crystal Palace,Burnley,1,88.0,0.05,0.09,0.14,0.39,0.25,0.44,5.04
232,Chris Richards,Crystal Palace,Burnley,1,88.300003,0.06,0.04,0.1,0.39,0.25,0.53,5.04
201,Marc Cucurella,Chelsea,Leeds,1,85.449997,0.05,0.08,0.13,0.45,0.19,0.34,4.99
27,James Tarkowski,Everton,Bournemouth,1,89.230003,0.07,0.06,0.13,0.28,0.36,0.69,4.91
312,Piero Hincapié,Arsenal,Brentford,0,81.040001,0.02,0.06,0.07,0.5,0.15,0.35,4.89



  TOP 10 GK (ranked by exp total pts)


Unnamed: 0,player_name,team,opponent,is_home,pred_minutes,pred_cs_prob,pred_2plus_conceded,pred_goals_against,exp_total_pts
77,David Raya,Arsenal,Brentford,0,89.239998,0.5,0.15,0.7,4.07
155,Robert Sánchez,Chelsea,Leeds,1,89.32,0.45,0.19,0.79,3.94
336,Filip Jörgensen,Chelsea,Leeds,1,87.669998,0.45,0.19,0.79,3.84
121,Dean Henderson,Crystal Palace,Burnley,1,89.18,0.39,0.25,0.95,3.67
22,Alphonse Areola,West Ham United,Man Utd,1,89.190002,0.34,0.29,1.06,3.24
250,Mads Hermansen,West Ham United,Man Utd,1,89.230003,0.34,0.29,1.06,3.23
42,Jordan Pickford,Everton,Bournemouth,1,89.230003,0.28,0.36,1.27,3.14
94,Lucas Perri,Leeds United,Chelsea,0,89.230003,0.32,0.31,1.12,3.12
16,Marco Bizot,Aston Villa,Brighton,1,89.230003,0.29,0.35,1.23,3.11
9,Karl Darlow,Leeds United,Chelsea,0,89.230003,0.32,0.31,1.12,3.11


In [6]:
# Spot check: rows whose player name contains "Chal" (case-insensitive;
# missing names count as non-matches), shown with the same breakdown columns.
name_matches = predictions['player_name'].str.contains('Chal', case=False, na=False)
predictions.loc[name_matches, available_cols].round(2)


Unnamed: 0,player_name,team,fpl_position,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,pred_cs_prob,pred_2plus_conceded,pred_defcon_prob,pred_bonus,exp_total_pts
160,Trevoh Chalobah,Chelsea,DEF,Leeds,1,88.220001,0.04,0.02,0.45,0.19,0.54,0.06,5.08


In [7]:
# =============================================================================
# Arsenal xG Analysis: Actual Last 5 Games vs Computed Rolling Features
# =============================================================================
# Sanity check: compare Arsenal's raw per-match xG for/against over their five
# most recent gameweeks with the rolling team features attached to an Arsenal
# row in `predictions`.

analysis_season = '2025/2026'

# All Arsenal player rows for the season under analysis
arsenal_matches = pipeline.df[
    (pipeline.df['team'].str.contains('Arsenal', case=False, na=False)) &
    (pipeline.df['season'] == analysis_season)
].copy()

# Collapse player rows to one row per match (team-level totals), newest first
arsenal_team = arsenal_matches.groupby(['team', 'opponent', 'season', 'gameweek', 'is_home']).agg({
    'xg': 'sum',  # Team's total xG for the match
    'goals': 'sum',  # Goals credited to the team's players
}).reset_index().sort_values('gameweek', ascending=False)

# xGA = what the other side generated against Arsenal. Read it from the rows
# whose *opponent* is Arsenal instead of joining `team` names onto `opponent`
# names: the two columns spell some clubs differently (e.g. 'Bournemouth' vs
# 'AFC Bournemouth', 'Brighton and Hove Albion' vs 'Brighton & Hove Albion'),
# so a name-based join silently leaves xGA as NaN for those fixtures.
vs_arsenal = pipeline.df[
    (pipeline.df['opponent'].str.contains('Arsenal', case=False, na=False)) &
    (pipeline.df['season'] == analysis_season)
]
opp_stats = vs_arsenal.groupby(['season', 'gameweek', 'is_home']).agg({
    'xg': 'sum',
    'goals': 'sum'
}).reset_index()
opp_stats = opp_stats.rename(columns={'xg': 'xga', 'goals': 'goals_conceded'})
# Flip the venue flag to Arsenal's point of view so it can serve as a join key
# (keeps two fixtures in the same gameweek apart when their venues differ).
# NOTE(review): assumes is_home is stored as 0/1 integers - confirm.
opp_stats['is_home'] = 1 - opp_stats['is_home']

arsenal_team = arsenal_team.merge(opp_stats, on=['season', 'gameweek', 'is_home'], how='left')

print("=" * 70)
print("ARSENAL'S LAST 5 GAMES - ACTUAL xG FOR AND AGAINST")
print("=" * 70)
last5 = arsenal_team.head(5)[['gameweek', 'opponent', 'is_home', 'xg', 'goals', 'xga', 'goals_conceded']]
last5.columns = ['GW', 'Opponent', 'Home', 'xG For', 'Goals', 'xGA (opp xG)', 'Conceded']
print(last5.to_string(index=False))
print(f"\n--- Last 5 Game Averages (what roll5 should approximate) ---")
print(f"  Avg xG For:     {last5['xG For'].mean():.3f}")
print(f"  Avg xGA:        {last5['xGA (opp xG)'].mean():.3f}")
print(f"  Avg Goals:      {last5['Goals'].mean():.2f}")
print(f"  Avg Conceded:   {last5['Conceded'].mean():.2f}")

# Now show the rolling features carried on the prediction rows. The banner no
# longer names a gameweek: a hard-coded number goes stale as soon as the
# pipeline is re-run for a later gameweek.
print("\n" + "=" * 70)
print("COMPUTED ROLLING FEATURES (grabbed by model for the latest prediction run)")
print("=" * 70)

# Any Arsenal row will do: these are team-level features
arsenal_rows = predictions[predictions['team'].str.contains('Arsenal', case=False, na=False)]
if arsenal_rows.empty:
    print("\nNo Arsenal players found in predictions - nothing to compare.")
else:
    arsenal_pred = arsenal_rows.iloc[0]
    print(f"\nArsenal player sampled: {arsenal_pred['player_name']}")

    # (heading, column prefix) for each family of rolling features to report
    rolling_sections = [
        ("--- Team xG Rolling (Arsenal's attacking output) ---", 'team_xg_roll'),
        ("--- Team xGA Rolling (what opponents generate vs Arsenal) ---", 'team_xga_roll'),
        ("--- Team Goals Conceded Rolling ---", 'team_conceded_roll'),
    ]
    for heading, prefix in rolling_sections:
        print(f"\n{heading}")
        for window in [1, 3, 5, 10, 30]:
            col = f'{prefix}{window}'
            # Not every window is necessarily present on the prediction rows
            if col in arsenal_pred.index:
                print(f"  {col}: {arsenal_pred[col]:.3f}")

ARSENAL'S LAST 5 GAMES - ACTUAL xG FOR AND AGAINST
  GW          Opponent  Home  xG For  Goals  xGA (opp xG)  Conceded
25.0        Sunderland     1    1.49    3.0          0.18       0.0
24.0      Leeds United     0    2.20    4.0          0.15       0.0
23.0 Manchester United     1    1.20    1.0          0.70       3.0
22.0 Nottingham Forest     0    2.10    0.0          0.34       0.0
21.0         Liverpool     1    0.57    0.0          0.31       0.0

--- Last 5 Game Averages (what roll5 should approximate) ---
  Avg xG For:     1.512
  Avg xGA:        0.336
  Avg Goals:      1.60
  Avg Conceded:   0.60

COMPUTED ROLLING FEATURES (grabbed by model for GW25 prediction)

Arsenal player sampled: Christian Nørgaard

--- Team xG Rolling (Arsenal's attacking output) ---
  team_xg_roll5: 1.440
  team_xg_roll10: 1.866

--- Team xGA Rolling (what opponents generate vs Arsenal) ---
  team_xga_roll1: 0.150
  team_xga_roll3: 0.397
  team_xga_roll5: 0.586
  team_xga_roll10: 0.994
  team_xga_rol

In [8]:
# =============================================================================
# DIAGNOSTIC: Investigate xGA computation bug
# =============================================================================
# Two checks:
#   1. Are clubs spelled the same way in the `team` and `opponent` columns?
#   2. For one fixture, what does each side's own rows say about xG and goals?
#      (Arsenal's xGA for that match is the other side's xG.)

DIAG_SEASON = '2025/2026'
DIAG_GAMEWEEK = 20


def _print_club_spellings(names):
    """Print each name containing 'bournemouth' or 'arsenal' (case-insensitive), sorted."""
    for name in sorted(names):
        lowered = name.lower()
        if 'bournemouth' in lowered or 'arsenal' in lowered:
            print(f"  '{name}'")


def _fixture_totals(team_pattern, opponent_pattern):
    """Sum xg and goals over the rows of `team_pattern` facing `opponent_pattern`.

    Both patterns are matched case-insensitively as substrings; rows with a
    missing team or opponent count as non-matches. Only rows from
    DIAG_SEASON / DIAG_GAMEWEEK are considered.

    Returns a DataFrame with columns team, opponent, gameweek, xg, goals
    (empty when nothing matches).
    """
    frame = pipeline.df
    in_fixture = (
        frame['team'].str.contains(team_pattern, case=False, na=False)
        & frame['opponent'].str.contains(opponent_pattern, case=False, na=False)
        & (frame['gameweek'] == DIAG_GAMEWEEK)
        & (frame['season'] == DIAG_SEASON)
    )
    return (
        frame[in_fixture]
        .groupby(['team', 'opponent', 'gameweek'])
        .agg({'xg': 'sum', 'goals': 'sum'})
        .reset_index()
    )


# Check team name variations in the data (missing names are dropped so that
# sorting and lower-casing cannot fail on NaN)
print("Team name variations:")
teams = pipeline.df['team'].dropna().unique()
_print_club_spellings(teams)

print("\nOpponent name variations:")
opponents = pipeline.df['opponent'].dropna().unique()
_print_club_spellings(opponents)

# Check the Arsenal vs Bournemouth game from both perspectives
print("\n" + "=" * 70)
print(f"Arsenal vs Bournemouth GW{DIAG_GAMEWEEK} - BOTH TEAM PERSPECTIVES")
print("=" * 70)

# Arsenal's view (team=Arsenal)
arsenal_view = _fixture_totals('Arsenal', 'Bournemouth')
print("\nArsenal's perspective (team=Arsenal):")
print(arsenal_view)

# Bournemouth's view (team=Bournemouth)
bournemouth_view = _fixture_totals('Bournemouth', 'Arsenal')
print("\nBournemouth's perspective (team=Bournemouth):")
print(bournemouth_view)

# The xGA for Arsenal SHOULD be Bournemouth's xG (what Bournemouth generated against Arsenal)
if len(bournemouth_view) > 0:
    print(f"\n>>> Arsenal's xGA for GW{DIAG_GAMEWEEK} should be: {bournemouth_view['xg'].values[0]:.3f}")
else:
    print("\n>>> ERROR: No Bournemouth perspective found! Team name mismatch?")

Team name variations:
  'Arsenal'
  'Bournemouth'

Opponent name variations:
  'AFC Bournemouth'
  'Arsenal'

Arsenal vs Bournemouth GW20 - BOTH TEAM PERSPECTIVES

Arsenal's perspective (team=Arsenal):
      team         opponent  gameweek    xg  goals
0  Arsenal  AFC Bournemouth      20.0  1.13    3.0

Bournemouth's perspective (team=Bournemouth):
          team opponent  gameweek    xg  goals
0  Bournemouth  Arsenal      20.0  1.43    2.0

>>> Arsenal's xGA for GW20 should be: 1.430


In [9]:
# Quick check: what format do team names actually use?
# Prints the ten most frequent values of each column so the spellings used in
# `team` and `opponent` can be compared side by side.
for heading, column in (
    ("Sample team names:", 'team'),
    ("\nSample opponent names:", 'opponent'),
):
    print(heading)
    print(pipeline.df[column].value_counts().head(10))

Sample team names:
team
Brighton and Hove Albion    3168
Liverpool                   3136
Tottenham Hotspur           3130
Manchester United           3117
Wolverhampton Wanderers     3113
Chelsea                     3111
Newcastle United            3109
Aston Villa                 3106
Arsenal                     3100
Crystal Palace              3022
Name: count, dtype: int64

Sample opponent names:
opponent
Manchester City            3146
Wolverhampton Wanderers    3117
Tottenham Hotspur          3109
Aston Villa                3106
Brighton & Hove Albion     3105
Chelsea                    3101
Crystal Palace             3101
West Ham United            3092
Manchester United          3092
Newcastle United           3090
Name: count, dtype: int64
