In [1]:
# =============================================================================
# STEP 0: Force reload modules (run this first after code changes!)
# =============================================================================
import importlib
import src.data_loader
import src.features
import src.pipeline
import src.models.goals
import src.models.base


In [2]:
# =============================================================================
# STEP 1: Update Data (optional - only if you need fresh gameweek data)
# =============================================================================
!python scrape_update_data.py --gameweek 24
# !python scrape_update_data.py --auto

FotMob Incremental Data Updater

Season: 2025/2026
Fetching fixture list...
Found 240 completed matches in 2025/2026
Checking existing data in: player_stats_8seasons_20260204_215756.csv
Found 2897 existing match IDs in data
Filtering to gameweeks [24]: 10 matches
  Skipping 10 matches already in data
  0 new matches to scrape
No matches to scrape!


In [3]:
# =============================================================================
# STEP 2: Run the Pipeline
# =============================================================================
from src.pipeline import FPLPipeline

pipeline = FPLPipeline('data')
pipeline.load_data()
pipeline.compute_features()

pipeline.tune(n_iter=100, use_subprocess=True)
pipeline.train()
predictions = pipeline.predict(gameweek=25, season='2025/2026')

LOADING DATA
Loading player stats from: player_stats_8seasons_20260204_215756.csv
  Loaded 81,935 player-match records
  Removed 302 duplicate rows
  Seasons: ['2018/2019', '2019/2020', '2020/2021', '2021/2022', '2022/2023', '2023/2024', '2024/2025', '2025/2026']
Loaded 2,901 fixtures
  Removed 240 duplicate fixtures
Filtered to seasons: ['2020/2021', '2021/2022', '2022/2023', '2023/2024', '2024/2025', '2025/2026']
Current season (2025/2026): 501 active players
Final dataset: 61,662 records

COMPUTING FEATURES
Computing rolling features...
  Computed 93 rolling/lifetime features

TUNING HYPERPARAMETERS WITH HOLDOUT TEST SET

Data split (temporal):
  Train: 49,320 samples
  Test:  12,329 samples (most recent 20%)
  Test set spans: ['2024/2025', '2025/2026']

------------------------------------------------------------
PHASE 1: Hyperparameter Tuning (5-fold CV on train set)
------------------------------------------------------------

Tuning GOALS (100 trials, 5-fold CV, Poisson Deviance

  from .autonotebook import tqdm as notebook_tqdm
[I 2026-02-06 14:53:47,349] A new study created in memory with name: no-name-616d7ea5-8421-4854-b48a-c3c212051698



Tuning GOALS_AGAINST (100 trials, 5-fold CV, Poisson Deviance, feature selection)...
  Team-matches: 3159, Avg conceded: 1.393, Total features: 20


Best trial: 0. Best value: 1.31976:   1%|          | 1/100 [00:01<03:14,  1.97s/it]

[I 2026-02-06 14:53:49,315] Trial 0 finished with value: 1.3197606092870913 and parameters: {'n_features_ratio': 0.6247240713084175, 'n_estimators': 288, 'max_depth': 7, 'learning_rate': 0.07661100707771368, 'min_child_weight': 2}. Best is trial 0 with value: 1.3197606092870913.


Best trial: 1. Best value: 1.24714:   2%|▏         | 2/100 [00:02<02:08,  1.32s/it]

[I 2026-02-06 14:53:50,173] Trial 1 finished with value: 1.2471435323188296 and parameters: {'n_features_ratio': 0.49359671220172163, 'n_estimators': 64, 'max_depth': 8, 'learning_rate': 0.07725378389307355, 'min_child_weight': 8}. Best is trial 1 with value: 1.2471435323188296.


Best trial: 2. Best value: 1.23519:   3%|▎         | 3/100 [00:03<01:57,  1.21s/it]

[I 2026-02-06 14:53:51,263] Trial 2 finished with value: 1.2351933526724936 and parameters: {'n_features_ratio': 0.41235069657748147, 'n_estimators': 293, 'max_depth': 7, 'learning_rate': 0.020589728197687916, 'min_child_weight': 2}. Best is trial 2 with value: 1.2351933526724936.


Best trial: 3. Best value: 1.21045:   4%|▍         | 4/100 [00:04<01:42,  1.07s/it]

[I 2026-02-06 14:53:52,118] Trial 3 finished with value: 1.210450366246293 and parameters: {'n_features_ratio': 0.5100427059120604, 'n_estimators': 126, 'max_depth': 6, 'learning_rate': 0.04345454109729477, 'min_child_weight': 3}. Best is trial 3 with value: 1.210450366246293.


Best trial: 4. Best value: 1.18241:   5%|▌         | 5/100 [00:05<01:32,  1.02it/s]

[I 2026-02-06 14:53:52,930] Trial 4 finished with value: 1.1824078768149209 and parameters: {'n_features_ratio': 0.7671117368334277, 'n_estimators': 85, 'max_depth': 4, 'learning_rate': 0.03476649150592621, 'min_child_weight': 5}. Best is trial 4 with value: 1.1824078768149209.


Best trial: 7. Best value: 1.1815:   9%|▉         | 9/100 [00:06<00:36,  2.51it/s] 

[I 2026-02-06 14:53:53,696] Trial 5 finished with value: 1.2113966104281286 and parameters: {'n_features_ratio': 0.8711055768358081, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.07500118950416987, 'min_child_weight': 1}. Best is trial 4 with value: 1.1824078768149209.
[I 2026-02-06 14:53:53,743] Trial 6 finished with value: 1.2015006983867558 and parameters: {'n_features_ratio': 0.764526911140863, 'n_estimators': 92, 'max_depth': 3, 'learning_rate': 0.2521267904777921, 'min_child_weight': 10}. Best is trial 4 with value: 1.1824078768149209.
[I 2026-02-06 14:53:53,792] Trial 7 finished with value: 1.1815047472023545 and parameters: {'n_features_ratio': 0.8850384088698766, 'n_estimators': 126, 'max_depth': 3, 'learning_rate': 0.1024932221692416, 'min_child_weight': 5}. Best is trial 7 with value: 1.1815047472023545.
[I 2026-02-06 14:53:53,844] Trial 8 finished with value: 1.2297570108713303 and parameters: {'n_features_ratio': 0.47322294090686734, 'n_estimators': 174, 'max_dep

Best trial: 7. Best value: 1.1815:  11%|█         | 11/100 [00:06<00:25,  3.43it/s]

[I 2026-02-06 14:53:53,982] Trial 9 finished with value: 1.217230571552888 and parameters: {'n_features_ratio': 0.7975133706123891, 'n_estimators': 128, 'max_depth': 6, 'learning_rate': 0.06420330336297862, 'min_child_weight': 2}. Best is trial 7 with value: 1.1815047472023545.
[I 2026-02-06 14:53:54,119] Trial 10 finished with value: 1.1914256965408616 and parameters: {'n_features_ratio': 0.9858561899264302, 'n_estimators': 225, 'max_depth': 4, 'learning_rate': 0.010629965824084665, 'min_child_weight': 6}. Best is trial 7 with value: 1.1815047472023545.


Best trial: 11. Best value: 1.17964:  13%|█▎        | 13/100 [00:06<00:18,  4.82it/s]

[I 2026-02-06 14:53:54,223] Trial 11 finished with value: 1.1796405656255644 and parameters: {'n_features_ratio': 0.9223169824259325, 'n_estimators': 171, 'max_depth': 4, 'learning_rate': 0.031227201869348846, 'min_child_weight': 5}. Best is trial 11 with value: 1.1796405656255644.
[I 2026-02-06 14:53:54,330] Trial 12 finished with value: 1.2230625401916972 and parameters: {'n_features_ratio': 0.9962088134619758, 'n_estimators': 185, 'max_depth': 4, 'learning_rate': 0.14872014672673853, 'min_child_weight': 5}. Best is trial 11 with value: 1.1796405656255644.


Best trial: 11. Best value: 1.17964:  14%|█▍        | 14/100 [00:07<00:16,  5.23it/s]

[I 2026-02-06 14:53:54,480] Trial 13 finished with value: 1.1897609163238339 and parameters: {'n_features_ratio': 0.89690610608159, 'n_estimators': 172, 'max_depth': 5, 'learning_rate': 0.024795803650758556, 'min_child_weight': 7}. Best is trial 11 with value: 1.1796405656255644.
[I 2026-02-06 14:53:54,567] Trial 14 finished with value: 1.2072392093451323 and parameters: {'n_features_ratio': 0.8898657631926836, 'n_estimators': 220, 'max_depth': 3, 'learning_rate': 0.12967848934366005, 'min_child_weight': 4}. Best is trial 11 with value: 1.1796405656255644.


Best trial: 11. Best value: 1.17964:  17%|█▋        | 17/100 [00:07<00:12,  6.89it/s]

[I 2026-02-06 14:53:54,683] Trial 15 finished with value: 1.1972867921000685 and parameters: {'n_features_ratio': 0.653724300827545, 'n_estimators': 143, 'max_depth': 5, 'learning_rate': 0.015674975850890966, 'min_child_weight': 8}. Best is trial 11 with value: 1.1796405656255644.
[I 2026-02-06 14:53:54,806] Trial 16 finished with value: 1.2313474907204682 and parameters: {'n_features_ratio': 0.9343350905123895, 'n_estimators': 222, 'max_depth': 4, 'learning_rate': 0.1256126397869271, 'min_child_weight': 6}. Best is trial 11 with value: 1.1796405656255644.


Best trial: 17. Best value: 1.17305:  19%|█▉        | 19/100 [00:07<00:10,  7.65it/s]

[I 2026-02-06 14:53:54,875] Trial 17 finished with value: 1.1730457337357207 and parameters: {'n_features_ratio': 0.8262975127542996, 'n_estimators': 153, 'max_depth': 3, 'learning_rate': 0.04595859828253945, 'min_child_weight': 4}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:55,026] Trial 18 finished with value: 1.1987882445690536 and parameters: {'n_features_ratio': 0.8294703407250098, 'n_estimators': 202, 'max_depth': 5, 'learning_rate': 0.03583635505049317, 'min_child_weight': 4}. Best is trial 17 with value: 1.1730457337357207.


Best trial: 17. Best value: 1.17305:  22%|██▏       | 22/100 [00:07<00:08,  9.56it/s]

[I 2026-02-06 14:53:55,157] Trial 19 finished with value: 1.1796555849544306 and parameters: {'n_features_ratio': 0.7153806647485877, 'n_estimators': 262, 'max_depth': 4, 'learning_rate': 0.026192795537745398, 'min_child_weight': 7}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:55,221] Trial 20 finished with value: 1.1745783653804864 and parameters: {'n_features_ratio': 0.5922857262124869, 'n_estimators': 154, 'max_depth': 3, 'learning_rate': 0.049817402180519464, 'min_child_weight': 4}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:55,284] Trial 21 finished with value: 1.174449232382968 and parameters: {'n_features_ratio': 0.5758348225999445, 'n_estimators': 151, 'max_depth': 3, 'learning_rate': 0.047501899407894184, 'min_child_weight': 4}. Best is trial 17 with value: 1.1730457337357207.


Best trial: 17. Best value: 1.17305:  26%|██▌       | 26/100 [00:08<00:05, 12.80it/s]

[I 2026-02-06 14:53:55,347] Trial 22 finished with value: 1.1741389721495346 and parameters: {'n_features_ratio': 0.5858204678300298, 'n_estimators': 150, 'max_depth': 3, 'learning_rate': 0.04646780669841367, 'min_child_weight': 3}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:55,411] Trial 23 finished with value: 1.1745563050388825 and parameters: {'n_features_ratio': 0.579072446428897, 'n_estimators': 153, 'max_depth': 3, 'learning_rate': 0.05023247015177962, 'min_child_weight': 3}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:55,466] Trial 24 finished with value: 1.1737075878920853 and parameters: {'n_features_ratio': 0.6940485968936864, 'n_estimators': 115, 'max_depth': 3, 'learning_rate': 0.042302710634488754, 'min_child_weight': 4}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:55,522] Trial 25 finished with value: 1.201239875890865 and parameters: {'n_features_ratio': 0.6908432053817839, 'n_estimators': 111, 

Best trial: 17. Best value: 1.17305:  28%|██▊       | 28/100 [00:08<00:05, 13.78it/s]

[I 2026-02-06 14:53:55,569] Trial 26 finished with value: 1.181731062972723 and parameters: {'n_features_ratio': 0.7113126038083438, 'n_estimators': 51, 'max_depth': 4, 'learning_rate': 0.06305066845201185, 'min_child_weight': 3}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:55,644] Trial 27 finished with value: 1.1887353763059287 and parameters: {'n_features_ratio': 0.6600308116115197, 'n_estimators': 75, 'max_depth': 5, 'learning_rate': 0.038660353010322446, 'min_child_weight': 2}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:55,724] Trial 28 finished with value: 1.1737734847246515 and parameters: {'n_features_ratio': 0.7568483800455776, 'n_estimators': 199, 'max_depth': 3, 'learning_rate': 0.028075398790569944, 'min_child_weight': 4}. Best is trial 17 with value: 1.1730457337357207.


Best trial: 17. Best value: 1.17305:  30%|███       | 30/100 [00:09<00:12,  5.56it/s]

[I 2026-02-06 14:53:56,466] Trial 29 finished with value: 1.2384657665235748 and parameters: {'n_features_ratio': 0.8331466327017072, 'n_estimators': 249, 'max_depth': 8, 'learning_rate': 0.013990143785840469, 'min_child_weight': 4}. Best is trial 17 with value: 1.1730457337357207.


Best trial: 17. Best value: 1.17305:  32%|███▏      | 32/100 [00:09<00:12,  5.40it/s]

[I 2026-02-06 14:53:56,785] Trial 30 finished with value: 1.2337078658476066 and parameters: {'n_features_ratio': 0.7548389051753102, 'n_estimators': 199, 'max_depth': 7, 'learning_rate': 0.027560025744314848, 'min_child_weight': 6}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:56,859] Trial 31 finished with value: 1.1762915080448109 and parameters: {'n_features_ratio': 0.5403376969605878, 'n_estimators': 197, 'max_depth': 3, 'learning_rate': 0.021671405154281797, 'min_child_weight': 3}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:56,917] Trial 32 finished with value: 1.1761939412680866 and parameters: {'n_features_ratio': 0.6323301803786276, 'n_estimators': 111, 'max_depth': 3, 'learning_rate': 0.08171928023521809, 'min_child_weight': 4}. Best is trial 17 with value: 1.1730457337357207.


Best trial: 17. Best value: 1.17305:  36%|███▌      | 36/100 [00:09<00:08,  7.72it/s]

[I 2026-02-06 14:53:57,006] Trial 33 finished with value: 1.1805126058342816 and parameters: {'n_features_ratio': 0.8139566329840936, 'n_estimators': 140, 'max_depth': 4, 'learning_rate': 0.06006874720158079, 'min_child_weight': 2}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:57,076] Trial 34 finished with value: 1.17334073259516 and parameters: {'n_features_ratio': 0.7251381742392039, 'n_estimators': 162, 'max_depth': 3, 'learning_rate': 0.040963994173701744, 'min_child_weight': 3}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:57,169] Trial 35 finished with value: 1.1764713107664886 and parameters: {'n_features_ratio': 0.7377911674284587, 'n_estimators': 164, 'max_depth': 4, 'learning_rate': 0.03147468282418632, 'min_child_weight': 5}. Best is trial 17 with value: 1.1730457337357207.


Best trial: 17. Best value: 1.17305:  38%|███▊      | 38/100 [00:09<00:07,  8.70it/s]

[I 2026-02-06 14:53:57,228] Trial 36 finished with value: 1.1758498700502593 and parameters: {'n_features_ratio': 0.7876455240573669, 'n_estimators': 115, 'max_depth': 3, 'learning_rate': 0.04042479413299397, 'min_child_weight': 2}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:57,332] Trial 37 finished with value: 1.2078199155217253 and parameters: {'n_features_ratio': 0.8558850929611829, 'n_estimators': 187, 'max_depth': 4, 'learning_rate': 0.08311122209776971, 'min_child_weight': 10}. Best is trial 17 with value: 1.1730457337357207.


Best trial: 40. Best value: 1.17252:  40%|████      | 40/100 [00:10<00:07,  7.80it/s]

[I 2026-02-06 14:53:57,598] Trial 38 finished with value: 1.2224650290044905 and parameters: {'n_features_ratio': 0.6814791397558863, 'n_estimators': 133, 'max_depth': 7, 'learning_rate': 0.030365305986821786, 'min_child_weight': 1}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:57,649] Trial 39 finished with value: 1.1999401538914678 and parameters: {'n_features_ratio': 0.724744287020659, 'n_estimators': 94, 'max_depth': 3, 'learning_rate': 0.02164133430338479, 'min_child_weight': 3}. Best is trial 17 with value: 1.1730457337357207.
[I 2026-02-06 14:53:57,732] Trial 40 finished with value: 1.1725209108282555 and parameters: {'n_features_ratio': 0.7415404778020983, 'n_estimators': 210, 'max_depth': 3, 'learning_rate': 0.03810389128009855, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.


Best trial: 40. Best value: 1.17252:  44%|████▍     | 44/100 [00:10<00:05,  9.52it/s]

[I 2026-02-06 14:53:57,823] Trial 41 finished with value: 1.1747547365635085 and parameters: {'n_features_ratio': 0.7814106799347741, 'n_estimators': 245, 'max_depth': 3, 'learning_rate': 0.0405168415197999, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:57,902] Trial 42 finished with value: 1.1736843065278122 and parameters: {'n_features_ratio': 0.7513722467974608, 'n_estimators': 189, 'max_depth': 3, 'learning_rate': 0.035438683498818284, 'min_child_weight': 4}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:57,983] Trial 43 finished with value: 1.1776030075802217 and parameters: {'n_features_ratio': 0.6739661612231678, 'n_estimators': 214, 'max_depth': 3, 'learning_rate': 0.05470406581683597, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.


Best trial: 40. Best value: 1.17252:  46%|████▌     | 46/100 [00:10<00:05,  9.75it/s]

[I 2026-02-06 14:53:58,074] Trial 44 finished with value: 1.1951538200333738 and parameters: {'n_features_ratio': 0.6241758869564259, 'n_estimators': 181, 'max_depth': 4, 'learning_rate': 0.07308370456264977, 'min_child_weight': 6}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:58,177] Trial 45 finished with value: 1.1733167706795158 and parameters: {'n_features_ratio': 0.7363265422849, 'n_estimators': 281, 'max_depth': 3, 'learning_rate': 0.03415237323777343, 'min_child_weight': 4}. Best is trial 40 with value: 1.1725209108282555.


Best trial: 40. Best value: 1.17252:  48%|████▊     | 48/100 [00:11<00:05,  9.34it/s]

[I 2026-02-06 14:53:58,310] Trial 46 finished with value: 1.1822327741207794 and parameters: {'n_features_ratio': 0.74533194244475, 'n_estimators': 270, 'max_depth': 4, 'learning_rate': 0.03188857640360016, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:58,413] Trial 47 finished with value: 1.1747539659908395 and parameters: {'n_features_ratio': 0.8014031449840647, 'n_estimators': 284, 'max_depth': 3, 'learning_rate': 0.03593990167024814, 'min_child_weight': 3}. Best is trial 40 with value: 1.1725209108282555.


Best trial: 40. Best value: 1.17252:  48%|████▊     | 48/100 [00:11<00:05,  9.34it/s]

[I 2026-02-06 14:53:58,538] Trial 48 finished with value: 1.1777701349917764 and parameters: {'n_features_ratio': 0.835869477457852, 'n_estimators': 237, 'max_depth': 4, 'learning_rate': 0.0248606008283289, 'min_child_weight': 7}. Best is trial 40 with value: 1.1725209108282555.


Best trial: 40. Best value: 1.17252:  50%|█████     | 50/100 [00:11<00:07,  6.93it/s]

[I 2026-02-06 14:53:58,875] Trial 49 finished with value: 1.3310798906849501 and parameters: {'n_features_ratio': 0.7779122573737636, 'n_estimators': 163, 'max_depth': 8, 'learning_rate': 0.10046592795246612, 'min_child_weight': 4}. Best is trial 40 with value: 1.1725209108282555.


Best trial: 40. Best value: 1.17252:  52%|█████▏    | 52/100 [00:11<00:06,  6.91it/s]

[I 2026-02-06 14:53:59,081] Trial 50 finished with value: 1.2527319205459067 and parameters: {'n_features_ratio': 0.41403252158022213, 'n_estimators': 284, 'max_depth': 6, 'learning_rate': 0.05680759291690704, 'min_child_weight': 6}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:59,185] Trial 51 finished with value: 1.1809554093039505 and parameters: {'n_features_ratio': 0.7038690288620253, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.04510312046238989, 'min_child_weight': 4}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:59,257] Trial 52 finished with value: 1.1728740336913588 and parameters: {'n_features_ratio': 0.7374340568128004, 'n_estimators': 166, 'max_depth': 3, 'learning_rate': 0.03427804626016782, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.


Best trial: 40. Best value: 1.17252:  56%|█████▌    | 56/100 [00:12<00:04,  9.23it/s]

[I 2026-02-06 14:53:59,335] Trial 53 finished with value: 1.172828536104953 and parameters: {'n_features_ratio': 0.7290748364377221, 'n_estimators': 188, 'max_depth': 3, 'learning_rate': 0.03537441331319731, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:59,418] Trial 54 finished with value: 1.1728386674548374 and parameters: {'n_features_ratio': 0.72079299290283, 'n_estimators': 211, 'max_depth': 3, 'learning_rate': 0.03352796005755181, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:59,509] Trial 55 finished with value: 1.174532137850684 and parameters: {'n_features_ratio': 0.8579822130452216, 'n_estimators': 210, 'max_depth': 3, 'learning_rate': 0.024518830362397336, 'min_child_weight': 6}. Best is trial 40 with value: 1.1725209108282555.


Best trial: 40. Best value: 1.17252:  58%|█████▊    | 58/100 [00:12<00:04,  9.34it/s]

[I 2026-02-06 14:53:59,637] Trial 56 finished with value: 1.1781343355893457 and parameters: {'n_features_ratio': 0.6290128772601731, 'n_estimators': 232, 'max_depth': 4, 'learning_rate': 0.01777099991057254, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:59,717] Trial 57 finished with value: 1.175494794737541 and parameters: {'n_features_ratio': 0.9500991263489873, 'n_estimators': 173, 'max_depth': 3, 'learning_rate': 0.0528623957504953, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.


Best trial: 59. Best value: 1.17193:  59%|█████▉    | 59/100 [00:12<00:04,  8.77it/s]

[I 2026-02-06 14:53:59,861] Trial 58 finished with value: 1.20807797288578 and parameters: {'n_features_ratio': 0.80817234489259, 'n_estimators': 258, 'max_depth': 4, 'learning_rate': 0.06886378185557446, 'min_child_weight': 5}. Best is trial 40 with value: 1.1725209108282555.
[I 2026-02-06 14:53:59,946] Trial 59 finished with value: 1.171926233871365 and parameters: {'n_features_ratio': 0.7658274251532022, 'n_estimators': 207, 'max_depth': 3, 'learning_rate': 0.034138285276878615, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  63%|██████▎   | 63/100 [00:12<00:03,  9.73it/s]

[I 2026-02-06 14:54:00,106] Trial 60 finished with value: 1.1942936854570507 and parameters: {'n_features_ratio': 0.7780382558950969, 'n_estimators': 211, 'max_depth': 5, 'learning_rate': 0.028909947878971148, 'min_child_weight': 9}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:00,182] Trial 61 finished with value: 1.1729167485822567 and parameters: {'n_features_ratio': 0.7314069642739041, 'n_estimators': 179, 'max_depth': 3, 'learning_rate': 0.03297219267131375, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:00,255] Trial 62 finished with value: 1.1722746025373127 and parameters: {'n_features_ratio': 0.6589971299219612, 'n_estimators': 179, 'max_depth': 3, 'learning_rate': 0.03641352655483736, 'min_child_weight': 8}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  65%|██████▌   | 65/100 [00:13<00:03, 10.60it/s]

[I 2026-02-06 14:54:00,334] Trial 63 finished with value: 1.1750325607640104 and parameters: {'n_features_ratio': 0.6538064814800328, 'n_estimators': 192, 'max_depth': 3, 'learning_rate': 0.023726810838815223, 'min_child_weight': 8}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:00,408] Trial 64 finished with value: 1.1737189009915294 and parameters: {'n_features_ratio': 0.6698244575765259, 'n_estimators': 182, 'max_depth': 3, 'learning_rate': 0.03723477940730784, 'min_child_weight': 8}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:00,497] Trial 65 finished with value: 1.17548488703346 and parameters: {'n_features_ratio': 0.6970846789487436, 'n_estimators': 219, 'max_depth': 3, 'learning_rate': 0.019304888233789883, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  67%|██████▋   | 67/100 [00:13<00:03, 10.78it/s]

[I 2026-02-06 14:54:00,588] Trial 66 finished with value: 1.1735935574290501 and parameters: {'n_features_ratio': 0.7673570420963048, 'n_estimators': 208, 'max_depth': 3, 'learning_rate': 0.03282070136323238, 'min_child_weight': 9}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:00,721] Trial 67 finished with value: 1.2842699184312272 and parameters: {'n_features_ratio': 0.7202008463776043, 'n_estimators': 230, 'max_depth': 4, 'learning_rate': 0.1921400358957507, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  69%|██████▉   | 69/100 [00:13<00:02, 10.33it/s]

[I 2026-02-06 14:54:00,799] Trial 68 finished with value: 1.173636656694641 and parameters: {'n_features_ratio': 0.6429058818526046, 'n_estimators': 180, 'max_depth': 3, 'learning_rate': 0.028119127748616288, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:00,887] Trial 69 finished with value: 1.1758811670161653 and parameters: {'n_features_ratio': 0.7070349363876375, 'n_estimators': 202, 'max_depth': 3, 'learning_rate': 0.02273246748244436, 'min_child_weight': 9}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  73%|███████▎  | 73/100 [00:13<00:02, 10.09it/s]

[I 2026-02-06 14:54:01,053] Trial 70 finished with value: 1.4043711632489397 and parameters: {'n_features_ratio': 0.6125549842785503, 'n_estimators': 168, 'max_depth': 6, 'learning_rate': 0.2942418954462013, 'min_child_weight': 8}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:01,133] Trial 71 finished with value: 1.1731637399795356 and parameters: {'n_features_ratio': 0.7634166559866336, 'n_estimators': 156, 'max_depth': 3, 'learning_rate': 0.04521071528743179, 'min_child_weight': 6}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:01,219] Trial 72 finished with value: 1.173133120946245 and parameters: {'n_features_ratio': 0.8209926197322742, 'n_estimators': 141, 'max_depth': 3, 'learning_rate': 0.05030844535052955, 'min_child_weight': 6}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  75%|███████▌  | 75/100 [00:14<00:02, 10.55it/s]

[I 2026-02-06 14:54:01,300] Trial 73 finished with value: 1.1724679649735499 and parameters: {'n_features_ratio': 0.738024522897103, 'n_estimators': 193, 'max_depth': 3, 'learning_rate': 0.037213105641348214, 'min_child_weight': 6}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:01,389] Trial 74 finished with value: 1.1719884959285185 and parameters: {'n_features_ratio': 0.7329938372146972, 'n_estimators': 193, 'max_depth': 3, 'learning_rate': 0.038003148228717316, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:01,469] Trial 75 finished with value: 1.1721675598224772 and parameters: {'n_features_ratio': 0.6873192697968084, 'n_estimators': 193, 'max_depth': 3, 'learning_rate': 0.037972254635174646, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  77%|███████▋  | 77/100 [00:14<00:02, 11.02it/s]

[I 2026-02-06 14:54:01,552] Trial 76 finished with value: 1.1740378358149397 and parameters: {'n_features_ratio': 0.6751415459550403, 'n_estimators': 192, 'max_depth': 3, 'learning_rate': 0.03829121745619338, 'min_child_weight': 8}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:01,674] Trial 77 finished with value: 1.188139446513142 and parameters: {'n_features_ratio': 0.6941182121695647, 'n_estimators': 220, 'max_depth': 4, 'learning_rate': 0.04277432835924314, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  79%|███████▉  | 79/100 [00:14<00:01, 10.61it/s]

[I 2026-02-06 14:54:01,757] Trial 78 finished with value: 1.1731108726795347 and parameters: {'n_features_ratio': 0.6043836273654273, 'n_estimators': 205, 'max_depth': 3, 'learning_rate': 0.026822807763680543, 'min_child_weight': 6}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:01,850] Trial 79 finished with value: 1.1736901453540027 and parameters: {'n_features_ratio': 0.794837320483316, 'n_estimators': 194, 'max_depth': 3, 'learning_rate': 0.029753703414263193, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  83%|████████▎ | 83/100 [00:14<00:01, 11.19it/s]

[I 2026-02-06 14:54:01,939] Trial 80 finished with value: 1.1738298867017345 and parameters: {'n_features_ratio': 0.7149870864440784, 'n_estimators': 214, 'max_depth': 3, 'learning_rate': 0.04174981415352898, 'min_child_weight': 8}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:02,020] Trial 81 finished with value: 1.1730562959283242 and parameters: {'n_features_ratio': 0.7385014482983565, 'n_estimators': 187, 'max_depth': 3, 'learning_rate': 0.037270328078527364, 'min_child_weight': 5}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:02,099] Trial 82 finished with value: 1.1734560945213288 and parameters: {'n_features_ratio': 0.7537477448446255, 'n_estimators': 175, 'max_depth': 3, 'learning_rate': 0.03447336861538548, 'min_child_weight': 6}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  85%|████████▌ | 85/100 [00:15<00:01, 11.32it/s]

[I 2026-02-06 14:54:02,179] Trial 83 finished with value: 1.1745132445237003 and parameters: {'n_features_ratio': 0.6865126231342809, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.04748952304554696, 'min_child_weight': 6}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:02,272] Trial 84 finished with value: 1.1732934216952764 and parameters: {'n_features_ratio': 0.6563012246832477, 'n_estimators': 228, 'max_depth': 3, 'learning_rate': 0.03125400916985957, 'min_child_weight': 5}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:02,352] Trial 85 finished with value: 1.1736053696022513 and parameters: {'n_features_ratio': 0.7652215388741181, 'n_estimators': 196, 'max_depth': 3, 'learning_rate': 0.026133167758642512, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  87%|████████▋ | 87/100 [00:15<00:01, 11.12it/s]

[I 2026-02-06 14:54:02,459] Trial 86 finished with value: 1.177304505949491 and parameters: {'n_features_ratio': 0.727919384811311, 'n_estimators': 170, 'max_depth': 4, 'learning_rate': 0.04063334361363298, 'min_child_weight': 6}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:02,548] Trial 87 finished with value: 1.1775498084524374 and parameters: {'n_features_ratio': 0.7115041621505652, 'n_estimators': 205, 'max_depth': 3, 'learning_rate': 0.06012057432120409, 'min_child_weight': 5}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  89%|████████▉ | 89/100 [00:15<00:01, 10.94it/s]

[I 2026-02-06 14:54:02,649] Trial 88 finished with value: 1.1727484364227017 and parameters: {'n_features_ratio': 0.7488751807478246, 'n_estimators': 239, 'max_depth': 3, 'learning_rate': 0.038036269675452115, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:02,745] Trial 89 finished with value: 1.1797432274036574 and parameters: {'n_features_ratio': 0.6647132527950061, 'n_estimators': 243, 'max_depth': 3, 'learning_rate': 0.05188812672390738, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  93%|█████████▎| 93/100 [00:15<00:00, 10.84it/s]

[I 2026-02-06 14:54:02,861] Trial 90 finished with value: 1.1826473026169104 and parameters: {'n_features_ratio': 0.747063104762745, 'n_estimators': 217, 'max_depth': 4, 'learning_rate': 0.03822956798382703, 'min_child_weight': 8}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:02,946] Trial 91 finished with value: 1.172462494327541 and parameters: {'n_features_ratio': 0.7846111484289304, 'n_estimators': 186, 'max_depth': 3, 'learning_rate': 0.03458513676923639, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:03,029] Trial 92 finished with value: 1.1727228003001078 and parameters: {'n_features_ratio': 0.7803765193297146, 'n_estimators': 186, 'max_depth': 3, 'learning_rate': 0.04441100499223386, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  95%|█████████▌| 95/100 [00:15<00:00, 10.96it/s]

[I 2026-02-06 14:54:03,112] Trial 93 finished with value: 1.172963621020406 and parameters: {'n_features_ratio': 0.793548214139101, 'n_estimators': 185, 'max_depth': 3, 'learning_rate': 0.04418845684330028, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:03,207] Trial 94 finished with value: 1.17440155539065 and parameters: {'n_features_ratio': 0.7795897584237711, 'n_estimators': 174, 'max_depth': 3, 'learning_rate': 0.0300617549557806, 'min_child_weight': 8}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193:  97%|█████████▋| 97/100 [00:16<00:00, 11.05it/s]

[I 2026-02-06 14:54:03,299] Trial 95 finished with value: 1.1753192128008743 and parameters: {'n_features_ratio': 0.8075290979586358, 'n_estimators': 225, 'max_depth': 3, 'learning_rate': 0.048451894707520066, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:03,384] Trial 96 finished with value: 1.1730926408943272 and parameters: {'n_features_ratio': 0.766955992947433, 'n_estimators': 191, 'max_depth': 3, 'learning_rate': 0.03903221347326957, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:03,480] Trial 97 finished with value: 1.1753847473892065 and parameters: {'n_features_ratio': 0.8512826464708221, 'n_estimators': 177, 'max_depth': 3, 'learning_rate': 0.05609099669004492, 'min_child_weight': 8}. Best is trial 59 with value: 1.171926233871365.


Best trial: 59. Best value: 1.17193: 100%|██████████| 100/100 [00:16<00:00,  6.13it/s]


[I 2026-02-06 14:54:03,562] Trial 98 finished with value: 1.1733658933306266 and parameters: {'n_features_ratio': 0.7499010496285066, 'n_estimators': 159, 'max_depth': 3, 'learning_rate': 0.03590114953987486, 'min_child_weight': 7}. Best is trial 59 with value: 1.171926233871365.
[I 2026-02-06 14:54:03,650] Trial 99 finished with value: 1.1739251140729086 and parameters: {'n_features_ratio': 0.6831457420793348, 'n_estimators': 186, 'max_depth': 3, 'learning_rate': 0.04333912772775055, 'min_child_weight': 6}. Best is trial 59 with value: 1.171926233871365.
  Best CV Poisson Deviance: 1.1719
  Features: 15/20 (76.6%)
  Params: {'n_estimators': 207, 'max_depth': 3, 'learning_rate': 0.034138285276878615, 'min_child_weight': 7}

------------------------------------------------------------
PHASE 2: Evaluation on Held-Out Test Set
------------------------------------------------------------

Model           Metric          Test Value   MAE         
--------------------------------------------

  self.df['opponent_norm'] = self.df['opponent'].apply(normalize_name)


  Mapped pred_team_goals to 61,662/61,662 player rows
  Mean pred_team_goals: 1.366
Training GoalsModel on 61,649 samples (25 features, tuned selection)...
  Target mean: 0.1074
  MAE: 0.1927
Training AssistsModel on 61,649 samples (25 features, tuned selection)...
  Target mean: 0.0762
  MAE: 0.1466
Training DefconModel on 61,649 samples...
  Mean defcon/90: 9.79
  MAE: 4.15
Training BonusModel (Monte Carlo, 1000 sims)...
  Estimating baseline BPS from stats (no actual BPS data)
Training BaselineBPSModel on 44075 samples...
  Features used: 30
  Avg baseline BPS: 18.0
  MAE: 0.67
  Loaded FPL availability for 1603 players

PREDICTING GW25 (2025/2026)
Found 501 active players with historical data
  Filtered out 80 unavailable players (injured/suspended)
Matched 421 players to GW25 fixtures

Saved predictions to: data\predictions\gw25_2025-2026.csv
Total players: 421


In [9]:
# =============================================================================
# STEP 3: View Top Players
# =============================================================================
# Top 20 by expected points with full prediction breakdown
display_cols = [
    'player_name', 'team', 'fpl_position', 'opponent', 'is_home',
    'pred_minutes', 'pred_exp_goals', 'pred_exp_assists', 
    'pred_cs_prob', 'pred_2plus_conceded',
    'pred_defcon_prob', 'pred_bonus', 'exp_total_pts'
]
available_cols = [c for c in display_cols if c in predictions.columns]
predictions.nlargest(20, 'exp_total_pts')[available_cols].round(2)

Unnamed: 0,player_name,team,fpl_position,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,pred_cs_prob,pred_2plus_conceded,pred_defcon_prob,pred_bonus,exp_total_pts
144,Marcos Senesi,Bournemouth,DEF,Aston Villa,1,88.0,0.05,0.1,0.37,0.26,0.74,0.32,5.6
50,Bruno Fernandes,Manchester United,MID,Spurs,1,87.18,0.3,0.26,0.35,0.28,0.18,0.62,5.6
38,Mohamed Salah,Liverpool,MID,Man City,1,86.279999,0.38,0.18,0.27,0.37,0.0,0.6,5.3
311,Adrien Truffert,Bournemouth,DEF,Aston Villa,1,88.919998,0.03,0.06,0.37,0.26,0.63,0.48,5.3
233,Bukayo Saka,Arsenal,MID,Sunderland,1,80.389999,0.31,0.22,0.38,0.25,0.01,0.52,5.16
210,Bryan Mbeumo,Manchester United,MID,Spurs,1,88.0,0.37,0.14,0.35,0.28,0.01,0.49,5.15
119,Erling Haaland,Manchester City,FWD,Liverpool,0,83.610001,0.47,0.09,0.29,0.35,0.0,0.92,5.08
321,Lewis Hall,Newcastle United,DEF,Brentford,1,84.150002,0.07,0.14,0.31,0.33,0.46,0.41,5.01
281,Riccardo Calafiori,Arsenal,DEF,Sunderland,1,84.559998,0.11,0.05,0.38,0.25,0.26,0.38,4.95
80,Nordi Mukiele,Sunderland,DEF,Arsenal,0,88.389999,0.03,0.09,0.32,0.32,0.69,0.18,4.91


In [11]:
# =============================================================================
# Top 10 by Position
# =============================================================================
import pandas as pd

# --- FWD & MID: ranked by expected goals + assists ---
for pos in ['FWD', 'MID']:
    df_pos = predictions[predictions['fpl_position'] == pos].copy()
    df_pos['g+a'] = df_pos['pred_exp_goals'] + df_pos['pred_exp_assists']
    cols = ['player_name', 'team', 'opponent', 'is_home',
            'pred_minutes', 'pred_exp_goals', 'pred_exp_assists', 'g+a', 'exp_total_pts']
    available = [c for c in cols if c in df_pos.columns]
    print(f"\n{'='*70}")
    print(f"  TOP 10 {pos} (ranked by expected G+A)")
    print(f"{'='*70}")
    display(df_pos.nlargest(10, 'g+a')[available].round(2))

# --- DEF: ranked by expected total pts, showing CS/DC/2+ conceded + G+A ---
df_def = predictions[predictions['fpl_position'] == 'DEF'].copy()
df_def['g+a'] = df_def['pred_exp_goals'] + df_def['pred_exp_assists']
cols_def = ['player_name', 'team', 'opponent', 'is_home',
            'pred_minutes', 'pred_exp_goals', 'pred_exp_assists', 'g+a',
            'pred_cs_prob', 'pred_2plus_conceded', 'pred_defcon_prob', 'exp_total_pts']
available_def = [c for c in cols_def if c in df_def.columns]
print(f"\n{'='*70}")
print(f"  TOP 10 DEF (ranked by exp total pts)")
print(f"{'='*70}")
display(df_def.nlargest(10, 'exp_total_pts')[available_def].round(2))

# --- GK ---
df_gk = predictions[predictions['fpl_position'] == 'GK'].copy()
cols_gk = ['player_name', 'team', 'opponent', 'is_home',
           'pred_minutes', 'pred_cs_prob', 'pred_2plus_conceded', 'pred_goals_against', 'exp_total_pts']
available_gk = [c for c in cols_gk if c in df_gk.columns]
print(f"\n{'='*70}")
print(f"  TOP 10 GK (ranked by exp total pts)")
print(f"{'='*70}")
display(df_gk.nlargest(10, 'exp_total_pts')[available_gk].round(2))


  TOP 10 FWD (ranked by expected G+A)


Unnamed: 0,player_name,team,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,g+a,exp_total_pts
119,Erling Haaland,Manchester City,Liverpool,0,83.610001,0.47,0.09,0.56,5.08
63,Jarrod Bowen,West Ham United,Burnley,0,88.900002,0.36,0.13,0.5,4.74
37,Raul Jiménez,Fulham,Everton,1,81.440002,0.35,0.12,0.46,4.57
353,Igor Thiago,Brentford,Newcastle,0,88.0,0.41,0.05,0.45,4.68
68,Ollie Watkins,Aston Villa,Bournemouth,0,84.849998,0.35,0.08,0.42,4.34
319,Hugo Ekitike,Liverpool,Man City,1,75.480003,0.31,0.11,0.42,4.28
278,Evanilson,Bournemouth,Aston Villa,1,83.269997,0.33,0.08,0.41,4.32
83,Dominic Calvert-Lewin,Leeds United,Nott'm Forest,1,85.0,0.34,0.06,0.4,4.34
297,Rodrigo Muniz,Fulham,Everton,1,64.889999,0.32,0.06,0.39,4.08
246,Joao Pedro,Chelsea,Wolves,0,78.449997,0.27,0.08,0.35,3.86



  TOP 10 MID (ranked by expected G+A)


Unnamed: 0,player_name,team,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,g+a,exp_total_pts
38,Mohamed Salah,Liverpool,Man City,1,86.279999,0.38,0.18,0.56,5.3
50,Bruno Fernandes,Manchester United,Spurs,1,87.18,0.3,0.26,0.56,5.6
233,Bukayo Saka,Arsenal,Sunderland,1,80.389999,0.31,0.22,0.54,5.16
210,Bryan Mbeumo,Manchester United,Spurs,1,88.0,0.37,0.14,0.51,5.15
58,Harry Wilson,Fulham,Everton,1,84.610001,0.25,0.17,0.42,4.34
185,Kaoru Mitoma,Brighton and Hove Albion,Crystal Palace,1,84.230003,0.25,0.16,0.41,4.55
262,Amad,Manchester United,Spurs,1,88.0,0.23,0.17,0.41,4.61
160,Phil Foden,Manchester City,Liverpool,0,80.129997,0.23,0.18,0.41,4.34
158,Cody Gakpo,Liverpool,Man City,1,79.93,0.29,0.11,0.4,4.31
105,Samuel Chukwueze,Fulham,Everton,1,51.119999,0.19,0.21,0.39,2.8



  TOP 10 DEF (ranked by exp total pts)


Unnamed: 0,player_name,team,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,g+a,pred_cs_prob,pred_2plus_conceded,pred_defcon_prob,exp_total_pts
144,Marcos Senesi,Bournemouth,Aston Villa,1,88.0,0.05,0.1,0.15,0.37,0.26,0.74,5.6
311,Adrien Truffert,Bournemouth,Aston Villa,1,88.919998,0.03,0.06,0.09,0.37,0.26,0.63,5.3
321,Lewis Hall,Newcastle United,Brentford,1,84.150002,0.07,0.14,0.21,0.31,0.33,0.46,5.01
281,Riccardo Calafiori,Arsenal,Sunderland,1,84.559998,0.11,0.05,0.16,0.38,0.25,0.26,4.95
80,Nordi Mukiele,Sunderland,Arsenal,0,88.389999,0.03,0.09,0.12,0.32,0.32,0.69,4.91
363,Diego Gomez,Brighton and Hove Albion,Crystal Palace,1,71.129997,0.18,0.07,0.26,0.35,0.28,0.09,4.87
301,Malick Thiaw,Newcastle United,Brentford,1,88.0,0.08,0.03,0.11,0.31,0.33,0.62,4.86
241,Daniel Ballard,Sunderland,Arsenal,0,85.940002,0.09,0.03,0.12,0.32,0.32,0.54,4.78
382,Max Alleyne,Manchester City,Liverpool,0,87.470001,0.07,0.06,0.13,0.29,0.35,0.55,4.75
16,Kieran Trippier,Newcastle United,Brentford,1,88.0,0.03,0.21,0.24,0.31,0.33,0.34,4.7



  TOP 10 GK (ranked by exp total pts)


Unnamed: 0,player_name,team,opponent,is_home,pred_minutes,pred_cs_prob,pred_2plus_conceded,pred_goals_against,exp_total_pts
260,Djordje Petrovic,Bournemouth,Aston Villa,1,89.410004,0.37,0.26,0.99,3.58
73,David Raya,Arsenal,Sunderland,1,89.529999,0.38,0.25,0.96,3.57
189,Altay Bayindir,Manchester United,Spurs,1,89.150002,0.35,0.28,1.04,3.4
312,Senne Lammens,Manchester United,Spurs,1,89.389999,0.35,0.28,1.04,3.39
299,Bart Verbruggen,Brighton and Hove Albion,Crystal Palace,1,89.529999,0.35,0.28,1.05,3.36
121,Aaron Ramsdale,Newcastle United,Brentford,1,88.190002,0.31,0.33,1.18,3.3
296,Robin Roefs,Sunderland,Arsenal,0,89.510002,0.32,0.32,1.14,3.23
35,Nick Pope,Newcastle United,Brentford,1,88.989998,0.31,0.33,1.18,3.2
115,Dean Henderson,Crystal Palace,Brighton,0,89.510002,0.3,0.34,1.21,3.1
86,Gianluigi Donnarumma,Manchester City,Liverpool,0,89.410004,0.29,0.35,1.23,3.06


In [5]:
predictions[predictions['player_name'].str.contains('Chal', case=False, na=False)][available_cols].round(2)


Unnamed: 0,player_name,team,fpl_position,opponent,is_home,pred_minutes,pred_exp_goals,pred_exp_assists,pred_team_goals,pred_cs_prob,pred_2plus_conceded,pred_defcon_prob,pred_bonus,exp_total_pts
153,Trevoh Chalobah,Chelsea,DEF,Wolves,0,88.169998,0.05,0.02,1.3,0.27,0.38,0.54,0.06,4.17


In [6]:
# =============================================================================
# Arsenal xG Analysis: Actual Last 5 Games vs Computed Rolling Features
# =============================================================================

# Get Arsenal's last 5 games from raw match data
arsenal_matches = pipeline.df[
    (pipeline.df['team'].str.contains('Arsenal', case=False, na=False)) &
    (pipeline.df['season'] == '2025/2026')
].copy()

# Get one row per match (team-level aggregation)
arsenal_team = arsenal_matches.groupby(['team', 'opponent', 'season', 'gameweek', 'is_home']).agg({
    'xg': 'sum',  # Team's total xG for the match
    'goals': 'sum',  # Team's goals scored
}).reset_index().sort_values('gameweek', ascending=False)

# Get xGA (opponent's xG against Arsenal) by looking up opponent's xG in that match
# We need to find what the opponent generated against Arsenal
opp_stats = pipeline.df.groupby(['team', 'season', 'gameweek']).agg({
    'xg': 'sum', 
    'goals': 'sum'
}).reset_index()
opp_stats = opp_stats.rename(columns={'team': 'opponent', 'xg': 'xga', 'goals': 'goals_conceded'})

arsenal_team = arsenal_team.merge(opp_stats, on=['opponent', 'season', 'gameweek'], how='left')

print("=" * 70)
print("ARSENAL'S LAST 5 GAMES - ACTUAL xG FOR AND AGAINST")
print("=" * 70)
last5 = arsenal_team.head(5)[['gameweek', 'opponent', 'is_home', 'xg', 'goals', 'xga', 'goals_conceded']]
last5.columns = ['GW', 'Opponent', 'Home', 'xG For', 'Goals', 'xGA (opp xG)', 'Conceded']
print(last5.to_string(index=False))
print(f"\n--- Last 5 Game Averages (what roll5 should approximate) ---")
print(f"  Avg xG For:     {last5['xG For'].mean():.3f}")
print(f"  Avg xGA:        {last5['xGA (opp xG)'].mean():.3f}")
print(f"  Avg Goals:      {last5['Goals'].mean():.2f}")
print(f"  Avg Conceded:   {last5['Conceded'].mean():.2f}")

# Now show the rolling features from predictions
print("\n" + "=" * 70)
print("COMPUTED ROLLING FEATURES (grabbed by model for GW25 prediction)")
print("=" * 70)

# Get an Arsenal player from predictions
arsenal_pred = predictions[predictions['team'].str.contains('Arsenal', case=False, na=False)].iloc[0]
print(f"\nArsenal player sampled: {arsenal_pred['player_name']}")

# Show team xG features (offensive)
print(f"\n--- Team xG Rolling (Arsenal's attacking output) ---")
for window in [1, 3, 5, 10, 30]:
    col = f'team_xg_roll{window}'
    if col in arsenal_pred.index:
        print(f"  {col}: {arsenal_pred[col]:.3f}")

# Show team xGA features (defensive - what opponents generate against Arsenal)
print(f"\n--- Team xGA Rolling (what opponents generate vs Arsenal) ---")
for window in [1, 3, 5, 10, 30]:
    col = f'team_xga_roll{window}'
    if col in arsenal_pred.index:
        print(f"  {col}: {arsenal_pred[col]:.3f}")

# Show goals conceded
print(f"\n--- Team Goals Conceded Rolling ---")
for window in [1, 3, 5, 10, 30]:
    col = f'team_conceded_roll{window}'
    if col in arsenal_pred.index:
        print(f"  {col}: {arsenal_pred[col]:.3f}")

ARSENAL'S LAST 5 GAMES - ACTUAL xG FOR AND AGAINST
  GW          Opponent  Home  xG For  Goals  xGA (opp xG)  Conceded
24.0      Leeds United     0    2.20    4.0          0.15       0.0
23.0 Manchester United     1    1.20    1.0          0.70       3.0
22.0 Nottingham Forest     0    2.10    0.0          0.34       0.0
21.0         Liverpool     1    0.57    0.0          0.31       0.0
20.0   AFC Bournemouth     0    1.13    3.0           NaN       NaN

--- Last 5 Game Averages (what roll5 should approximate) ---
  Avg xG For:     1.440
  Avg xGA:        0.375
  Avg Goals:      1.60
  Avg Conceded:   0.75

COMPUTED ROLLING FEATURES (grabbed by model for GW25 prediction)

Arsenal player sampled: Christian Nørgaard

--- Team xG Rolling (Arsenal's attacking output) ---
  team_xg_roll5: 1.608
  team_xg_roll10: 1.826

--- Team xGA Rolling (what opponents generate vs Arsenal) ---
  team_xga_roll1: 0.700
  team_xga_roll3: 0.450
  team_xga_roll5: 1.218
  team_xga_roll10: 1.014
  team_xga_rol

In [7]:
# =============================================================================
# DIAGNOSTIC: Investigate xGA computation bug
# =============================================================================

# Check team name variations in the data
print("Team name variations:")
teams = pipeline.df['team'].unique()
for t in sorted(teams):
    if 'bournemouth' in t.lower() or 'arsenal' in t.lower():
        print(f"  '{t}'")

print("\nOpponent name variations:")
opponents = pipeline.df['opponent'].unique()
for o in sorted(opponents):
    if 'bournemouth' in o.lower() or 'arsenal' in o.lower():
        print(f"  '{o}'")

# Check Arsenal vs Bournemouth game from both perspectives
print("\n" + "=" * 70)
print("Arsenal vs Bournemouth GW20 - BOTH TEAM PERSPECTIVES")
print("=" * 70)

# Arsenal's view (team=Arsenal)
arsenal_view = pipeline.df[
    (pipeline.df['team'].str.contains('Arsenal', case=False)) &
    (pipeline.df['opponent'].str.contains('Bournemouth', case=False)) &
    (pipeline.df['gameweek'] == 20) &
    (pipeline.df['season'] == '2025/2026')
].groupby(['team', 'opponent', 'gameweek']).agg({'xg': 'sum', 'goals': 'sum'}).reset_index()
print("\nArsenal's perspective (team=Arsenal):")
print(arsenal_view)

# Bournemouth's view (team=Bournemouth)  
bournemouth_view = pipeline.df[
    (pipeline.df['team'].str.contains('Bournemouth', case=False)) &
    (pipeline.df['opponent'].str.contains('Arsenal', case=False)) &
    (pipeline.df['gameweek'] == 20) &
    (pipeline.df['season'] == '2025/2026')
].groupby(['team', 'opponent', 'gameweek']).agg({'xg': 'sum', 'goals': 'sum'}).reset_index()
print("\nBournemouth's perspective (team=Bournemouth):")
print(bournemouth_view)

# The xGA for Arsenal SHOULD be Bournemouth's xG (what Bournemouth generated against Arsenal)
if len(bournemouth_view) > 0:
    print(f"\n>>> Arsenal's xGA for GW20 should be: {bournemouth_view['xg'].values[0]:.3f}")
else:
    print("\n>>> ERROR: No Bournemouth perspective found! Team name mismatch?")

Team name variations:
  'Arsenal'
  'Bournemouth'

Opponent name variations:
  'AFC Bournemouth'
  'Arsenal'

Arsenal vs Bournemouth GW20 - BOTH TEAM PERSPECTIVES

Arsenal's perspective (team=Arsenal):
      team         opponent  gameweek    xg  goals
0  Arsenal  AFC Bournemouth      20.0  1.13    3.0

Bournemouth's perspective (team=Bournemouth):
          team opponent  gameweek    xg  goals
0  Bournemouth  Arsenal      20.0  1.43    2.0

>>> Arsenal's xGA for GW20 should be: 1.430


In [8]:
# Quick check: what format do team names actually use?
print("Sample team names:")
print(pipeline.df['team'].value_counts().head(10))
print("\nSample opponent names:")
print(pipeline.df['opponent'].value_counts().head(10))

Sample team names:
team
Brighton and Hove Albion    3152
Liverpool                   3123
Tottenham Hotspur           3114
Manchester United           3101
Wolverhampton Wanderers     3098
Chelsea                     3096
Newcastle United            3093
Aston Villa                 3090
Arsenal                     3084
Crystal Palace              3008
Name: count, dtype: int64

Sample opponent names:
opponent
Manchester City            3133
Wolverhampton Wanderers    3102
Tottenham Hotspur          3093
Brighton & Hove Albion     3091
Aston Villa                3091
Chelsea                    3086
Crystal Palace             3085
Newcastle United           3076
West Ham United            3076
Manchester United          3076
Name: count, dtype: int64
