In [2]:
import pandas as pd
import numpy as np

# Notebook display settings: wider console output, and cap printed frames at 50 rows.
pd.set_option(
    "display.width", 140,
    "display.max_rows", 50,
)

# ---- Tiny synthetic dataset: 12 matches, three per month from January to April 2025 ----
# One tuple per match: (date, home_team, away_team, home_score, away_score).
fixtures = [
    ("2025-01-05", "Spain",     "France",    2, 1),
    ("2025-01-12", "Germany",   "Spain",     1, 2),
    ("2025-01-19", "Brazil",    "Argentina", 3, 2),
    ("2025-02-02", "France",    "Italy",     0, 1),
    ("2025-02-09", "Italy",     "Germany",   2, 1),
    ("2025-02-16", "Argentina", "Brazil",    1, 2),
    ("2025-03-02", "Spain",     "France",    1, 0),
    ("2025-03-09", "Germany",   "Italy",     2, 1),
    ("2025-03-16", "Brazil",    "Argentina", 2, 2),
    ("2025-04-06", "France",    "Spain",     0, 2),
    ("2025-04-13", "Italy",     "Germany",   1, 2),
    ("2025-04-20", "Argentina", "Brazil",    3, 1),
]

matches = pd.DataFrame(
    fixtures,
    columns=["date", "home_team", "away_team", "home_score", "away_score"],
)
# Sequential ids 1..12 go in as the first column.
matches.insert(0, "match_id", np.arange(1, 13))

matches.head()

# Build the time-indexed working frame in one chain; `matches` itself is left untouched.
df = (
    matches
    # Real datetimes; strings that cannot be parsed become NaT instead of raising.
    .assign(date=lambda frame: pd.to_datetime(frame["date"], errors="coerce"))
    # Time-series friendly: the date is the index, in chronological order.
    .set_index("date")
    .sort_index()
    # Quick feature: total goals in the match (home + away).
    .assign(total_goals=lambda frame: frame["home_score"] + frame["away_score"])
)

df.head()

# Sum total goals per calendar month; keep it as a DataFrame with a nice column name.
# The bins are requested with a MonthEnd offset object rather than the "M" string
# alias: pandas deprecated that alias in favour of "ME" (it emits a FutureWarning),
# whereas the offset object selects the same month-end bins without depending on
# which alias spelling the installed pandas accepts.
monthly_goals = (
    df["total_goals"]
    .resample(pd.offsets.MonthEnd())
    .sum()
    .to_frame(name="goals_monthly")
)

monthly_goals

# Reshape to "long" form: every match contributes two rows, one per participating team.
base = df.reset_index()

# Shared column layout for both perspectives:
# team / gf (goals for) / opp_team / ga (goals against).
team_view_cols = ["date", "team", "gf", "opp_team", "ga"]

# Home side's view of each match.
home = base[["date", "home_team", "home_score", "away_team", "away_score"]].set_axis(team_view_cols, axis=1)

# Away side's view of the same matches (for/against swapped).
away = base[["date", "away_team", "away_score", "home_team", "home_score"]].set_axis(team_view_cols, axis=1)

# Stack both views and order rows by team, then chronologically, with a fresh 0..n-1 index.
long = pd.concat([home, away], ignore_index=True).sort_values(["team", "date"], ignore_index=True)
long.head(10)

# Rows must be ordered by team and date so each rolling window walks a team's matches in time order.
long = long.sort_values(["team", "date"], ignore_index=True)

# Per-team rolling mean of goals scored over up to 3 matches
# (min_periods=1, so a team's first rows average over the matches seen so far).
gf_rolling_mean = long.groupby("team")["gf"].rolling(window=3, min_periods=1).mean()

# The grouped result carries an extra outer "team" index level; dropping it leaves the
# original row labels so the values align back onto `long`.
long["gf_ma3"] = gf_rolling_mean.droplevel(0)
long.head(10)

# Per-team comparisons with the previous row of the same team.
gf_by_team = long.groupby("team")["gf"]

long["gf_lag1"] = gf_by_team.shift(1)        # goals scored in the team's previous row
long["gf_diff1"] = gf_by_team.diff(1)        # absolute change versus the previous row
long["gf_pct1"] = gf_by_team.pct_change(1)   # relative change versus the previous row

long.head(10)

# Classify each row from the point of view of `team` by comparing goals for and against.
long["result"] = np.select(
    [
        long["gf"] > long["ga"],
        long["gf"] < long["ga"],
    ],
    ["Win", "Loss"],
    default="Draw",
)

# League points: 3 for a win, 1 for a draw, 0 for a loss.
# (The draw is matched explicitly so that a loss falls through to the 0 default;
# matching "Loss" here would hand the draw point to the losing side.)
long["points"] = np.select(
    [
        long["result"] == "Win",
        long["result"] == "Draw",
    ],
    [3, 1],
    default=0,
)

# Form: sum of points over up to 5 consecutive rows of the same team
# (min_periods=1, so early rows sum over however many rows exist so far).
# The outer "team" index level added by the grouped rolling is dropped so the
# values align back onto `long` by row label.
long["form5"] = (
    long
    .groupby("team")["points"]
    .rolling(window=5, min_periods=1)
    .sum()
    .reset_index(level=0, drop=True)
)

long[["team", "date", "gf", "ga", "result", "points", "form5"]].head(12)

def _consecutive_wins(wins):
    """Return, for a boolean win Series, the running count of wins since the last non-win.

    `(~wins).cumsum()` increases by one at every non-win, so it labels each stretch
    that starts at a non-win. A cumulative sum of the booleans within each label then
    counts the wins accumulated since that non-win (the non-win row itself contributes 0).
    """
    stretch_id = (~wins).cumsum()
    return wins.groupby(stretch_id).cumsum()


# Boolean flag: did `team` score more than it conceded?
long["is_win"] = long["gf"].gt(long["ga"])

# Current running win streak, computed separately for each team in row order.
long["win_streak"] = long.groupby("team")["is_win"].transform(_consecutive_wins)

# Longest streak each team reached.
streak_peak = long.groupby("team")["win_streak"].max()
best_streak = streak_peak.reset_index(name="best_win_streak")

best_streak, long[["team", "date", "is_win", "win_streak"]].head(12)








  monthly_goals = df["total_goals"].resample("M").sum().to_frame(name="goals_monthly")


(        team  best_win_streak
 0  Argentina                1
 1     Brazil                2
 2     France                0
 3    Germany                2
 4      Italy                2
 5      Spain                4,
          team       date  is_win  win_streak
 0   Argentina 2025-01-19   False           0
 1   Argentina 2025-02-16   False           0
 2   Argentina 2025-03-16   False           0
 3   Argentina 2025-04-20    True           1
 4      Brazil 2025-01-19    True           1
 5      Brazil 2025-02-16    True           2
 6      Brazil 2025-03-16   False           0
 7      Brazil 2025-04-20   False           0
 8      France 2025-01-05   False           0
 9      France 2025-02-02   False           0
 10     France 2025-03-02   False           0
 11     France 2025-04-06   False           0)