# Loading

In [1]:
import requests
import json
import numpy as np
import pandas as pd
from patsy import dmatrix
from statsmodels.duration.hazard_regression import PHReg
import os

from tqdm.auto import tqdm
tqdm.pandas()  # enables progress_apply on groupby/apply

from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the pickled events table. Set the STATSBOMB_EVENTS_PKL
# environment variable to point somewhere else; the default is the original
# hard-coded local path, so existing runs behave as before.
# NOTE(review): unpickling can execute arbitrary code — only load files you
# produced yourself or otherwise trust.
EVENTS_PKL = os.environ.get(
    "STATSBOMB_EVENTS_PKL",
    "/Users/jasminejiax/Desktop/TQE/Statsbomb/df_events_all.pkl",
)
df_events = pd.read_pickle(EVENTS_PKL)

# Survival

In [15]:
# Remove the cap on displayed columns, then list every column name of the
# events table.
pd.options.display.max_columns = None
column_names = df_events.columns.tolist()
print(column_names)

['id', 'index', 'period', 'timestamp', 'minute', 'second', 'possession', 'duration', 'match_id', 'type.id', 'type.name', 'possession_team.id', 'possession_team.name', 'play_pattern.id', 'play_pattern.name', 'team.id', 'team.name', 'tactics.formation', 'tactics.lineup', 'related_events', 'player.id', 'player.name', 'position.id', 'position.name', 'pass.recipient.id', 'pass.recipient.name', 'pass.length', 'pass.angle', 'pass.height.id', 'pass.height.name', 'pass.body_part.id', 'pass.body_part.name', 'pass.type.id', 'pass.type.name', 'pass.outcome.id', 'pass.outcome.name', 'ball_receipt.outcome.id', 'ball_receipt.outcome.name', 'out', 'under_pressure', 'clearance.body_part.id', 'clearance.body_part.name', 'clearance.left_foot', 'clearance.right_foot', 'pass.switch', 'counterpress', 'duel.outcome.id', 'duel.outcome.name', 'duel.type.id', 'duel.type.name', 'off_camera', 'dribble.outcome.id', 'dribble.outcome.name', 'pass.technique.id', 'pass.technique.name', 'pass.outswinging', 'clearance.a

## Survival Model 1: Time from the first goal (T1) to the second goal (T2)

In [19]:
def minute_float(df, minute_col='minute', second_col='second'):
    """Combine a minute column and a second column into fractional minutes.

    Both columns are coerced to numbers; values that cannot be parsed, and
    missing values, count as 0.

    Parameters
    ----------
    df : DataFrame holding the two columns.
    minute_col, second_col : names of the minute and second columns.

    Returns
    -------
    Series of floats equal to ``minute + second / 60``.
    """
    def _as_float(col):
        # Unparseable entries become NaN under 'coerce' and are then zeroed.
        return pd.to_numeric(df[col], errors='coerce').fillna(0).astype(float)

    return _as_float(minute_col) + _as_float(second_col) / 60.0

# Build Intervals
# One row per match that contains at least one goal: the interval runs from the
# first goal (t1) to the second goal, or to the latest recorded event time of
# the match when there is no second goal (censored, event=0).
# NOTE(review): ordering relies on the 'minute'/'second' clock only; if that
# clock can repeat across periods, goals may need ordering by period too — confirm.
EPS = 1e-9
df = df_events.copy()

# Goals are shots whose outcome is 'Goal'; t is the clock in minutes
is_goal = df['type.name'].eq('Shot') & df['shot.outcome.name'].eq('Goal')
goal_cols = ['match_id', 'team.name', 'shot.statsbomb_xg', 'minute', 'second']
goals = df.loc[is_goal, goal_cols].copy()
goals['t'] = minute_float(goals, 'minute', 'second')
goals = goals.sort_values(['match_id', 't']).reset_index(drop=True)

# Latest event time per match, used as the censoring horizon
df['_t_all'] = minute_float(df, 'minute', 'second')
match_end = df.groupby('match_id')['_t_all'].max().rename('match_end')

# Optional match -> home-team lookup (only when the events table carries it)
home_map = None
if 'home_team.home_team_name' in df.columns:
    home_names = df[['match_id', 'home_team.home_team_name']].dropna()
    home_names = home_names.drop_duplicates('match_id').set_index('match_id')
    home_map = home_names['home_team.home_team_name']

rows = []
for mid, g in goals.groupby('match_id', sort=False):
    g = g.sort_values('t').reset_index(drop=True)
    if g.empty:
        continue

    g1 = g.iloc[0]
    t1 = float(g1['t'])
    # Fall back to t1 if the match somehow has no recorded end time
    mend = float(match_end.loc[mid]) if mid in match_end.index else t1

    has_second_goal = len(g) >= 2
    t2 = float(g.iloc[1]['t']) if has_second_goal else mend
    event = 1 if has_second_goal else 0

    # Clamp at zero and add EPS so the stored duration is strictly positive
    dur = max(t2 - t1, 0.0) + EPS

    first_xg = g1['shot.statsbomb_xg']
    row = {
        'match_id': mid,
        't1': t1,                      # entry time (left truncation)
        't2': t2,                      # stop time
        'time_to_second_goal': dur,    # for reference
        'event': int(event),
        'first_goal_xg': float(first_xg) if pd.notna(first_xg) else np.nan,
        'first_goal_team': g1['team.name'],
    }
    if home_map is not None:
        row['first_goal_home'] = int(g1['team.name'] == home_map.get(mid, None))
    rows.append(row)

df_intervals = pd.DataFrame(rows)

# Basic sanity
n = len(df_intervals)
events = int(df_intervals['event'].sum())
print(f"Built intervals: n={n}, events={events}, censored={n - events}")

Built intervals: n=345, events=271, censored=74
                                         Results: PHReg
Model:                                PH Reg                   Sample size:                  345
Dependent variable:                   y                        Num. events:                  271
Ties:                                 Breslow                                                   
------------------------------------------------------------------------------------------------
                                            log HR log HR SE   HR      t    P>|t|  [0.025 0.975]
------------------------------------------------------------------------------------------------
Intercept                                   0.0000       nan 1.0000     nan    nan    nan    nan
C(first_goal_team)[T.Arsenal]              -0.0010    0.3390 0.9990 -0.0030 0.9976 0.5141 1.9414
C(first_goal_team)[T.Aston Villa]           0.2092    0.4856 1.2327  0.4309 0.6666 0.4759 3.1930
C(first_goal_team)[T.Ch

  return np.sqrt(np.diag(self.cov_params()))


In [None]:
# Left truncate
# Covariates only: the time axis is passed separately through `entry` (t1)
# and `endog` (t2), so t1 is not part of the design matrix.
if 'first_goal_home' in df_intervals.columns:
    formula = "first_goal_home + first_goal_xg"
else:
    formula = "C(first_goal_team) + first_goal_xg"
design = dmatrix(formula, df_intervals, return_type='dataframe')

# A Cox model has no intercept (it is absorbed by the baseline hazard). Keeping
# patsy's constant column is not identifiable and showed up as a nan standard
# error for `Intercept` in the fitted summary, so drop it while keeping the
# reference-level coding of the categorical terms.
X = design.drop(columns='Intercept', errors='ignore')

# patsy drops rows with missing covariates by default, so take the outcome
# arrays from the rows that survived rather than from the full table;
# otherwise X and the outcome arrays can differ in length.
kept = df_intervals.loc[X.index]
y_stop = kept['t2'].astype(float).values
y_entry = kept['t1'].astype(float).values
status = kept['event'].astype(int).values

# Drop rows with any remaining non-finite value
mask = (
    np.isfinite(y_stop)
    & np.isfinite(y_entry)
    & np.isfinite(status)
    & np.isfinite(X.values).all(axis=1)
)
X = X.loc[mask]
y_stop = y_stop[mask]
y_entry = y_entry[mask]
status = status[mask]

# Fit Cox PH with left truncation
model = PHReg(endog=y_stop, exog=X, status=status, entry=y_entry, ties='breslow')
result = model.fit()
print(result.summary())

# Hazard ratios
names = getattr(result.model, "exog_names", [f"beta_{i}" for i in range(len(result.params))])
hr = pd.Series(np.exp(result.params), index=names)
print("\nHazard ratios (exp(beta)):\n", hr.sort_values(ascending=False))

## Survival Model 2: Time-varying survival for football matches (lifelines)

This model describes the hazard of the next goal as a function of `score_diff`, `rc_diff`, and `time_elapsed`.

In [3]:
import numpy as np
import pandas as pd
from lifelines import CoxTimeVaryingFitter

def minute_float(df, minute_col='minute', second_col='second'):
    """Return the clock as fractional minutes: ``minute + second / 60``.

    Each column is coerced to numeric first; entries that cannot be parsed
    or are missing are treated as 0. The result is a float Series.
    """
    minutes, seconds = (
        pd.to_numeric(df[col], errors='coerce').fillna(0).astype(float)
        for col in (minute_col, second_col)
    )
    return minutes + seconds / 60.0

def infer_teams_map(df):
    """Pick two teams per match from the order in which team names appear.

    Returns
    -------
    mapping : dict
        ``match_id -> (first_team, second_team)``, the first two distinct
        non-null ``team.name`` values of the match in order of appearance.
    skipped : set
        match ids with fewer than two distinct team names.
    """
    mapping = {}
    skipped = set()
    for mid, names in df.groupby('match_id')['team.name']:
        teams = pd.unique(names.dropna())
        if teams.size >= 2:
            mapping[mid] = (teams[0], teams[1])
        else:
            skipped.add(mid)
    return mapping, skipped

def detect_red_cards(df):
    """Return one row per dismissal with its match, team and clock time.

    Every column whose name ends with ``card.name`` is scanned; a row counts
    as a dismissal when any of them equals ``'Red Card'`` or
    ``'Second Yellow'``.

    Parameters
    ----------
    df : events table with ``match_id``, ``team.name``, ``minute`` and
        ``second`` columns.

    Returns
    -------
    DataFrame with columns ``match_id``, ``team.name`` and ``_t`` (minutes);
    empty when there is no card column or no dismissal.
    """
    out_cols = ['match_id', 'team.name', '_t']
    card_cols = [c for c in df.columns if c.endswith('card.name')]
    if not card_cols:
        return pd.DataFrame(columns=out_cols)

    red_mask = df[card_cols].isin(['Red Card', 'Second Yellow']).any(axis=1).to_numpy()
    if not red_mask.any():
        return pd.DataFrame(columns=out_cols)

    reds = df.loc[red_mask, ['match_id', 'team.name']].copy()
    # Take the times from the same boolean mask and assign them positionally:
    # looking rows up again by index label breaks when the index is not unique.
    reds['_t'] = minute_float(df.loc[red_mask, ['minute', 'second']]).to_numpy()
    return reds

def detect_goals(df):
    """Return one row per goal with its match, team and clock time.

    A goal is an event with ``type.name == 'Shot'`` and
    ``shot.outcome.name == 'Goal'``.
    NOTE(review): own goals are presumably recorded under a different event
    type and would then not be counted here — confirm against the data spec.

    Returns
    -------
    DataFrame with columns ``match_id``, ``team.name`` and ``_t`` (minutes);
    empty when there are no goals.
    """
    is_goal = (
        ((df['type.name'] == 'Shot') & (df['shot.outcome.name'] == 'Goal'))
        .fillna(False)
        .to_numpy(dtype=bool)
    )
    if not is_goal.any():
        return pd.DataFrame(columns=['match_id', 'team.name', '_t'])

    goals = df.loc[is_goal, ['match_id', 'team.name']].copy()
    # Take the times from the same boolean mask and assign them positionally:
    # looking rows up again by index label breaks when the index is not unique.
    goals['_t'] = minute_float(df.loc[is_goal, ['minute', 'second']]).to_numpy()
    return goals

def build_timevarying(df_events):
    """Build start/stop intervals per match for a time-varying Cox model.

    Each match is cut at every goal, every red card and at its last recorded
    event time. An interval carries the state at its start: ``score_diff``
    and ``rc_diff`` (first-listed team minus second-listed team, as returned
    by ``infer_teams_map``) and ``time_elapsed`` (the interval start).
    ``event`` is 1 when the interval ends with a goal, otherwise 0.

    Parameters
    ----------
    df_events : events table; must contain ``match_id``, ``team.name``,
        ``minute``, ``second``, ``type.name`` and ``shot.outcome.name``.

    Returns
    -------
    DataFrame with columns ``match_id``, ``start``, ``stop``, ``event``,
    ``score_diff``, ``rc_diff``, ``time_elapsed``.

    Raises
    ------
    ValueError
        If a required column is missing or no interval could be built.
    """
    df = df_events.copy()
    required = {'match_id', 'team.name', 'minute', 'second', 'type.name', 'shot.outcome.name'}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"df_events is missing required columns: {sorted(missing)}")

    df['_t'] = minute_float(df)
    match_end = df.groupby('match_id')['_t'].max()
    goals = detect_goals(df).sort_values(['match_id','_t'])
    reds  = detect_red_cards(df).sort_values(['match_id','_t'])
    teams_map, skipped_matches = infer_teams_map(df)

    records = []
    for mid, _ in df.groupby('match_id', sort=False):
        if mid in skipped_matches:
            continue
        t_end = float(match_end.get(mid, np.nan))
        if not np.isfinite(t_end):
            continue
        team_A, team_B = teams_map[mid]

        match_goals = goals.loc[goals['match_id'] == mid]
        match_reds = reds.loc[reds['match_id'] == mid]

        # Change points: (kind, time, team). Goals are listed before reds and
        # the match end last; the stable sort keeps that order for equal times.
        change_points = [
            ('goal', float(t), team)
            for t, team in zip(match_goals['_t'], match_goals['team.name'])
        ]
        change_points += [
            ('red', float(t), team)
            for t, team in zip(match_reds['_t'], match_reds['team.name'])
        ]
        change_points.append(('end', t_end, None))
        change_points.sort(key=lambda cp: cp[1])

        score_diff = 0   # goals of team_A minus goals of team_B so far
        rc_diff = 0      # red cards of team_A minus red cards of team_B so far
        t_prev = 0.0
        for kind, t_evt, team in change_points:
            if not np.isfinite(t_evt):
                continue
            if t_evt <= t_prev:
                # Simultaneous change points: nudge forward so stop > start.
                t_evt = np.nextafter(t_prev, np.inf)

            # The interval is recorded with the state in force *before* the
            # change point that closes it.
            records.append({
                'match_id': mid,
                'start': t_prev,
                'stop': t_evt,
                'event': int(kind == 'goal'),
                'score_diff': score_diff,
                'rc_diff': rc_diff,
                'time_elapsed': t_prev
            })

            if kind in ('goal', 'red'):
                # +1 for team_A, -1 for team_B, 0 for an unrecognised team.
                if team == team_A:
                    step = 1
                elif team == team_B:
                    step = -1
                else:
                    step = 0
                if kind == 'goal':
                    score_diff += step
                else:
                    rc_diff += step
            t_prev = t_evt

    tv = pd.DataFrame(records)
    if tv.empty:
        raise ValueError("No intervals built. Check that df_events has goals and valid times.")

    positive_length = tv['stop'] > tv['start']
    covariates_present = tv[['score_diff','rc_diff','time_elapsed']].notna().all(axis=1)
    tv = tv[positive_length & covariates_present].copy()

    for col in ['start','stop','event','score_diff','rc_diff','time_elapsed']:
        tv[col] = pd.to_numeric(tv[col], errors='coerce')
    return tv.dropna(subset=['start','stop','event'])

def fit_timevarying_cox_simple(tv_df):
    """Fit a time-varying Cox model on score difference and elapsed time.

    ``time_elapsed`` is standardised (population standard deviation) into
    ``time_elapsed_z``; a zero or non-finite spread yields an all-zero
    column. ``rc_diff`` is included only when it varies.

    Parameters
    ----------
    tv_df : interval table with ``match_id``, ``start``, ``stop``, ``event``,
        ``score_diff``, ``rc_diff`` and ``time_elapsed``.

    Returns
    -------
    (fitted CoxTimeVaryingFitter, list of covariate names used)
    """
    df = tv_df.copy()

    elapsed = df['time_elapsed']
    spread = elapsed.std(ddof=0)
    if np.isfinite(spread) and spread != 0:
        df['time_elapsed_z'] = (elapsed - elapsed.mean()) / spread
    else:
        df['time_elapsed_z'] = 0.0

    covars = ['score_diff']
    if df['rc_diff'].std(ddof=0) > 0:
        covars.append('rc_diff')
    covars.append('time_elapsed_z')

    design = df[['match_id', 'start', 'stop', 'event'] + covars].copy()
    ctv = CoxTimeVaryingFitter()
    ctv.fit(
        design,
        id_col='match_id',
        start_col='start',
        stop_col='stop',
        event_col='event',
        show_progress=False
    )
    return ctv, covars


In [4]:
# Build the interval table from the raw events and fit the simple model
tv_df = build_timevarying(df_events)
ctv, used_covars = fit_timevarying_cox_simple(tv_df)

print("Covariates used:", used_covars)
print(ctv.summary)

# exp(coef) of each covariate
hazard_ratios = np.exp(ctv.params_)
print("\nHazard ratios (exp(beta)):\n", hazard_ratios)

Covariates used: ['score_diff', 'rc_diff', 'time_elapsed_z']
                    coef  exp(coef)  se(coef)  coef lower 95%  coef upper 95%  \
covariate                                                                       
score_diff      0.004924   1.004936  0.026641       -0.047292        0.057140   
rc_diff        -0.050812   0.950458  0.127565       -0.300835        0.199212   
time_elapsed_z  0.032354   1.032883  0.054958       -0.075361        0.140070   

                exp(coef) lower 95%  exp(coef) upper 95%  cmp to         z  \
covariate                                                                    
score_diff                 0.953809             1.058804     0.0  0.184832   
rc_diff                    0.740200             1.220441     0.0 -0.398318   
time_elapsed_z             0.927408             1.150354     0.0  0.588709   

                       p  -log2(p)  
covariate                           
score_diff      0.853360  0.228773  
rc_diff         0.690396  0.534

Model 2, extended with more variables (home score state, red-card difference, rolling xG difference) and stratified by half.

In [7]:
import numpy as np
import pandas as pd
from lifelines import CoxTimeVaryingFitter

def minute_float(df, minute_col='minute', second_col='second'):
    """Convert minute/second columns to a single float Series of minutes.

    Values are coerced to numeric; anything unparseable or missing becomes 0
    before ``minute + second / 60`` is computed.
    """
    minutes = pd.to_numeric(df[minute_col], errors='coerce')
    seconds = pd.to_numeric(df[second_col], errors='coerce')
    minutes = minutes.fillna(0).astype(float)
    seconds = seconds.fillna(0).astype(float)
    return minutes + seconds / 60.0

def has_home_away(df):
    """Return True when ``df`` has both the home-team and away-team name columns."""
    needed = ('home_team.home_team_name', 'away_team.away_team_name')
    return all(name in df.columns for name in needed)

def get_home_away_maps(df):
    """Return two Series mapping ``match_id`` to the home and the away team.

    When the explicit home/away name columns exist, the first non-null pair
    per match is used. Otherwise the first two distinct ``team.name`` values
    of each match (in order of appearance) stand in for "home" and "away";
    in that fallback the labels say nothing about who actually played at
    home. Matches with fewer than two team names are left out.
    """
    if has_home_away(df):
        def _first_per_match(col):
            pairs = df[['match_id', col]].dropna().drop_duplicates('match_id')
            return pairs.set_index('match_id')[col]

        home = _first_per_match('home_team.home_team_name')
        away = _first_per_match('away_team.away_team_name')
        return home, away

    home, away = {}, {}
    for mid, names in df.groupby('match_id')['team.name']:
        teams = pd.unique(names.dropna())
        if teams.size >= 2:
            home[mid], away[mid] = teams[0], teams[1]
    return pd.Series(home), pd.Series(away)

def detect_goals(df):
    """Return the goals in ``df`` as rows of ``match_id``, ``team.name``, ``_t``.

    A goal is an event with ``type.name == 'Shot'`` and
    ``shot.outcome.name == 'Goal'``; ``_t`` is the clock in minutes taken
    from the ``minute`` and ``second`` columns. An empty frame with the same
    three columns is returned when no goal is found.
    NOTE(review): own goals are presumably stored under another event type
    and would then be missed here — confirm against the data spec.
    """
    goal_mask = (
        ((df['type.name'] == 'Shot') & (df['shot.outcome.name'] == 'Goal'))
        .fillna(False)
        .to_numpy(dtype=bool)
    )
    if not goal_mask.any():
        return pd.DataFrame(columns=['match_id','team.name','_t'])

    goals = df.loc[goal_mask, ['match_id','team.name']].copy()
    # Positional assignment from the same mask: a second lookup by index label
    # would fail or duplicate rows when the index is not unique.
    goals['_t'] = minute_float(df.loc[goal_mask, ['minute', 'second']]).to_numpy()
    return goals

def detect_reds(df):
    """Return the dismissals in ``df`` as rows of ``match_id``, ``team.name``, ``_t``.

    All columns whose name ends with ``card.name`` are checked; a row is a
    dismissal when any of them is ``'Red Card'`` or ``'Second Yellow'``.
    ``_t`` is the clock in minutes from the ``minute`` and ``second``
    columns. An empty frame with the same three columns is returned when
    there is no card column or no dismissal.
    """
    result_cols = ['match_id','team.name','_t']
    cand_cols = [c for c in df.columns if c.endswith('card.name')]
    if not cand_cols:
        return pd.DataFrame(columns=result_cols)

    dismissed = df[cand_cols].isin(['Red Card','Second Yellow']).any(axis=1).to_numpy()
    if not dismissed.any():
        return pd.DataFrame(columns=result_cols)

    reds = df.loc[dismissed, ['match_id','team.name']].copy()
    # Positional assignment from the same mask: a second lookup by index label
    # would fail or duplicate rows when the index is not unique.
    reds['_t'] = minute_float(df.loc[dismissed, ['minute', 'second']]).to_numpy()
    return reds

def half_from_time(t):
    """Return 1 for clock times up to and including 45.0 minutes, else 2.

    Anything that does not compare ``<= 45.0`` (including NaN) yields 2.
    """
    if t <= 45.0:
        return 1
    return 2

def build_tv_with_home(df_events, roll_window_min=10.0):
    """Build start/stop intervals per match with home-oriented covariates.

    Each match is cut at every goal, every red card and at its last recorded
    event time. Every interval carries the state at its start:

    * ``home_score_diff`` / ``home_rc_diff`` — home minus away goals / reds,
    * ``half`` — ``half_from_time`` of the interval start,
    * ``time_elapsed`` — the interval start,
    * ``roll_xg_diff`` — home minus away xG of shots in the
      ``roll_window_min`` minutes before the interval start.

    ``event`` is 1 when the interval ends with a goal, otherwise 0. "Home" and
    "away" are whatever ``get_home_away_maps`` returns for the match.

    NOTE(review): intervals are not split at 45 minutes, so an interval that
    starts in the first half keeps ``half == 1`` for its whole length —
    confirm this is acceptable when stratifying on ``half``.

    Parameters
    ----------
    df_events : events table; must contain ``match_id``, ``team.name``,
        ``minute``, ``second``, ``type.name`` and ``shot.outcome.name``.
        ``shot.statsbomb_xg`` is optional; without it rolling xG is 0.
    roll_window_min : length of the rolling xG window in minutes.

    Returns
    -------
    DataFrame with columns ``match_id``, ``start``, ``stop``, ``event``,
    ``home_score_diff``, ``home_rc_diff``, ``half``, ``time_elapsed``,
    ``roll_xg_diff``.

    Raises
    ------
    ValueError
        If a required column is missing or no interval could be built.
    """
    req = {'match_id','team.name','minute','second','type.name','shot.outcome.name'}
    missing = req - set(df_events.columns)
    if missing:
        raise ValueError(f"df_events missing required columns: {sorted(missing)}")

    df = df_events.copy()
    df['_t'] = minute_float(df)
    home_map, away_map = get_home_away_maps(df)
    match_end = df.groupby('match_id')['_t'].max()
    goals = detect_goals(df).sort_values(['match_id','_t'])
    reds  = detect_reds(df).sort_values(['match_id','_t'])

    # Decide whether xG exists *before* selecting it: asking .loc for a missing
    # column raises KeyError, so a check made afterwards can never take effect.
    is_shot = (df['type.name'] == 'Shot').fillna(False).to_numpy(dtype=bool)
    shots = df.loc[is_shot, ['match_id','team.name','_t']].copy()
    if 'shot.statsbomb_xg' in df.columns:
        shots['shot.statsbomb_xg'] = pd.to_numeric(
            df.loc[is_shot, 'shot.statsbomb_xg'], errors='coerce'
        ).to_numpy()
    else:
        shots['shot.statsbomb_xg'] = np.nan

    rows = []
    # Match ids in order of first appearance; only the id is needed, so the
    # per-match sub-frames of the wide events table are never materialised.
    for mid in pd.unique(df['match_id'].dropna()):
        t_end = float(match_end.get(mid, np.nan))
        if not np.isfinite(t_end):
            continue
        home = home_map.get(mid, None)
        away = away_map.get(mid, None)
        if pd.isna(home) or pd.isna(away):
            continue

        g_goals = goals.loc[goals['match_id']==mid]
        g_reds  = reds.loc[reds['match_id']==mid]
        g_shots = shots.loc[shots['match_id']==mid]

        # Change points: (kind, time, team). Goals first, then reds, then the
        # match end; the stable sort keeps that order for equal times.
        cps = [('goal', float(t), tm) for t, tm in zip(g_goals['_t'], g_goals['team.name'])]
        cps += [('red', float(t), tm) for t, tm in zip(g_reds['_t'], g_reds['team.name'])]
        cps.append(('end', t_end, None))
        cps.sort(key=lambda x: x[1])

        score_h = score_a = 0
        red_h = red_a = 0
        t_prev = 0.0
        for etype, t_evt, tm in cps:
            if not np.isfinite(t_evt):
                continue
            if t_evt <= t_prev:
                # Simultaneous change points: nudge forward so stop > start.
                t_evt = np.nextafter(t_prev, np.inf)

            # Rolling xG over (t_prev - window, t_prev], clipped at kick-off.
            t_lo = max(0.0, t_prev - roll_window_min)
            sh = g_shots.loc[(g_shots['_t'] > t_lo) & (g_shots['_t'] <= t_prev)]
            if sh.empty:
                roll_xg_home = 0.0
                roll_xg_away = 0.0
            else:
                roll_xg_home = sh.loc[sh['team.name']==home, 'shot.statsbomb_xg'].fillna(0).sum()
                roll_xg_away = sh.loc[sh['team.name']==away, 'shot.statsbomb_xg'].fillna(0).sum()

            # The interval is recorded with the state in force before the
            # change point that closes it.
            rows.append({
                'match_id': mid,
                'start': t_prev,
                'stop': t_evt,
                'event': int(etype == 'goal'),
                'home_score_diff': score_h - score_a,
                'home_rc_diff': red_h - red_a,
                'half': half_from_time(t_prev),
                'time_elapsed': t_prev,
                'roll_xg_diff': roll_xg_home - roll_xg_away
            })

            if etype == 'goal':
                if tm == home:
                    score_h += 1
                elif tm == away:
                    score_a += 1
            elif etype == 'red':
                if tm == home:
                    red_h += 1
                elif tm == away:
                    red_a += 1
            t_prev = t_evt

    tv = pd.DataFrame(rows)
    if tv.empty:
        # Without this guard the column access below fails with KeyError: 'stop'.
        raise ValueError(
            "No intervals built. Check that df_events has valid times and two teams per match."
        )
    tv = tv[(tv['stop'] > tv['start'])].copy()
    for c in ['start','stop','event','home_score_diff','home_rc_diff','half','time_elapsed','roll_xg_diff']:
        tv[c] = pd.to_numeric(tv[c], errors='coerce')
    tv = tv.dropna(subset=['start','stop','event','home_score_diff','half'])
    return tv

def add_score_state_features(tv):
    """Return a copy of ``tv`` with 0/1 indicators of the home score state.

    Added columns, derived from ``home_score_diff``: ``score_tied`` (== 0),
    ``home_up_1`` (== 1), ``home_up_2plus`` (>= 2), ``home_down_1`` (== -1)
    and ``home_down_2plus`` (<= -2). The input frame is not modified.
    """
    diff = tv['home_score_diff']
    indicators = {
        'score_tied': diff == 0,
        'home_up_1': diff == 1,
        'home_up_2plus': diff >= 2,
        'home_down_1': diff == -1,
        'home_down_2plus': diff <= -2,
    }
    out = tv.copy()
    for name, flag in indicators.items():
        out[name] = flag.astype(int)
    return out

def add_half_columns(tv):
    """Return a copy of ``tv`` whose ``half`` column is cast to int.

    No new column is added despite the name; the input is left untouched.
    """
    return tv.assign(half=tv['half'].astype(int))

def zscore(series):
    """Standardise ``series`` with its mean and population std (``ddof=0``).

    When the standard deviation is zero or not finite, a Series of 0.0 on
    the same index is returned instead of dividing.
    """
    spread = series.std(ddof=0)
    if np.isfinite(spread) and spread != 0:
        return (series - series.mean()) / spread
    return pd.Series(0.0, index=series.index)

def finalize_design(tv):
    """Standardise the continuous covariates and select the model columns.

    Adds ``time_elapsed_z`` and ``roll_xg_diff_z`` (missing rolling xG counts
    as 0 before standardising). ``score_tied`` is not among the covariates,
    and ``home_rc_diff`` is left out when its standard deviation is exactly 0.

    Returns
    -------
    (design, covars) : the frame restricted to ``match_id``, ``start``,
    ``stop``, ``event``, ``half`` plus the covariates, and the covariate list.
    """
    tv = tv.copy()
    tv['time_elapsed_z'] = zscore(tv['time_elapsed'])
    tv['roll_xg_diff_z'] = zscore(tv['roll_xg_diff'].fillna(0.0))

    covars = ['home_up_1', 'home_up_2plus', 'home_down_1', 'home_down_2plus']
    if tv['home_rc_diff'].std(ddof=0) != 0:
        covars.append('home_rc_diff')
    covars += ['time_elapsed_z', 'roll_xg_diff_z']

    design_cols = ['match_id', 'start', 'stop', 'event', 'half'] + covars
    return tv[design_cols].copy(), covars

def fit_ctv(tv, covars, stratify_by_half=True):
    """Fit a time-varying Cox model on the interval table ``tv``.

    Parameters
    ----------
    tv : interval table with ``match_id``, ``start``, ``stop``, ``event``,
        ``half`` and every column named in ``covars``.
    covars : names of the covariate columns to include.
    stratify_by_half : when True, ``half`` is used as a stratum; when False,
        an indicator ``half2`` (``half == 2``) is added as a covariate.

    Returns
    -------
    The fitted ``CoxTimeVaryingFitter``.
    """
    base_cols = ['match_id', 'start', 'stop', 'event']
    fit_kwargs = dict(
        id_col='match_id',
        start_col='start',
        stop_col='stop',
        event_col='event',
        show_progress=False,
    )
    ctv = CoxTimeVaryingFitter()
    if stratify_by_half:
        # Restrict to the requested covariates, as the other branch does, so
        # that extra columns in `tv` cannot enter the model as regressors.
        design = tv[base_cols + ['half'] + list(covars)]
        ctv.fit(design, strata=['half'], **fit_kwargs)
    else:
        design = tv[base_cols + list(covars)].assign(
            half2=(tv['half'] == 2).astype(int)
        )
        ctv.fit(design, **fit_kwargs)
    return ctv

# Length of the rolling xG window, in minutes
ROLL_WINDOW_MINUTES = 10.0

# Interval table -> score-state indicators -> integer half -> final design
tv0 = build_tv_with_home(df_events, roll_window_min=ROLL_WINDOW_MINUTES)
tv1 = add_score_state_features(tv0)
tv2 = add_half_columns(tv1)
tv_final, used_covars = finalize_design(tv2)

print("Design covariates used:", used_covars)
print("N intervals:", len(tv_final), "   N matches:", tv_final['match_id'].nunique())

# Fit with `half` as a stratum and report the coefficients
ctv = fit_ctv(tv_final, used_covars, stratify_by_half=True)
print(ctv.summary)
print("\nHazard ratios (exp(beta)):\n", np.exp(ctv.params_))