In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

from run_batch_gpt_pass2 import load_sequence_jsonl
# Per-sample LLM outputs; later cells read the "pred" field of each record.
llm_predictions = load_sequence_jsonl("pass2_predictions.jsonl")
# Input samples; later cells read the "history" and "target" fields of each record.
# NOTE(review): predictions and sequences are paired by list position downstream —
# presumably both files were written in the same order; confirm.
sequences = load_sequence_jsonl("sequence_pass2.jsonl")

In [18]:
print("Events per match")
# Average event count over every match found in each sample's team1 history.
float(
    np.mean(
        [
            len(match["events"])
            for sample in sequences
            for match in sample["history"]["team1"]
        ]
    )
)

Events per match


16.88005923000987

In [19]:
# Predicted and actual outcome of each target match, both read per record and
# paired by list position.
preds = [record["pred"] for record in llm_predictions]
reality = [sample["target"]["team1_result"] for sample in sequences]
print("Total accuracy we achieve with LLMs:")
# Ground truth first, prediction second: scikit-learn's (y_true, y_pred)
# convention, and the same order used when accuracy is recomputed from `df`.
accuracy_score(reality, preds)

Total accuracy we achieve with LLMs:


0.6110056925996205

In [3]:
def short_date(iso):
    """Return the date portion of an ISO-style timestamp string.

    Everything before the first ``'T'`` is returned; a string that contains
    no ``'T'`` comes back unchanged.
    """
    date_part, _separator, _time_part = iso.partition('T')
    return date_part

In [27]:
def _history_span(team_label, history):
    """Build the one-line date-span summary for a team's match history.

    An empty history yields an explicit message instead of an IndexError.
    """
    if not history:
        return f"{team_label} match history is empty"
    first = short_date(history[0]["date"])
    last = short_date(history[-1]["date"])
    return f"{team_label} match history spans from {first} to {last}"


def describe_sample(input_sample):
    """Print a human-readable summary of one prediction sample.

    The summary shows the target match (team names, team1 result, date and
    both rosters), the date span covered by each team's history, and each
    team's past results in stored order.

    Args:
        input_sample: Mapping with a "history" entry holding "team1" and
            "team2" lists of past matches (each read for "pov_win" and
            "date"), and a "target" entry read for "team1_name",
            "team2_name", "team1_result", "date", "team1_players" and
            "team2_players".

    Returns:
        None. Everything is written to stdout.
    """
    print("-----")
    team1_history = input_sample["history"]["team1"]
    team2_history = input_sample["history"]["team2"]

    team1_results = [x["pov_win"] for x in team1_history]
    team2_results = [x["pov_win"] for x in team2_history]

    target = input_sample["target"]

    # Built up front so a malformed history fails before anything about the
    # target match has been printed.
    team1_span = _history_span("Team1", team1_history)
    team2_span = _history_span("Team2", team2_history)

    print("Match for which we predict the outcome:")
    print(
        target["team1_name"],
        target["team2_name"],
        ",Result: " + str(target["team1_result"]) + ",",
        short_date(target["date"])
    )
    print("team1_players:", target["team1_players"])
    print("team2_players:", target["team2_players"])
    print(team1_span)
    print(team2_span)

    # NOTE(review): the "oldest -> newest" label assumes the history lists are
    # stored chronologically; nothing here sorts them — confirm upstream.
    print("\nLast matches results (oldest -> newest):")
    print("team1:", team1_results)
    print("team2:", team2_results)

    print("-----")

# Sample 333 is a fixed pick used to illustrate what a single input looks like.
input_sample = sequences[333]
describe_sample(input_sample=input_sample)
print(
    "Post mortem: Maybe here we see that H=8 isn't enough as for team1 as we just get five games history (particularly relevant for chapter 4,5)."
)

-----
Match for which we predict the outcome:
Bilibili Gaming Anyone's Legend ,Result: 1, 2025-06-14
team1_players: ['Bin', 'Beichuan', 'Knight', 'Elk', 'ON']
team2_players: ['Flandre', 'Tarzan', 'Shanks', 'Hope', 'Kael']
Team1 match history spans from 2025-06-09 to 2025-06-14
Team2 match history spans from 2025-06-03 to 2025-06-14

Last matches results (oldest -> newest):
team1: [1, 0, 0, 1, 1, 1, 0, 0]
team2: [1, 1, 0, 1, 0, 1, 1, 1]
-----


In [20]:
print("It could be the case that LLM just pays attention to the outcome history, ignoring the events. Let's check that.")
print("We first take a look at the correlation between LLM predicted label and the history-based signals.")

# Feature rows are paired with predictions and labels purely by position, so a
# length mismatch would silently misalign every row with its label.
if not (len(sequences) == len(preds) == len(reality)):
    raise ValueError(
        f"Misaligned inputs: {len(sequences)} sequences, "
        f"{len(preds)} predictions, {len(reality)} labels"
    )


def _win_rate(results):
    """Return the mean of a list of win flags, or NaN when the list is empty."""
    return float(np.mean(results)) if len(results) else float("nan")


rows = []
for i, (s, pred, real) in enumerate(zip(sequences, preds, reality)):
    h1 = s["history"]["team1"]
    h2 = s["history"]["team2"]
    r1 = [x["pov_win"] for x in h1]
    r2 = [x["pov_win"] for x in h2]
    # Computed once and reused for the per-team columns and their difference.
    wr1 = _win_rate(r1)
    wr2 = _win_rate(r2)

    rows.append({
        "idx": i,
        "pred": pred,
        "real": real,
        "winrate_team1": wr1,
        "winrate_team2": wr2,
        "winrate_diff": wr1 - wr2,
        "last3_wr_diff": _win_rate(r1[-3:]) - _win_rate(r2[-3:]),
        # Undefined (NaN) when either team has no history at all.
        "last_match_diff": r1[-1] - r2[-1] if r1 and r2 else float("nan"),
        "n1": len(r1),
        "n2": len(r2),
    })

df = pd.DataFrame(rows)

print("LLM accuracy vs reality:", accuracy_score(df["real"], df["pred"]))

# Correlation between LLM predicted label and history-based signals
cols = ["winrate_diff", "last3_wr_diff", "last_match_diff", "winrate_team1", "winrate_team2", "n1", "n2"]
print("\nCorrelation with LLM predicted label (pred):")
print(df[["pred"] + cols].corr(numeric_only=True)["pred"].sort_values(ascending=False))

print("\n We observe notable correlations on winrate based signals (0.46-0.6). The strongest correlation is to the winrate difference.")

It could be the case that LLM just pays attention to the outcome history, ignoring the events. Let's check that.
We first take a look at the correlation between LLM predicted label and the history-based signals.
LLM accuracy vs reality: 0.6110056925996205

Correlation with LLM predicted label (pred):
pred               1.000000
winrate_diff       0.614228
last3_wr_diff      0.514779
winrate_team1      0.466385
last_match_diff    0.278117
n1                 0.006713
n2                 0.002231
winrate_team2     -0.372578
Name: pred, dtype: float64

 We observe notable correlations on winrate based signals (0.46-0.6). The strongest correlation is to the winrate difference.


In [32]:
print("We take a look at matches where LLM is correct, but \"history vote\" disagrees.")
print("History vote means predicting the team with more wins in their history.")

df2 = df.copy()
df2["history_vote"] = (df2["winrate_diff"] > 0).astype(int)  # naive rule
# With equal win rates neither team is ahead, so history casts no vote. The
# naive rule above would label such rows as a vote for team2; they are excluded
# from the disagreement set instead of being counted as "LLM beat history".
df2["history_has_vote"] = df2["winrate_diff"].abs() > 0

# keep only cases where LLM matches reality, then compare to history_vote
df2["llm_correct"] = (df2["pred"] == df2["real"])
df2["llm_vs_history"] = (df2["pred"] != df2["history_vote"])

hard = df2[
    df2["llm_correct"] & df2["llm_vs_history"] & df2["history_has_vote"]
].sort_values("winrate_diff", ascending=False)

hard_idx = hard["idx"].head(10).tolist()
print(hard_idx)

# Take one sample and save it for later inspection
if not hard_idx:
    raise ValueError("No sample where the LLM is correct and disagrees with the history vote.")
disagreement_sample = sequences[hard_idx[0]]


We take a look at matches where LLM is correct, but "history vote" disagrees.
History vote means predicting the team with more wins in their history.
[461, 153, 278, 434, 239, 157, 247, 279, 285, 435]


In [23]:
print("We now look at the dependency of accuracy on how evenly matched the opposing teams are.")
print("If LLM just pays attention to outcome history we would see clear trend in rising accuracy.")
df3 = df.copy()
df3["correct"] = (df3["pred"] == df3["real"]).astype(int)

# bin by abs(winrate_diff)
df3["abs_wr_diff"] = df3["winrate_diff"].abs()
# Five quantile bins are requested, but duplicates="drop" merges bins whose
# edges coincide, so fewer than five rows may come back.
bins = pd.qcut(df3["abs_wr_diff"], 5, duplicates="drop")
# `observed` is passed explicitly: grouping by a categorical without it emits a
# FutureWarning in recent pandas because the default is changing. False keeps
# the behaviour this cell has had so far.
summary = df3.groupby(bins, observed=False).agg(
    n=("correct", "size"),
    acc=("correct", "mean"),
    mean_abs_diff=("abs_wr_diff", "mean")
).reset_index(drop=True)

print(summary)
print("We see that the accuracy in bins increases monotonically as opponents get less evenly matched. That shows that the LLMs largely pay attention to the outcome history, modeling team strengths.")


We now look at the dependency of accuracy on how evenly matched the opposing teams are.
If LLM just pays attention to outcome history we would see clear trend in rising accuracy.
     n       acc  mean_abs_diff
0  234  0.585470       0.079767
1  142  0.605634       0.242111
2   72  0.625000       0.365526
3   79  0.683544       0.577532
We see that the accuracy in bins increases monotonically as opponents get less evenly matched. That shows that the LLMs largely pay attention to the outcome history, modeling team strengths.


  summary = df3.groupby(bins).agg(


In [35]:
print("Finally, let's take a look at the disagreement sample and try to uncover why the LLM decided this way.")

describe_sample(input_sample=disagreement_sample)

print("\nIt's natural to assume that the most recent match has the most predictive signal and it could be a match, which is part of the BO series.")
# Event lists of the most recent history entry of each team; the team2 list is
# kept so both can be compared by hand in the variable view.
last_game_t1_events, last_game_t2_events = (
    disagreement_sample["history"]["team1"][-1]["events"],
    disagreement_sample["history"]["team2"][-1]["events"],
)

print(
    "The last game is shared for both teams (as can be seen in Jupyter variable view)",
    "\nThe last event says...",
    sep="\n",
)
# Only the final event of that game is shown.
print(last_game_t1_events[-1])

print("By this event, we can see that LLM is catching onto the game one (this is game two) and believes the team WE to win again.")




Finally, let's take a look at the disagreement sample and try to uncover why the LLM decided this way.
-----
Match for which we predict the outcome:
Anyone's Legend Team WE ,Result: 0, 2025-08-27
team1_players: ['Flandre', 'Tarzan', 'Shanks', 'Hope', 'Kael']
team2_players: ['Cube', 'Monki', 'Karis', 'Taeyoon', 'Vampire']
Team1 match history spans from 2025-08-17 to 2025-08-27
Team2 match history spans from 2025-08-12 to 2025-08-27

Last matches results (oldest -> newest):
team1: [1, 0, 0, 1, 1, 1, 1, 0]
team2: [1, 0, 0, 0, 0, 0, 0, 1]
-----

It's natural to assume that the most recent match has the most predictive signal and it could be a match, which is part of the BO series.
The last game is shared for both teams (as can be seen in Jupyter variable view)

The last event says...
{'label': 'match_winner', 'text': "and it is a WE master class in game number one. Al, where have you gone? We don't know, but we have taken your place and game one is theirs."}


In [37]:
print("Finally, let's take a look at the disagreement sample and try to uncover why the LLM decided this way.")
# Second entry of the disagreement list.
disagreement_sample2 = sequences[hard_idx[1]]
describe_sample(input_sample=disagreement_sample2)

print("\nIt's natural to assume that the most recent match has the most predictive signal and it could be a match, which is part of the BO series.")
# Event lists of the most recent history entry of each team; the team2 list is
# kept so both can be compared by hand in the variable view.
last_game_t1_events, last_game_t2_events = (
    disagreement_sample2["history"]["team1"][-1]["events"],
    disagreement_sample2["history"]["team2"][-1]["events"],
)

print(
    "The last game is shared for both teams (as can be seen in Jupyter variable view)",
    "\nThe last event says...",
    sep="\n",
)
# Only the final event of that game is shown.
print(last_game_t1_events[-1])

print("By this event, again, we can see that LLM is catching onto the game one (this is game two), stating that JDG won in a dominant fashion, thus believing they will win again.")




Finally, let's take a look at the disagreement sample and try to uncover why the LLM decided this way.
-----
Match for which we predict the outcome:
Anyone's Legend JD Gaming ,Result: 0, 2025-04-24
team1_players: ['Flandre', 'Tarzan', 'Shanks', 'Hope', 'Kael']
team2_players: ['Ale', 'Xun', 'Scout', 'Peyz', 'MISSING']
Team1 match history spans from 2025-04-10 to 2025-04-24
Team2 match history spans from 2025-04-14 to 2025-04-24

Last matches results (oldest -> newest):
team1: [1, 0, 0, 1, 1, 1, 1, 0]
team2: [0, 0, 1, 0, 0, 1, 0, 1]
-----

It's natural to assume that the most recent match has the most predictive signal and it could be a match, which is part of the BO series.
The last game is shared for both teams (as can be seen in Jupyter variable view)

The last event says...
{'label': 'match_winner', 'text': "They're on to the nexus and JD Gaming pick up game number one over AL in dominant fashion."}
By this event, we can see that LLM is catching onto the game one (this is game two) a