Before analyzing individual time series, let's explore the full dataset first. The dataset is a collection of `.parquet` files, each consisting of an array of game state data of shape `(frames, features)` extracted from raw `.replay` files.

In [1]:
from impulse import ReplayDataset, ReplayData

import matplotlib.pyplot as plt

In [2]:
from pathlib import Path

project_root = Path('/Users/david/dev/impulse')
data_dir = project_root / 'replays/parsed'
db_path = project_root / 'impulse.db'

replays = ReplayDataset(db_path=str(db_path), data_dir=str(data_dir))

Let's check how many parsed replays we have:

In [3]:
len(replays)

Found 192 parsed replays in database


192

What does a `replay` object (as a member of a `ReplayDataset`) look like? It's a `ReplayData` dataclass with three attributes: `replay_id`, `frames`, and `metadata`. Metadata fields can also be accessed directly as attributes (e.g., `replay.team_size`):

In [4]:
sample_list = replays.load_sample(1)        # load a random replay
sample = sample_list[0]
print(type(sample))
print(f"replay_id: {type(sample.replay_id)}")
print(f"frames: {type(sample.frames)}")
print(f"metadata: {type(sample.metadata)}")

Loaded 1 replays
<class 'impulse.replay_dataset.ReplayData'>
replay_id: <class 'str'>
frames: <class 'pandas.core.frame.DataFrame'>
metadata: <class 'dict'>


In [5]:
sample.replay_id

'0918290B4C954DCD04EB2E918E2B31F7'

In [6]:
sample.frames.head()

Unnamed: 0,frame,current time,frame time,seconds remaining,Ball - position x,Ball - position y,Ball - position z,Ball - linear velocity x,Ball - linear velocity y,Ball - linear velocity z,...,p7_angular velocity z,p7_quaternion x,p7_quaternion y,p7_quaternion z,p7_quaternion w,p7_boost level,p7_dodge active,p7_jump active,p7_double jump active,p7_player demolished by
0,0,15.782539,15.782658,300.0,-241.339996,-41.52,154.830002,-3890.01001,39.779999,934.619995,...,,,,,,,,,,
1,1,15.88254,15.882656,300.0,-565.049988,-38.220001,230.119995,-3880.110107,39.68,878.099976,...,,,,,,,,,,
2,2,15.98254,15.982655,300.0,-1016.869995,-33.599998,327.630005,-3866.25,39.540001,799.22998,...,,,,,,,,,,
3,3,16.082541,16.082653,300.0,-1402.859985,-29.639999,403.910004,-3854.47998,39.419998,731.849976,...,,,,,,,,,,
4,4,16.182541,16.182652,300.0,-1787.670044,-25.68,473.470001,-3842.719971,39.299999,664.669983,...,,,,,,,,,,


In [7]:
sample.metadata

{'replay_id': '0918290B4C954DCD04EB2E918E2B31F7',
 'frame_count': 3050,
 'feature_count': 161,
 'fps': 10.0,
 'parsed_at': '2026-01-31T00:10:00.014867+00:00',
 'source_file': '/Users/david/dev/impulse/replays/raw/rlcs/2024/World Championship/[1] Swiss Stage/Round 4/GGM1 vs QTPG/3dc4ade5-c401-4e78-bacf-5398810ee1a6.replay',
 'replay_name': 'WORLDS AC QTPG vs GGM1 G3 2024-09-13.13.20',
 'date': '2024-09-13 13-20-05',
 'map': 'Stadium_P',
 'match_type': 'Lan',
 'team_size': 3,
 'num_frames': 9731,
 'duration_seconds': 305.0,
 'team_0_score': None,
 'team_1_score': 2,
 'goals': [{'PlayerName': 'Chronic', 'PlayerTeam': 1, 'frame': 5963},
  {'PlayerName': 'ApparentlyJack', 'PlayerTeam': 1, 'frame': 6849}],
 'highlights': [{'BallName': 'Ball_TA_151',
   'CarName': 'Car_TA_945',
   'GoalActorName': 'None',
   'frame': 546},
  {'BallName': 'Ball_TA_151',
   'CarName': 'Car_TA_940',
   'GoalActorName': 'None',
   'frame': 1505},
  {'BallName': 'Ball_TA_151',
   'CarName': 'Car_TA_943',
   'GoalA

### Exploring a single replay dataframe

In [None]:
replay_id = sample.replay_id
df = sample.frames
metadata = sample.metadata

In [32]:
df.columns

Index(['frame', 'current time', 'frame time', 'seconds remaining',
       'Ball - position x', 'Ball - position y', 'Ball - position z',
       'Ball - linear velocity x', 'Ball - linear velocity y',
       'Ball - linear velocity z',
       ...
       'p7_angular velocity z', 'p7_quaternion x', 'p7_quaternion y',
       'p7_quaternion z', 'p7_quaternion w', 'p7_boost level',
       'p7_dodge active', 'p7_jump active', 'p7_double jump active',
       'p7_player demolished by'],
      dtype='object', length=161)

In [33]:
df.shape

(3086, 161)

As seen above, the shape of the dataframe is `(frame_count, feature_count)`:

In [34]:
frame_count = metadata['frame_count']
feature_count = metadata['feature_count']
print(f"Frame count: {frame_count},\nFeature count: {feature_count}")

Frame count: 3086,
Feature count: 161


In [35]:
df.describe()

Unnamed: 0,frame,current time,frame time,seconds remaining,Ball - position x,Ball - position y,Ball - position z,Ball - linear velocity x,Ball - linear velocity y,Ball - linear velocity z,...,p7_angular velocity z,p7_quaternion x,p7_quaternion y,p7_quaternion z,p7_quaternion w,p7_boost level,p7_dodge active,p7_jump active,p7_double jump active,p7_player demolished by
count,3086.0,3086.0,3086.0,3086.0,3086.0,3086.0,3086.0,3086.0,3086.0,3086.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,1542.5,179.573863,179.590709,147.939728,-347.505469,-92.035618,801.741202,-5.065907,-17.259164,0.538633,...,,,,,,,,,,
std,890.995791,94.173006,94.172926,88.354683,2567.896973,2959.139016,540.168875,1128.005462,1332.724275,577.391917,...,,,,,,,,,,
min,0.0,13.27889,13.310669,0.0,-4002.540039,-5036.240234,82.879997,-3178.47998,-3022.060059,-1756.800049,...,,,,,,,,,,
25%,771.25,106.202833,106.220886,71.0,-2693.150024,-2795.889893,308.479996,-730.990005,-1012.650009,-378.515007,...,,,,,,,,,,
50%,1542.5,183.330872,183.360016,148.0,-521.174988,-168.764999,734.829987,-136.400002,-52.789999,-5.555,...,,,,,,,,,,
75%,2313.75,260.460564,260.464027,225.0,2062.710022,2472.510071,1222.887512,626.050003,1055.799988,372.470001,...,,,,,,,,,,
max,3085.0,337.590271,337.598877,300.0,4003.129883,5078.609863,1951.670044,3270.320068,3507.02002,2137.860107,...,,,,,,,,,,


The dataframe is padded with additional columns to accomodate features for up to 8 total players (two teams of four players). However, this replay is from a 3v3 RLCS match: 

In [36]:
team_size = metadata['team_size']
team_size

3

Therefore we can remove the extra columns. Let's drop the columns that consist exclusively of NaN values:

In [37]:
df.dropna(axis=1, how='all', inplace=True) 
df.shape

(3086, 125)

In [38]:
df.isnull().sum()

frame                      0
current time               0
frame time                 0
seconds remaining          0
Ball - position x          0
                          ..
p5_boost level             0
p5_dodge active            0
p5_jump active             0
p5_double jump active      0
p5_player demolished by    0
Length: 125, dtype: int64

Now there are no NaN values in the dataframe. 

## Looking at dataset-level statistics

### Replay length

In [None]:
frame_counts = [replay.frame_count for replay in replays]
duration_seconds = [replay.duration_seconds for replay in replays]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

ax1.hist(frame_counts, bins=60)
ax1.set_title('Distribution of Frame Counts in Replays')
ax1.set_xlabel('Frame Count')
ax1.set_ylabel('Number of Replays')

ax2.hist(duration_seconds, bins=60)
ax2.set_title('Distribution of Replay Durations')
ax2.set_xlabel('Duration (seconds)')
ax2.set_ylabel('Number of Replays')
plt.tight_layout()
plt.show()

Since replay files are sampled at 10 FPS by the parser, it makes sense that these plots look identical and that the scale of their horizontal axes differs by a factor of 10. 

This data comes from the RLCS 2024 World Championship, so each game should be a 3v3 game. We can verify that all replays have a `team_size` of 3:

In [None]:
team_sizes = [replay.team_size for replay in replays]
team_size_set = set(team_sizes)
team_size_set