# Time Series Analysis Demo

This notebook demonstrates how to use the `TimeSeriesAnalyzer` class to load event data, run a query over it, and create train/validation/test splits.

In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import lark
from evaluator import EvaluateExpression
from time_series_analyzer import TimeSeriesAnalyzer

## Initialize and Review Data

In [2]:
# TimeSeriesAnalyzer is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.

# Input file and the column names handed to the analyzer.
# NOTE(review): presumably `time_col` is the event-timestamp column and
# `trajectory_col` the column that groups events into trajectories — confirm
# against the TimeSeriesAnalyzer constructor.
file_path = "events.arrow"
time_col = "time"
trajectory_col = "user_id"
analyzer = TimeSeriesAnalyzer(file_path, time_col, trajectory_col)



In [3]:

# Run a query against the full dataset; the returned object is shown as the cell output.
# NOTE(review): judging by the displayed result, this appears to flag, at each
# purchase event, whether the same id already had an earlier purchase
# (0 = first purchase, 1 = repeat) — confirm against the query-language docs.
analyzer.query("exists {purchase} before #now at every {purchase}")

100%|██████████| 1/1 [00:01<00:00,  1.92s/it]


<TimeSeries purchase: 37346 rows>
              id      time  purchase
0          59177    775598         0
1         230337    410149         0
2         307700   1029889         0
3         307700   1029996         1
4         307700   1031539         1
...          ...       ...       ...
37341  257772564  13597486         0
37342  257772564  13597555         1
37343  257772564  13597673         1
37344  257781820  13600052         0
37345  257784860  13601742         0

[37346 rows x 3 columns]

In [5]:
# Report the size of the full dataset. Each total is fetched once and reused
# below as the denominator for the per-split percentages, instead of being
# recomputed on every loop iteration.
print("Original dataset:")
total_events = analyzer.get_total_events()
print(f"Total events: {total_events:,}")
total_trajectories = analyzer.get_unique_trajectories()
print(f"Unique trajectories: {total_trajectories:,}")

# Create train/val/test splits; the result maps each split name to its own analyzer.
# NOTE(review): no seed is passed here — if `split` is randomized, confirm how
# reproducibility is controlled.
splits = analyzer.split(train=0.7, val=0.15, test=0.15)

# Report each split's size, both absolute and relative to the full dataset.
for name, split_analyzer in splits.items():
    print(f"\n{name.upper()} split:")
    split_events = split_analyzer.get_total_events()
    split_trajectories = split_analyzer.get_unique_trajectories()
    print(f"Total events: {split_events:,}")
    print(f"Unique trajectories: {split_trajectories:,}")
    print(f"% of original events: {split_events / total_events * 100:.2f}%")
    print(f"% of original trajectories: {split_trajectories / total_trajectories * 100:.2f}%")

# Run the same query on every split and report how many rows each one returns.
query = "exists {purchase} before #now at every {purchase}"
print("\nQuery results:")
for name, split_analyzer in splits.items():
    result = split_analyzer.query(query)
    # Fall back to 'N/A' when the query result does not support len().
    print(f"{name}: {len(result) if hasattr(result, '__len__') else 'N/A'} results")

Original dataset:
Total events: 2,206,675
Unique trajectories: 407,264

TRAIN split:
Total events: 1,550,857
Unique trajectories: 285,085
% of original events: 70.28%
% of original trajectories: 70.00%

VAL split:
Total events: 325,579
Unique trajectories: 61,090
% of original events: 14.75%
% of original trajectories: 15.00%

TEST split:
Total events: 330,239
Unique trajectories: 61,089
% of original events: 14.97%
% of original trajectories: 15.00%

Query results:


100%|██████████| 1/1 [00:00<00:00,  5.72it/s]


train: 26105 results


100%|██████████| 1/1 [00:00<00:00, 23.24it/s]


val: 5542 results


100%|██████████| 1/1 [00:00<00:00, 22.05it/s]

test: 5699 results



