In [1]:
import os
import json
import pandas as pd
from datetime import datetime

In [3]:
data_dir = '../data'
jsonl_files = [f for f in os.listdir(data_dir) if f.endswith('.jsonl')]
print(f"Found {len(jsonl_files)} .jsonl files in {data_dir}")

Found 1447 .jsonl files in ../data


In [5]:
FAST_MODE = True
SAMPLE_SIZE = 50
files_to_load = jsonl_files[:SAMPLE_SIZE] if FAST_MODE else jsonl_files

In [6]:
all_records = []
for file in files_to_load:
    path = os.path.join(data_dir, file)
    with open(path, 'r') as f:
        for line in f:
            try:
                all_records.append(json.loads(line))
            except json.JSONDecodeError:
                continue  

df = pd.DataFrame(all_records)
print(f"Loaded {len(df)} events from {len(files_to_load)} files.")

Loaded 100730 events from 50 files.


In [7]:
df['eventTimestamp'] = pd.to_datetime(df['eventTimestamp'], unit='ms', errors='coerce')

In [8]:
print("\nEvent types:")
print(df['eventName'].value_counts())

print("\nEvent sources:")
print(df['eventSource'].value_counts())


Event types:
eventName
text-insert          85824
text-delete           6544
suggestion-hover      3637
cursor-forward        1030
cursor-backward        724
suggestion-get         656
suggestion-close       644
suggestion-open        638
suggestion-select      482
cursor-select          298
suggestion-down        140
suggestion-up           57
system-initialize       50
suggestion-reopen        6
Name: count, dtype: int64

Event sources:
eventSource
user    99036
api      1694
Name: count, dtype: int64


In [9]:
if 'currentSuggestions' in df.columns:
    df['numSuggestions'] = df['currentSuggestions'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    print("\nSuggestion counts:")
    print(df['numSuggestions'].describe())


Suggestion counts:
count    100730.000000
mean          0.026626
std           0.344789
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           5.000000
Name: numSuggestions, dtype: float64


In [10]:
print("\nEvents per minute:")
print(df.groupby(df['eventTimestamp'].dt.floor('min')).size().head())


Events per minute:
eventTimestamp
2021-08-16 07:00:00      7
2021-08-16 07:01:00    134
2021-08-16 07:02:00    293
2021-08-16 07:03:00    158
2021-08-16 07:04:00    122
dtype: int64


In [11]:
print("\nSample text insertions:")
if 'textDelta' in df.columns:
    print(df[df['eventName'] == 'text-insert']['textDelta'].head())


Sample text insertions:
1    {'ops': [{'retain': 244}, {'insert': ' '}]}
2    {'ops': [{'retain': 245}, {'insert': ' '}]}
3    {'ops': [{'retain': 246}, {'insert': ' '}]}
7    {'ops': [{'retain': 244}, {'insert': ' '}]}
8    {'ops': [{'retain': 245}, {'insert': 'R'}]}
Name: textDelta, dtype: object
