In [1]:
# Demonstrate impact of filter based on requiring a gap of >= 4 hours in raw data
import json
from tqdm import tqdm

# Root directory that holds every dataset used by this notebook.
DATASET_DIR = '/workspace/datasets'
# profiles-v2 input file, derived from DATASET_DIR so both paths stay in sync
# when the dataset root is moved.
INPUT_CSV = f'{DATASET_DIR}/profiles-v2/profiles-v2.csv'

# The raw data was kept in the profiles-v2 file, which stores one JSON object per line (despite the .csv name).
def LoadRawTimesV2(input_csv_path):
    """Load the raw time series of every profile from a JSON-lines file.

    Each line of the file is parsed as a standalone JSON document and the
    value stored under ``raw_data -> time`` is collected.

    Lines that cannot be used are reported through ``tqdm.write`` and skipped,
    so a single bad record does not abort the whole load. This covers
    malformed JSON as well as valid JSON that lacks ``raw_data``/``time`` or
    is not a JSON object. Whitespace-only lines are skipped without a message.

    Args:
        input_csv_path: Path of the file to read (opened as UTF-8 text).

    Returns:
        A list with one entry per usable line, in file order. Each entry is
        whatever value the record holds under ``raw_data -> time``.
    """
    data_list = []
    with open(input_csv_path, "r", encoding="utf-8") as file:
        for line in tqdm(file, desc="Loading JSON lines"):
            payload = line.strip()
            if not payload:
                # A blank line (e.g. a trailing newline) is not a record.
                continue
            try:
                data_list.append(json.loads(payload)['raw_data']['time'])
            except json.JSONDecodeError as e:
                tqdm.write(f"Skipping line due to error: {e}")
            except (KeyError, TypeError) as e:
                # KeyError: the object has no 'raw_data' or no 'time' key.
                # TypeError: the document (or 'raw_data') is not a JSON object.
                tqdm.write(f"Skipping line without raw_data/time: {e!r}")
    return data_list

def NumProfilesWithMinGap(times_list, min_gap_hours=4):
    """Count the profiles that contain at least one sufficiently large gap.

    A profile is counted when the difference between some pair of consecutive
    entries (later minus earlier, in stored order) is at least
    ``min_gap_hours``. Empty or otherwise falsy profiles are ignored.

    NOTE(review): differences are compared to ``min_gap_hours`` directly, so
    this presumably expects times already expressed in hours — confirm
    against the raw data.

    Args:
        times_list: Iterable of per-profile time sequences.
        min_gap_hours: Minimum consecutive difference that qualifies.

    Returns:
        The number of profiles with at least one qualifying gap.
    """
    kept = 0
    for profile_times in tqdm(times_list, desc="Gap search"):
        if profile_times:
            # Differences between every sample and its predecessor.
            deltas = [
                later - earlier
                for earlier, later in zip(profile_times, profile_times[1:])
            ]
            for delta in deltas:
                if delta >= min_gap_hours:
                    # One qualifying gap is enough; stop scanning this profile.
                    kept += 1
                    break
    return kept

In [2]:
# Load the raw time list of every profile in INPUT_CSV (one full pass over the file).
raw_times = LoadRawTimesV2(INPUT_CSV)

Loading JSON lines: 3891424it [04:33, 14218.68it/s]


In [3]:
# Count the profiles that have at least one gap of >= 4 between consecutive raw times.
num_profiles_kept = NumProfilesWithMinGap(raw_times, min_gap_hours=4)
print(f"Number of profiles kept: {num_profiles_kept}")

Gap search: 100%|██████████| 3891424/3891424 [00:19<00:00, 199356.99it/s]

Number of profiles kept: 168315



