In [56]:
import os
import pickle as pkl
from tqdm import tqdm

In [37]:
# Pickled ground-truth Ego4D training samples.
gt_ds_path = "/ptmp/dduka/databases/ego4d/ego4d_train.pkl"

# Despite the name, this list holds one *duration* per sample: sample[2] - sample[1].
# presumably sample[1] / sample[2] are the segment start / end times — TODO confirm
with open(gt_ds_path, 'rb') as f:
    original_timestamps = [sample[2] - sample[1] for sample in pkl.load(f)]

## (Additive Scaling) Visualization of the change in distribution for different additive scaling factors

In [None]:
additive_scaling_base_path = "/ptmp/dduka/databases/ego4d/rewritten_timestamps/symmetric_additive_offset"

# Every .pkl file in the directory, sorted so the load order is reproducible
# (os.listdir returns entries in arbitrary order).
pkl_files = sorted(f for f in os.listdir(additive_scaling_base_path) if f.endswith('.pkl'))

# Maps each additive scaling value (string parsed from the file name) to the
# list of segment durations (sample[2] - sample[1]) found in its file(s).
additive_scaling_all_data = {}
for pkl_file in tqdm(pkl_files):
    # Strip only the trailing ".pkl" before taking the last "_"-separated token,
    # so a decimal value such as "0.5" is kept whole. Splitting on the first "."
    # would map "..._0.5.pkl" and "..._0.1.pkl" to the same key "0".
    # NOTE(review): assumes names end in "_<value>.pkl" — confirm against the directory.
    additive_scaling_value = os.path.splitext(pkl_file)[0].split('_')[-1]

    # Files that share a value are accumulated into the same list.
    durations = additive_scaling_all_data.setdefault(additive_scaling_value, [])

    # pickle can execute arbitrary code while loading; only read trusted files.
    with open(os.path.join(additive_scaling_base_path, pkl_file), 'rb') as f:
        durations.extend(sample[2] - sample[1] for sample in pkl.load(f))

# Baseline: the unmodified ground-truth durations, stored under the string key "0.0".
additive_scaling_all_data["0.0"] = original_timestamps

## (Multiplicative Scaling) Visualization of the change in distribution for different scale sizes

## (Nouns and Verbs) Visualization of the change in distribution for different gap sizes

In [None]:
base_path = "/ptmp/dduka/databases/ego4d/rewritten_timestamps/nouns_and_verbs"

# Every .pkl file in the directory, sorted so the load order is reproducible
# (os.listdir returns entries in arbitrary order).
pkl_files = sorted(f for f in os.listdir(base_path) if f.endswith('.pkl'))

# Maps each gap value (float parsed from the file name) to the list of
# segment durations (sample[2] - sample[1]) stored in that file.
all_data = {}
for pkl_file in tqdm(pkl_files):
    # Drop only the trailing ".pkl" and parse the last "_"-separated token.
    # This handles "..._1.1.pkl" and also "..._2.pkl"; taking the first two
    # "."-separated pieces would turn the latter into float("2.pkl") and raise.
    # NOTE(review): assumes names end in "_<gap>.pkl" — confirm against the directory.
    gap_value = float(os.path.splitext(pkl_file)[0].split('_')[-1])

    # pickle can execute arbitrary code while loading; only read trusted files.
    with open(os.path.join(base_path, pkl_file), 'rb') as f:
        all_data[gap_value] = [sample[2] - sample[1] for sample in pkl.load(f)]

# Baseline: the unmodified ground-truth durations. The key is the *string* "0.0"
# (unlike the float keys above) because the plotting code looks it up that way.
all_data["0.0"] = original_timestamps

100%|██████████| 21/21 [05:47<00:00, 16.57s/it]


In [61]:
import numpy as np
import matplotlib.pyplot as plt

def plot_old_vs_new_histogram(
    old_durations,
    new_durations,
    *,
    gap_label=None,        # e.g., args.gap -> "2.1"
    bins=150,
    alpha=0.45,
    clip_high_pct=99.9,    # clip extreme outliers for a nicer x-range
    xmax=None,             # set (e.g., 1100) to match your fixed axis, or leave None
    fig_size=(10, 6),
    dpi=150
):
    """Overlay two segment-length histograms on a shared set of bins.

    Both histograms are drawn with raw counts on a log-scaled count axis and
    use identical bin edges so their bars line up.

    Parameters
    ----------
    old_durations, new_durations : array-like of numbers
        Segment lengths to compare; converted to float arrays. NaN and
        infinite entries are ignored.
    gap_label : optional
        When given, appended to the title as ``(gap=<gap_label>s)``.
    bins : int
        Number of equal-width bins.
    alpha : float
        Opacity of the bars.
    clip_high_pct : float
        Percentile of the pooled durations used as the upper end of the
        x-range; larger values fall outside the bins and are not drawn.
    xmax : float, optional
        Additional cap on the upper end of the x-range.
    fig_size, dpi
        Forwarded to ``plt.subplots`` as ``figsize`` and ``dpi``.

    Returns
    -------
    tuple
        ``(fig, ax)`` as returned by ``plt.subplots``.
    """
    old = np.asarray(old_durations, dtype=float)
    new = np.asarray(new_durations, dtype=float)

    # Only finite values can define the plotting range: a single inf (or an
    # all-NaN input) would otherwise make the bin edges non-finite.
    old = old[np.isfinite(old)]
    new = new[np.isfinite(new)]
    pooled = np.concatenate([old, new])

    if pooled.size:
        lo = max(0.0, float(pooled.min()))  # the x-range never starts below zero
        hi = float(np.percentile(pooled, clip_high_pct))
    else:
        lo, hi = 0.0, 1.0
    if xmax is not None:
        hi = min(hi, float(xmax))

    # Keep the range non-empty. When all durations are identical, or xmax is
    # not above the smallest duration, the edges would otherwise collapse or
    # decrease and the histogram / x-limits would be invalid.
    if not hi > lo:
        hi = lo + 1.0

    # Shared bin edges so bars align perfectly
    edges = np.linspace(lo, hi, bins + 1)

    fig, ax = plt.subplots(figsize=fig_size, dpi=dpi)
    ax.hist(old, bins=edges, density=False, log=True, alpha=alpha, label="Old")
    ax.hist(new, bins=edges, density=False, log=True, alpha=alpha, label="New")

    title = "Old vs New Segment Length Distribution"
    if gap_label is not None:
        title += f" (gap={gap_label}s)"
    ax.set_title(title)
    ax.set_xlabel("Segment length (seconds)")
    ax.set_ylabel("Count (log scale)")
    ax.set_xlim(lo, hi)
    ax.legend()
    ax.grid(True, which="both", axis="y", alpha=0.2)
    fig.tight_layout()
    return fig, ax

# Compare the baseline durations (string key "0.0") with the gap=1.1 rewrite.
selected_gap = 1.1
old_durations, new_durations = all_data["0.0"], all_data[selected_gap]

# xmax pins the x-axis to a fixed upper bound; pass None to auto-scale instead.
histogram_options = dict(gap_label=selected_gap, bins=150, alpha=0.45, xmax=1100)
fig, ax = plot_old_vs_new_histogram(old_durations, new_durations, **histogram_options)

# Written to the current working directory.
fig.savefig("old_vs_new_segment_length_distribution.png")