### Sample 10% of videos from outputs folder

In [2]:
import os
import random
import math

base_dir = '/Users/eveyhuang/Documents/NICO/gemini_code/outputs'

# Collect all JSON files and map them to their video file names
video_file_map = {}
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith('.json') and not file.startswith('all_') and not file.startswith('verbal_'):
            video_name = file.replace('.json', '')
            full_path = os.path.join(root, file)
            if video_name not in video_file_map:
                video_file_map[video_name] = []
            video_file_map[video_name].append(full_path)

# Get all unique video names
all_video_names = list(video_file_map.keys())
print(f"Total unique video names found: {len(all_video_names)}")
n_sample = max(1, math.ceil(0.1 * len(all_video_names)))  # At least 1

print(f"Number of videos to sample: {n_sample}")
# Randomly sample 10% of video names
random.seed(42)  # For reproducibility
sampled_video_names = random.sample(all_video_names, n_sample)

# Get all file paths for the sampled videos
sampled_file_paths = []
for name in sampled_video_names:
    sampled_file_paths.extend(video_file_map[name])

# Print or save the sampled file paths
print("Sampled JSON files for verification:")
for path in sampled_file_paths:
    print(path)

# Optionally, save to a text file
with open('sampled_json_files.txt', 'w') as f:
    for path in sampled_file_paths:
        f.write(path + '\n')

Total unique video names found: 781
Number of videos to sample: 79
Sampled JSON files for verification:
/Users/eveyhuang/Documents/NICO/gemini_code/outputs/2021MND/output_2021_04_22_MND_S6/Breakout_Room_4_Part_2_2021_04_22_13_14_53/Breakout_Room_4_Part_2_2021_04_22_13_14_53_chunk6.json
/Users/eveyhuang/Documents/NICO/gemini_code/outputs/2021ABI/output_2021_05_21_ABI_S5/bot5p3_Room_5_Zoom_Meeting_5_21_2021_10_59_19_AM/bot5p3_Room_5_Zoom_Meeting_5_21_2021_10_59_19_AM_chunk3.json
/Users/eveyhuang/Documents/NICO/gemini_code/outputs/2021ABI/output_2021_05_20_ABI_S4/bot2p2_Zoom_Meeting_2021_05_20_12_37_07/bot2p2_Zoom_Meeting_2021_05_20_12_37_07_chunk3.json
/Users/eveyhuang/Documents/NICO/gemini_code/outputs/2021MZT/output_2021_10_01_MZT_S1/B1.2_Zoom_Meeting_Room_1_2021_10_01_11_07_45/B1.2_Zoom_Meeting_Room_1_2021_10_01_11_07_45.json
/Users/eveyhuang/Documents/NICO/gemini_code/outputs/2021SLU/output_2021_06_10_SLU_S5/botB2_2021_06_10_12_35_06/botB2_2021_06_10_12_35_06_chunk2.json
/Users/eveyh

In [10]:
import json
sampled_dict = {}

with open('sampled_json_files.txt', 'r') as f:
    file_paths = [line.strip() for line in f if line.strip()]

def extract_key_and_subkey(path):
    # Get the part after '/outputs/'
    rest = path.split('/outputs/')[1]
    parts = rest.split(os.sep)
    key = os.path.join(parts[0], parts[1])
    sub_key = os.path.join(*parts[2:])  # Join everything after the key
    return key, sub_key

for path in file_paths:
    # Extract key after 'output_'
    try:
        key, sub_key = extract_key_and_subkey(path)
        
    except Exception as e:
        print(f"Skipping {path}: {e}")
        continue

    # Load JSON and sample a value
    try:
        with open(path, 'r') as jf:
            data = json.load(jf)
        if isinstance(data, list) and data:
            sampled_value = random.choice(data)
        elif isinstance(data, dict) and data:
            filtered = [ann for ann in data["meeting_annotations"] if ann["speaking duration"] > 10]
            if filtered:
                sampled_value = random.choice(filtered)
            else:
                # Fallback to any annotation if none meet the criteria
                sampled_value = None
        else:
            sampled_value = data
    except Exception as e:
        print(f"Error reading {path}: {e}")
        continue

    if key not in sampled_dict:
        sampled_dict[key] = {}
    sampled_dict[key][sub_key] = sampled_value

# Save to a new JSON file
with open('sampled_verification.json', 'w') as out_f:
    json.dump(sampled_dict, out_f, indent=2)

Error reading /Users/eveyhuang/Documents/NICO/gemini_code/outputs/2021MZT/output_2021_10_01_MZT_S1/B1.1_Zoom_Meeting_Room_1_2021_10_01_11_04_18/B1.1_Zoom_Meeting_Room_1_2021_10_01_11_04_18.json: 'speaking duration'
Error reading /Users/eveyhuang/Documents/NICO/gemini_code/outputs/2020NES/output_2020_11_05_NES_S3/3_Sorbents_Zoom_Meeting_2020_11_05_12_20_44/3_Sorbents_Zoom_Meeting_2020_11_05_12_20_44_chunk3.json: 'speaking duration'
Error reading /Users/eveyhuang/Documents/NICO/gemini_code/outputs/2020NES/output_2020_11_05_NES_S3/3_Sorbents_Zoom_Meeting_2020_11_05_12_20_44/3_Sorbents_Zoom_Meeting_2020_11_05_12_20_44_chunk4.json: 'speaking duration'
Error reading /Users/eveyhuang/Documents/NICO/gemini_code/outputs/2020NES/output_2020_11_05_NES_S6/6_Theory_and_Expt_Zoom_Meeting_2020_11_05_10_28_39/6_Theory_and_Expt_Zoom_Meeting_2020_11_05_10_28_39_chunk1.json: 'speaking duration'
Error reading /Users/eveyhuang/Documents/NICO/gemini_code/outputs/2020NES/output_2020_11_06_NES_S7/1_beyond_co2