In [5]:
import json
import sys
from typing import Dict, List, Tuple

def load_snapshots(path: str) -> Dict[int, List[dict]]:
    """Read a JSON snapshot file and index its entries by ``_id``.

    The file is expected to contain a list of objects shaped like
    ``{"_id": ..., "users": [...]}``.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A mapping from each entry's ``_id`` to its ``users`` value. An entry
        without a ``users`` key maps to an empty list. When the same ``_id``
        occurs more than once, the last occurrence wins.

    Raises:
        KeyError: If an entry has no ``_id`` key.
    """
    with open(path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)
    # Later duplicates overwrite earlier ones, exactly like repeated assignment.
    return {entry["_id"]: entry.get("users", []) for entry in entries}

def normalize_users(users: List[dict]) -> List[Tuple[str, int, int]]:
    """Turn a list of user dicts into a sorted list of comparable tuples.

    Each user becomes ``(userId, score, lastUpdateTime)``: ``userId`` is passed
    through ``str()`` (so a missing id becomes ``"None"``), while ``score`` and
    ``lastUpdateTime`` are passed through ``int()`` and default to ``0`` when
    the key is absent.

    Args:
        users: User records; the input list is not modified.

    Returns:
        A new list of tuples in ascending order, so two user lists that differ
        only in element order normalize to equal results.
    """
    return sorted(
        (
            str(user.get("userId")),
            int(user.get("score", 0)),
            int(user.get("lastUpdateTime", 0)),
        )
        for user in users
    )

def main(file1: str, file2: str):
    """Compare two snapshot files and print how closely file 2 matches file 1.

    Both files are loaded with ``load_snapshots``. For every ``_id`` in file 1
    the function checks whether file 2 has the same ``_id`` and, if so,
    whether the two ``users`` lists are equal after ``normalize_users``
    (i.e. ignoring element order). The comparison is one-directional: ids that
    exist only in file 2 are not reported.

    Args:
        file1: Path of the reference snapshot file.
        file2: Path of the snapshot file compared against the reference.

    Returns:
        None. The summary is written to standard output.
    """
    reference = load_snapshots(file1)
    candidate = load_snapshots(file2)

    total_ids = len(reference)
    if not total_ids:
        # Nothing to compare; also avoids dividing by zero below.
        print("File 1 không có _id nào. Tỉ lệ trùng: 0.00%")
        return

    # Ids of file 1 that file 2 does not contain, in file-1 order.
    missing_in_file2 = [key for key in reference if key not in candidate]

    # Ids present in both files whose normalized user lists differ.
    different_users = [
        key
        for key, ref_users in reference.items()
        if key in candidate
        and normalize_users(ref_users) != normalize_users(candidate[key])
    ]

    matched_ids = total_ids - len(missing_in_file2)
    matched_equal = matched_ids - len(different_users)

    percent_id_overlap = matched_ids / total_ids * 100.0
    percent_equal = matched_equal / total_ids * 100.0

    print(f"Tổng _id trong file 1: {total_ids}")
    print(f"_id có mặt trong file 2: {matched_ids} ({percent_id_overlap:.2f}%)")
    print(f"_id mà users giống hệt: {matched_equal}/{total_ids} ({percent_equal:.2f}%)")

    # Show at most the first ten offending ids of each category.
    if missing_in_file2:
        print(f"_id có trong file 1 nhưng thiếu ở file 2 (ví dụ 10 đầu): {missing_in_file2[:10]}")
    if different_users:
        print(f"_id có ở cả 2 nhưng users khác nhau (ví dụ 10 đầu): {different_users[:10]}")

# To run directly in the notebook, set the file paths here:
file1 = "leaderboard_snapshots/raw_snapshots.json"
file2 = "leaderboard_snapshots/top_level_gainers_snapshots_v2.json"  # Replace with the file you want to compare

main(file1, file2)

Tổng _id trong file 1: 238
_id có mặt trong file 2: 238 (100.00%)
_id mà users giống hệt: 238/238 (100.00%)


{
    _id: 1759451393430,
    users: [
        {
            score: 10003,
            userId: 'whale_3',
            lastUpdateTime: 1759451273549
        },
        {
            score: 10002,
            userId: 'whale_2',
            lastUpdateTime: 1759450973518
        },
        {
            score: 36,
            userId: 'user_163',
            lastUpdateTime: 1759451367059
        },
        {
            score: 34,
            userId: 'user_35',
            lastUpdateTime: 1759451355358
        },
        {
            score: 33,
            userId: 'user_64',
            lastUpdateTime: 1759451375159
        },
        {
            score: 33,
            userId: 'user_81',
            lastUpdateTime: 1759451389561
        },
        {
            score: 32,
            userId: 'user_106',
            lastUpdateTime: 1759451362858
        },
        {
            score: 32,
            userId: 'user_65',
            lastUpdateTime: 1759451385060
        },
        {
            score: 30,
            userId: 'user_154',
            lastUpdateTime: 1759451332456
        },
        {
            score: 29,
            userId: 'user_24',
            lastUpdateTime: 1759451371459
        }
    ]
}

 {
    "_id": 1759451393430,
    "users": [
      {
        "userId": "whale_3",
        "score": 10003,
        "lastUpdateTime": 1759451273549
      },
      {
        "userId": "user_163",
        "score": 36,
        "lastUpdateTime": 1759451367059
      },
      {
        "userId": "user_35",
        "score": 34,
        "lastUpdateTime": 1759451355358
      },
      {
        "userId": "user_64",
        "score": 33,
        "lastUpdateTime": 1759451375159
      },
      {
        "userId": "user_81",
        "score": 33,
        "lastUpdateTime": 1759451389561
      },
      {
        "userId": "user_106",
        "score": 32,
        "lastUpdateTime": 1759451362858
      },
      {
        "userId": "user_65",
        "score": 32,
        "lastUpdateTime": 1759451385060
      },
      {
        "userId": "user_154",
        "score": 30,
        "lastUpdateTime": 1759451332456
      },
      {
        "userId": "user_24",
        "score": 29,
        "lastUpdateTime": 1759451371459
      },
      {
        "userId": "user_149",
        "score": 29,
        "lastUpdateTime": 1759451386360
      }
    ]
  },