# PySpark Leaderboard Snapshot

Notebook này chạy PySpark ở local mode để gen snapshot leaderboard data từ parquet file.

## Pipeline:
1. Đọc data từ Parquet file
2. Transform thành Score objects với event time
3. Tính tổng điểm trong sliding window (1 phút gần nhất)
4. Clean up các user có submisstion time < cleanuptime-5 phút, clean up lần đầu tiên vào phút thứ 10 và trigger mỗi 5 phút
5. Snapshot TopN mỗi 7 phút theo event time


In [1]:
# Import required libraries
import os
import sys
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from collections import defaultdict, deque
import math

# PySpark imports
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window


print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
# Initialize Spark Session for local mode
spark = SparkSession.builder \
    .appName("LeaderBoardAnalysis") \
    .master("local[*]") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.sql.adaptive.skewJoin.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

print(f"Spark version: {spark.version}")
print(f"Spark UI: http://localhost:4041")
print("Spark session created successfully!")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/03 02:20:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version: 3.5.6
Spark UI: http://localhost:4041
Spark session created successfully!


In [3]:
# Helper functions
def parse_timestamp(timestamp_str: str) -> int:
    """Parse timestamp string to milliseconds"""
    try:
        # Try parsing ISO format
        dt = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
        return int(dt.timestamp() * 1000)
    except:
        # Fallback to current time
        return int(datetime.now().timestamp() * 1000)

def format_timestamp(timestamp: int) -> str:
    """Format timestamp for display"""
    dt = datetime.fromtimestamp(timestamp / 1000, tz=timezone.utc)
    return dt.isoformat()

print("Helper functions defined successfully!")


Helper functions defined successfully!


In [4]:
# Step 1: Read data from Parquet file
input_path = "fixed-dataset.parquet"

if os.path.exists(input_path):
    print(f"Reading data from: {input_path}")
    user_data = spark.read.parquet(input_path)
    print(f"Data loaded successfully! Rows: {user_data.count()}")
    user_data.show(10, False)
    user_data.printSchema()
else:
    print(f"File not found: {input_path}")
    print("Available files in app-python directory:")
    if os.path.exists("app-python"):
        for file in os.listdir("app-python"):
            print(f"  - {file}")


Reading data from: fixed-dataset.parquet
Data loaded successfully! Rows: 1000000
+--------+-----+------------+-------+------+---+------------------------------------+--------------------------------+---------------+-------------------------------------------------------------------+---------+--------------------------------+--------------------------------+-----+-------------+--------------------------------+----+
|uid     |email|authProvider|appId  |avatar|geo|role                                |lastLoginAt                     |name           |devices                                                            |resources|created_at                      |updated_at                      |level|previousLevel|updatedAt                       |team|
+--------+-----+------------+-------+------+---+------------------------------------+--------------------------------+---------------+-------------------------------------------------------------------+---------+--------------------------------+

In [5]:
# In ra 100 hàng đầu filter theo userId = 1
user_data.filter(col("uid") == "user_1").select("uid", "level").show(20, truncate=False)

+------+-----+
|uid   |level|
+------+-----+
|user_1|4    |
|user_1|8    |
|user_1|11   |
|user_1|12   |
|user_1|13   |
|user_1|17   |
|user_1|24   |
|user_1|25   |
|user_1|26   |
|user_1|32   |
|user_1|33   |
|user_1|43   |
|user_1|47   |
|user_1|53   |
|user_1|56   |
|user_1|58   |
|user_1|61   |
|user_1|67   |
|user_1|74   |
|user_1|80   |
|user_1|89   |
|user_1|90   |
|user_1|94   |
|user_1|102  |
|user_1|110  |
|user_1|112  |
|user_1|122  |
|user_1|124  |
|user_1|129  |
|user_1|130  |
|user_1|132  |
|user_1|135  |
|user_1|136  |
|user_1|141  |
|user_1|151  |
|user_1|152  |
|user_1|157  |
|user_1|164  |
|user_1|165  |
|user_1|172  |
|user_1|177  |
|user_1|179  |
|user_1|184  |
|user_1|188  |
|user_1|195  |
|user_1|202  |
|user_1|206  |
|user_1|207  |
|user_1|216  |
|user_1|222  |
|user_1|224  |
|user_1|225  |
|user_1|231  |
|user_1|235  |
|user_1|241  |
|user_1|249  |
|user_1|252  |
|user_1|255  |
|user_1|257  |
|user_1|267  |
|user_1|276  |
|user_1|277  |
|user_1|279  |
|user_1|28

In [6]:
user_scores = user_data.select(
    col("uid").alias("id"),
    when(col("level") - col("previousLevel") > 0, 
           col("level") - col("previousLevel"))
     .otherwise(0).alias("score"),
    ((to_timestamp(col("updatedAt")).cast("timestamp").cast("double") * 1000).cast("long").alias("lastUpdateTime"))
  
)
print(user_scores.show(n=20, truncate=False))

+--------+-----+--------------+
|id      |score|lastUpdateTime|
+--------+-----+--------------+
|user_63 |6    |1759450133430 |
|user_182|1    |1759450133530 |
|user_84 |5    |1759450133630 |
|user_146|1    |1759450133730 |
|user_161|1    |1759450133830 |
|user_82 |3    |1759450133930 |
|user_15 |6    |1759450134030 |
|user_167|4    |1759450134130 |
|user_118|1    |1759450134230 |
|user_37 |3    |1759450134330 |
|user_71 |4    |1759450134430 |
|user_154|4    |1759450134530 |
|user_164|4    |1759450134630 |
|user_71 |4    |1759450134730 |
|user_104|9    |1759450134830 |
|user_48 |1    |1759450134930 |
|user_112|1    |1759450135030 |
|user_46 |3    |1759450135130 |
|user_126|2    |1759450135230 |
|user_49 |4    |1759450135330 |
+--------+-----+--------------+
only showing top 20 rows

None


In [7]:
# Step 3: Calculate total scores in sliding window 
from pyspark.sql.window import Window
from pyspark.sql.functions import sum as spark_sum, lag, col, when

window_size_minutes = 1
window_size_ms = window_size_minutes * 60 * 1000

print(f"Calculating sliding window scores for {window_size_minutes} minute window...")

# Create window specification for sliding window calculation
window_spec = Window.partitionBy("id").orderBy("lastUpdateTime")

# Calculate total score in sliding window using rangeBetween
total_scores = user_scores.withColumn(
    "totalScore",
    spark_sum("score").over(
        window_spec.rangeBetween(-window_size_ms, 0)
    )
).withColumn(
    "previousTotalScore",
    lag("totalScore", 1, 0.0).over(window_spec)
).select(
    col("id").alias("userId"),
    col("totalScore"),
    col("previousTotalScore"),
    col("lastUpdateTime")
)

print("done!")


Calculating sliding window scores for 1 minute window...
done!


In [8]:
print(f"Total scores calculated for {window_size_minutes} minute window!")
total_scores.filter(col("userId") == "user_1").show(20, False)
print(f"Total records: {total_scores.count()}")

Total scores calculated for 1 minute window!
+------+----------+------------------+--------------+
|userId|totalScore|previousTotalScore|lastUpdateTime|
+------+----------+------------------+--------------+
|user_1|4         |0                 |1759450144031 |
|user_1|8         |4                 |1759450153732 |
|user_1|11        |8                 |1759450175935 |
|user_1|12        |11                |1759450189636 |
|user_1|13        |12                |1759450193936 |
|user_1|17        |13                |1759450196636 |
|user_1|24        |17                |1759450200437 |
|user_1|25        |24                |1759450201137 |
|user_1|22        |25                |1759450206738 |
|user_1|24        |22                |1759450221339 |
|user_1|16        |24                |1759450259743 |
|user_1|11        |16                |1759450288445 |
|user_1|15        |11                |1759450305347 |
|user_1|20        |15                |1759450325848 |
|user_1|23        |20                

In [9]:
def calculate_leaderboard_at_snapshot(total_scores: DataFrame, snapshot_time: int, 
                                    top_n: int, cutoff_time: int) -> List[Dict]:

    
   
    
    print(f"  Snapshot at {format_timestamp(snapshot_time)}: using data from {format_timestamp(cutoff_time)} to {format_timestamp(snapshot_time)}")
    

    valid_scores = total_scores.filter(
        (col('lastUpdateTime') <= snapshot_time) & 
        (col('lastUpdateTime') > cutoff_time)
    ).collect()
    
    print(f"    Found {len(valid_scores)} valid scores in cleanup interval")
    
    # Group by user và lấy score mới nhất cho mỗi user
    user_latest_scores = {}
    for score in valid_scores:
        user_id = score['userId']
        if user_id not in user_latest_scores or score['lastUpdateTime'] > user_latest_scores[user_id]['lastUpdateTime']:
            user_latest_scores[user_id] = score
    
    print(f"    Found {len(user_latest_scores)} unique users")
    
    # Sort by total score và lấy top N (tie-breaker: lastUpdateTime desc, userId asc)
    sorted_users = sorted(
        user_latest_scores.values(),
        key=lambda x: (-float(x['totalScore']), int(x['lastUpdateTime']))
    )
    
    # Tạo leaderboard cho snapshot này
    snapshot_entries = []
    for i, user_score in enumerate(sorted_users[:top_n]):
        snapshot_entries.append({
            'userId': user_score['userId'],
            'score': user_score['totalScore'],  # Đổi từ totalScore thành score
            'lastUpdateTime': user_score['lastUpdateTime']
        })
    
    print(f"    Generated {len(snapshot_entries)} leaderboard entries (Top-{top_n})")
    return snapshot_entries

def generate_snapshots(total_scores: DataFrame, top_n: int, 
                                            cleanup_interval_minutes: int = 5,
                                            snapshot_interval_minutes: int = 7) -> List[Dict]:
    """Generate snapshots với logic retractable TopN đúng"""
    
    # Lấy tất cả timestamps
    all_timestamps = [row['lastUpdateTime'] for row in total_scores.select('lastUpdateTime').distinct().collect()]
    all_timestamps.sort()
    
    if not all_timestamps:
        return []
    
    first_timestamp = all_timestamps[0]
    last_timestamp = all_timestamps[-1]
    
    # Convert intervals to milliseconds
    snapshot_interval_ms = snapshot_interval_minutes * 60 * 1000
    cleanup_interval_ms = cleanup_interval_minutes * 60 * 1000
    
    print(f"Data range: {format_timestamp(first_timestamp)} to {format_timestamp(last_timestamp)}")
    print(f"Cleanup interval: {cleanup_interval_minutes} minutes")
    print(f"Snapshot interval: {snapshot_interval_minutes} minutes")
    
    # Generate snapshot times
    snapshot_times = []
    current_snapshot_time = first_timestamp + snapshot_interval_ms
    
    while current_snapshot_time <= last_timestamp:
        snapshot_times.append(current_snapshot_time)
        current_snapshot_time += snapshot_interval_ms
    
    # Generate cleanup times để debug
    cleanup_times = []
    # Cleanup đầu tiên = first_timestamp + cleanup_interval_ms * 2 (như trong Flink code)
    first_cleanup_time = first_timestamp + cleanup_interval_ms * 2
    current_cleanup_time = first_cleanup_time
    
    while current_cleanup_time <= last_timestamp:
        cleanup_times.append(current_cleanup_time)
        current_cleanup_time += cleanup_interval_ms
    
    print(f"\nCleanup times (for reference):")
    for i, ts in enumerate(cleanup_times[:10]):  # Show first 10
        print(f"  Cleanup {i+1}: {format_timestamp(ts)}")
    if len(cleanup_times) > 10:
        print(f"  ... and {len(cleanup_times) - 10} more")

    
    # Tính leaderboard tại các snapshot times
    all_snapshots = []
    # Sử dụng bisect để tìm cleanup_time lớn nhất mà < snapshot_time một cách hiệu quả
    import bisect

    cleanup_times_sorted = sorted(cleanup_times)
    for i, snapshot_time in enumerate(snapshot_times):
        # Tìm vị trí chèn snapshot_time vào cleanup_times_sorted
        idx = bisect.bisect_left(cleanup_times_sorted, snapshot_time)
        cutoff_time = (
            cleanup_times_sorted[0] - cleanup_interval_ms if idx == 1
            else cleanup_times_sorted[idx-2] if idx > 1
            else first_timestamp
        )

        snapshot_entries = calculate_leaderboard_at_snapshot(
            total_scores, snapshot_time, top_n, cutoff_time
        )
        
        # Format giống SnapshotTopNSink: Tuple2<timestamp, List<Score>>
        snapshot_data = {
            '_id': snapshot_time,
            'users': snapshot_entries
        }
        all_snapshots.append(snapshot_data)
    
    return all_snapshots

print("Retractable TopN snapshot functions defined successfully!")

Retractable TopN snapshot functions defined successfully!


In [None]:
# Step 5: Generate leaderboard snapshots
top_n = 10
cleanup_interval_minutes = 5
snapshot_interval_minutes = 7

print(f"Generating leaderboard snapshots with:")
print(f"  Top N: {top_n}")
print(f"  TTL: {cleanup_interval_minutes} minutes")
print(f"  Snapshot interval: {snapshot_interval_minutes} minutes")

snapshots = generate_snapshots(
    total_scores, 
    top_n, 
    cleanup_interval_minutes,
    snapshot_interval_minutes
)

print(f"\nGenerated {len(snapshots)} leaderboard entries across snapshots.")


Generating leaderboard snapshots with:
  Top N: 10
  TTL: 5 minutes
  Snapshot interval: 7 minutes


                                                                                

Data range: 2025-10-03T00:08:53.430000+00:00 to 2025-10-04T03:55:43.803000+00:00
Cleanup interval: 5 minutes
Snapshot interval: 7 minutes

Cleanup times (for reference):
  Cleanup 1: 2025-10-03T00:18:53.430000+00:00
  Cleanup 2: 2025-10-03T00:23:53.430000+00:00
  Cleanup 3: 2025-10-03T00:28:53.430000+00:00
  Cleanup 4: 2025-10-03T00:33:53.430000+00:00
  Cleanup 5: 2025-10-03T00:38:53.430000+00:00
  Cleanup 6: 2025-10-03T00:43:53.430000+00:00
  Cleanup 7: 2025-10-03T00:48:53.430000+00:00
  Cleanup 8: 2025-10-03T00:53:53.430000+00:00
  Cleanup 9: 2025-10-03T00:58:53.430000+00:00
  Cleanup 10: 2025-10-03T01:03:53.430000+00:00
  ... and 322 more
  Snapshot at 2025-10-03T00:15:53.430000+00:00: using data from 2025-10-03T00:08:53.430000+00:00 to 2025-10-03T00:15:53.430000+00:00


                                                                                

    Found 4199 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T00:22:53.430000+00:00: using data from 2025-10-03T00:13:53.430000+00:00 to 2025-10-03T00:22:53.430000+00:00


                                                                                

    Found 5400 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T00:29:53.430000+00:00: using data from 2025-10-03T00:23:53.430000+00:00 to 2025-10-03T00:29:53.430000+00:00


                                                                                

    Found 3599 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T00:36:53.430000+00:00: using data from 2025-10-03T00:28:53.430000+00:00 to 2025-10-03T00:36:53.430000+00:00


                                                                                

    Found 4800 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T00:43:53.430000+00:00: using data from 2025-10-03T00:33:53.430000+00:00 to 2025-10-03T00:43:53.430000+00:00


                                                                                

    Found 5999 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T00:50:53.430000+00:00: using data from 2025-10-03T00:43:53.430000+00:00 to 2025-10-03T00:50:53.430000+00:00


                                                                                

    Found 4200 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T00:57:53.430000+00:00: using data from 2025-10-03T00:48:53.430000+00:00 to 2025-10-03T00:57:53.430000+00:00


                                                                                

    Found 5400 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T01:04:53.430000+00:00: using data from 2025-10-03T00:58:53.430000+00:00 to 2025-10-03T01:04:53.430000+00:00


                                                                                

    Found 3599 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T01:11:53.430000+00:00: using data from 2025-10-03T01:03:53.430000+00:00 to 2025-10-03T01:11:53.430000+00:00


                                                                                

    Found 4800 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T01:18:53.430000+00:00: using data from 2025-10-03T01:08:53.430000+00:00 to 2025-10-03T01:18:53.430000+00:00


                                                                                

    Found 5999 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T01:25:53.430000+00:00: using data from 2025-10-03T01:18:53.430000+00:00 to 2025-10-03T01:25:53.430000+00:00


                                                                                

    Found 4200 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T01:32:53.430000+00:00: using data from 2025-10-03T01:23:53.430000+00:00 to 2025-10-03T01:32:53.430000+00:00


                                                                                

    Found 5400 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T01:39:53.430000+00:00: using data from 2025-10-03T01:33:53.430000+00:00 to 2025-10-03T01:39:53.430000+00:00


                                                                                

    Found 3599 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T01:46:53.430000+00:00: using data from 2025-10-03T01:38:53.430000+00:00 to 2025-10-03T01:46:53.430000+00:00


                                                                                

    Found 4800 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T01:53:53.430000+00:00: using data from 2025-10-03T01:43:53.430000+00:00 to 2025-10-03T01:53:53.430000+00:00


                                                                                

    Found 5999 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T02:00:53.430000+00:00: using data from 2025-10-03T01:53:53.430000+00:00 to 2025-10-03T02:00:53.430000+00:00


                                                                                

    Found 4200 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T02:07:53.430000+00:00: using data from 2025-10-03T01:58:53.430000+00:00 to 2025-10-03T02:07:53.430000+00:00


                                                                                

    Found 5399 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T02:14:53.430000+00:00: using data from 2025-10-03T02:08:53.430000+00:00 to 2025-10-03T02:14:53.430000+00:00


                                                                                

    Found 3600 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T02:21:53.430000+00:00: using data from 2025-10-03T02:13:53.430000+00:00 to 2025-10-03T02:21:53.430000+00:00


                                                                                

    Found 4800 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T02:28:53.430000+00:00: using data from 2025-10-03T02:18:53.430000+00:00 to 2025-10-03T02:28:53.430000+00:00


                                                                                

    Found 5999 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T02:35:53.430000+00:00: using data from 2025-10-03T02:28:53.430000+00:00 to 2025-10-03T02:35:53.430000+00:00


                                                                                

    Found 4200 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T02:42:53.430000+00:00: using data from 2025-10-03T02:33:53.430000+00:00 to 2025-10-03T02:42:53.430000+00:00


                                                                                

    Found 5400 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T02:49:53.430000+00:00: using data from 2025-10-03T02:43:53.430000+00:00 to 2025-10-03T02:49:53.430000+00:00


                                                                                

    Found 3600 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T02:56:53.430000+00:00: using data from 2025-10-03T02:48:53.430000+00:00 to 2025-10-03T02:56:53.430000+00:00


                                                                                

    Found 4799 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T03:03:53.430000+00:00: using data from 2025-10-03T02:53:53.430000+00:00 to 2025-10-03T03:03:53.430000+00:00


                                                                                

    Found 5999 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T03:10:53.430000+00:00: using data from 2025-10-03T03:03:53.430000+00:00 to 2025-10-03T03:10:53.430000+00:00


                                                                                

    Found 4200 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T03:17:53.430000+00:00: using data from 2025-10-03T03:08:53.430000+00:00 to 2025-10-03T03:17:53.430000+00:00


                                                                                

    Found 5399 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T03:24:53.430000+00:00: using data from 2025-10-03T03:18:53.430000+00:00 to 2025-10-03T03:24:53.430000+00:00


                                                                                

    Found 3600 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T03:31:53.430000+00:00: using data from 2025-10-03T03:23:53.430000+00:00 to 2025-10-03T03:31:53.430000+00:00


                                                                                

    Found 4800 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T03:38:53.430000+00:00: using data from 2025-10-03T03:28:53.430000+00:00 to 2025-10-03T03:38:53.430000+00:00


                                                                                

    Found 5999 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T03:45:53.430000+00:00: using data from 2025-10-03T03:38:53.430000+00:00 to 2025-10-03T03:45:53.430000+00:00


                                                                                

    Found 4200 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T03:52:53.430000+00:00: using data from 2025-10-03T03:43:53.430000+00:00 to 2025-10-03T03:52:53.430000+00:00


                                                                                

    Found 5399 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T03:59:53.430000+00:00: using data from 2025-10-03T03:53:53.430000+00:00 to 2025-10-03T03:59:53.430000+00:00


                                                                                

    Found 3600 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T04:06:53.430000+00:00: using data from 2025-10-03T03:58:53.430000+00:00 to 2025-10-03T04:06:53.430000+00:00


                                                                                

    Found 4799 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T04:13:53.430000+00:00: using data from 2025-10-03T04:03:53.430000+00:00 to 2025-10-03T04:13:53.430000+00:00


                                                                                

    Found 6000 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T04:20:53.430000+00:00: using data from 2025-10-03T04:13:53.430000+00:00 to 2025-10-03T04:20:53.430000+00:00


                                                                                

    Found 4199 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T04:27:53.430000+00:00: using data from 2025-10-03T04:18:53.430000+00:00 to 2025-10-03T04:27:53.430000+00:00


                                                                                

    Found 5400 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T04:34:53.430000+00:00: using data from 2025-10-03T04:28:53.430000+00:00 to 2025-10-03T04:34:53.430000+00:00


                                                                                

    Found 3600 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T04:41:53.430000+00:00: using data from 2025-10-03T04:33:53.430000+00:00 to 2025-10-03T04:41:53.430000+00:00


                                                                                

    Found 4799 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T04:48:53.430000+00:00: using data from 2025-10-03T04:38:53.430000+00:00 to 2025-10-03T04:48:53.430000+00:00


                                                                                

    Found 6000 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T04:55:53.430000+00:00: using data from 2025-10-03T04:48:53.430000+00:00 to 2025-10-03T04:55:53.430000+00:00


                                                                                

    Found 4199 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T05:02:53.430000+00:00: using data from 2025-10-03T04:53:53.430000+00:00 to 2025-10-03T05:02:53.430000+00:00


                                                                                

    Found 5398 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T05:09:53.430000+00:00: using data from 2025-10-03T05:03:53.430000+00:00 to 2025-10-03T05:09:53.430000+00:00


                                                                                

    Found 3600 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T05:16:53.430000+00:00: using data from 2025-10-03T05:08:53.430000+00:00 to 2025-10-03T05:16:53.430000+00:00


                                                                                

    Found 4800 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T05:23:53.430000+00:00: using data from 2025-10-03T05:13:53.430000+00:00 to 2025-10-03T05:23:53.430000+00:00


                                                                                

    Found 5999 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T05:30:53.430000+00:00: using data from 2025-10-03T05:23:53.430000+00:00 to 2025-10-03T05:30:53.430000+00:00


                                                                                

    Found 4200 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T05:37:53.430000+00:00: using data from 2025-10-03T05:28:53.430000+00:00 to 2025-10-03T05:37:53.430000+00:00


                                                                                

    Found 5399 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T05:44:53.430000+00:00: using data from 2025-10-03T05:38:53.430000+00:00 to 2025-10-03T05:44:53.430000+00:00


                                                                                

    Found 3600 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T05:51:53.430000+00:00: using data from 2025-10-03T05:43:53.430000+00:00 to 2025-10-03T05:51:53.430000+00:00


                                                                                

    Found 4800 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T05:58:53.430000+00:00: using data from 2025-10-03T05:48:53.430000+00:00 to 2025-10-03T05:58:53.430000+00:00


                                                                                

    Found 5999 valid scores in cleanup interval
    Found 202 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T06:05:53.430000+00:00: using data from 2025-10-03T05:58:53.430000+00:00 to 2025-10-03T06:05:53.430000+00:00


                                                                                

    Found 4200 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T06:12:53.430000+00:00: using data from 2025-10-03T06:03:53.430000+00:00 to 2025-10-03T06:12:53.430000+00:00


                                                                                

    Found 5400 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T06:19:53.430000+00:00: using data from 2025-10-03T06:13:53.430000+00:00 to 2025-10-03T06:19:53.430000+00:00


                                                                                

    Found 3600 valid scores in cleanup interval
    Found 201 unique users
    Generated 10 leaderboard entries (Top-10)
  Snapshot at 2025-10-03T06:26:53.430000+00:00: using data from 2025-10-03T06:18:53.430000+00:00 to 2025-10-03T06:26:53.430000+00:00


In [None]:
snapshots[0]

In [None]:
# Step 6: Save results to file
import os
output_path = "leaderboard_snapshots/raw_snapshots.json"


if snapshots:
    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    # Convert snapshots to DataFrame for saving
    snapshot_data = []
    for snapshot in snapshots:
        snapshot_time = snapshot['_id']
        users = snapshot['users']
        
        for user in users:
            snapshot_data.append({
                'userId': user['userId'],
                'score': user['score'],  # Đổi từ totalScore thành score
                'lastUpdateTime': user['lastUpdateTime'],
                'snapshotTime': snapshot_time,
                'snapshotTimeFormatted': format_timestamp(snapshot_time),
                'lastUpdateTimeFormatted': format_timestamp(user['lastUpdateTime'])
            })
    
    snapshot_df = spark.createDataFrame(snapshot_data)
    

    
    # Save raw snapshots in MongoDB format
    import json
    raw_snapshots_path = "leaderboard_snapshots/raw_snapshots.json"
    with open(raw_snapshots_path, 'w') as f:
        json.dump(snapshots, f, indent=2)
    print(f"Raw snapshots (MongoDB format) saved to: {raw_snapshots_path}")
    
else:
    print("No snapshots to save!")

In [None]:
# Clean up
print("Stopping Spark session...")
spark.stop()
print("Spark session stopped successfully!")
