In [None]:
import os
import sys

# Placeholder configuration values ("id") written into the process environment.
# NOTE(review): presumably Settings() reads these variables - confirm against
# src.steam_analytics.config. Replace the placeholders with real values from a
# secret store at run time; never commit real keys in this cell.
os.environ.update({
    "STEAM__API_KEY": "id",
    "FABRIC__WORKSPACE_ID": "id",
    "FABRIC__BRONZE_LAKEHOUSE_ID": "id",
    "FABRIC__SILVER_LAKEHOUSE_ID": "id",
    "FABRIC__GOLD_LAKEHOUSE_ID": "id",
})

# Put the lakehouse Files directory first on the import path so that the
# `src` package imported below can be resolved from there.
sys.path.insert(0, "/lakehouse/default/Files")

# Must stay after the sys.path change and the environment setup above.
from src.steam_analytics.config import Settings
settings = Settings()

# Root path of the bronze lakehouse, reused by the cells that write/read tables.
BRONZE_PATH = settings.fabric.bronze_abfss_path
print(f"Bronze Path: {BRONZE_PATH}")

### Fetch All Games from Steam

In [None]:
from src.steam_analytics.ingestion.extractors.app_list import AppListExtractor

extractor = AppListExtractor()

# Get all games (uses IStoreService with include_games=true)
print("=" * 60)
print("STEP 1: Fetching game list from Steam API...")
print("=" * 60)

all_apps = await extractor.get_all_apps()
print(f"\nTotal apps from API: {len(all_apps):,}")

# Additional filtering (in case API lets some junk through)
filtered_apps = extractor.filter_likely_games(all_apps)
print(f"After keyword filter: {len(filtered_apps):,}")

In [None]:
# Get Player Counts (this takes ~1-2 hours according to the estimate printed below)
print(
    "=" * 60,
    "STEP 2: Fetching player counts...",
    "=" * 60,
    sep="\n",
)
print(
    f"Games to check: {len(filtered_apps):,}",
    "Estimated time: ~1-2 hours",
    "=" * 60,
    sep="\n",
)

# App IDs of every filtered game, in the same order as filtered_apps.
app_ids = [game.app_id for game in filtered_apps]

# Progress callback for logging
def progress(completed, total, every=5000):
    """Print a progress line at regular intervals and at completion.

    Args:
        completed: Number of items processed so far.
        total: Total number of items to process.
        every: Reporting interval; a line is printed whenever ``completed``
            is a multiple of this value. Defaults to 5000.

    A line is also printed when ``completed == total`` so the final state is
    always reported, even if ``total`` is not a multiple of ``every``.
    Nothing is printed when ``total`` is not positive, which avoids a
    division by zero for an empty batch.
    """
    if total <= 0:
        return
    if completed % every == 0 or completed == total:
        pct = completed / total * 100
        print(f"Progress: {completed:,}/{total:,} ({pct:.1f}%)")

# Fetch player counts (rate limited)
player_counts = await extractor.get_player_counts_batch(
    app_ids, 
    concurrency=5,
    progress_callback=progress
)

print(f"\nCompleted! Got {len(player_counts):,} results")

# Create lookup
player_lookup = {
    r.app_id: r.player_count 
    for r in player_counts 
    if r.success
}
print(f"Successful lookups: {len(player_lookup):,}")

In [None]:
# Build Catalog with Priorities
from datetime import datetime, timezone

import pandas as pd

from src.steam_analytics.catalog import GameCatalogManager, SyncPriority

print("=" * 60)
print("STEP 3: Building catalog with priorities...")
print("=" * 60)

manager = GameCatalogManager()
# Naive UTC timestamp: the same value datetime.utcnow() returned, obtained
# without the call that is deprecated since Python 3.12.
now = datetime.now(timezone.utc).replace(tzinfo=None)

# One dict per filtered game; games without a successful player-count lookup
# are passed player_count=None.
catalog_entries = []
for app in filtered_apps:
    player_count = player_lookup.get(app.app_id)

    entry = manager.create_catalog_entry(
        app_id=app.app_id,
        name=app.name,
        player_count=player_count,
        discovered_at=now,
    )
    catalog_entries.append(entry.to_dict())

print(f"Catalog entries created: {len(catalog_entries):,}")

# Convert to a pandas DataFrame for quick summary statistics
catalog_pdf = pd.DataFrame(catalog_entries)

# Show priority distribution
print("\nPriority Distribution:")
print(catalog_pdf["priority"].value_counts().sort_index())

# Show top games. player_count is coerced to a numeric dtype first: when every
# lookup failed the column holds only None (object dtype) and nlargest would
# raise a TypeError.
print("\nTop 10 Games by Player Count:")
player_count_numeric = pd.to_numeric(catalog_pdf["player_count"], errors="coerce")
top_10 = (
    catalog_pdf
    .assign(player_count=player_count_numeric)
    .nlargest(10, "player_count")[["app_id", "name", "player_count", "priority"]]
)
print(top_10.to_string(index=False))

In [None]:
from pyspark.sql.types import (
    StructType, StructField, IntegerType, StringType, TimestampType, LongType
)
from pyspark.sql import functions as F

print("=" * 60)
print("STEP 4: Saving catalog to Delta table...")
print("=" * 60)

# Explicit schema so Spark does not have to infer types from the dicts.
# Both timestamp fields are declared as strings here and parsed afterwards.
catalog_schema = StructType([
    StructField("app_id", IntegerType(), False),
    StructField("name", StringType(), True),
    StructField("player_count", IntegerType(), True),
    StructField("priority", StringType(), True),
    StructField("discovered_at", StringType(), False),  # arrives as an ISO string
    StructField("last_synced_at", StringType(), True)   # arrives as None (or a string); parsed below
])

# Convert to a Spark DataFrame using the schema
catalog_df = spark.createDataFrame(catalog_entries, schema=catalog_schema)

# Parse both string columns into real timestamps. last_synced_at is parsed
# rather than replaced by a null literal, so a null stays null while any
# value that is present is kept instead of being silently discarded.
catalog_df = catalog_df.withColumn(
    "discovered_at",
    F.to_timestamp(F.col("discovered_at"))
).withColumn(
    "last_synced_at",
    F.to_timestamp(F.col("last_synced_at"))
)

# Show schema
catalog_df.printSchema()

# Show stats
print("\nCatalog Stats:")
catalog_df.groupBy("priority").count().orderBy("priority").show()

# Total players. The sum is NULL (None) when every player_count is null;
# fall back to 0 so the thousands-separator format below cannot fail.
total_players = catalog_df.agg(F.sum("player_count")).collect()[0][0] or 0
print(f"Total players across all games: {total_players:,}")

# Save to Delta, replacing any existing table contents and schema
catalog_path = f"{BRONZE_PATH}/Tables/game_catalog"
print(f"\nSaving to: {catalog_path}")

catalog_df.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(catalog_path)

print("✅ Catalog saved successfully!")

In [None]:
# Verify
print(
    "=" * 60,
    "VERIFICATION",
    "=" * 60,
    sep="\n",
)

# Load the Delta table back from the bronze lakehouse to check what was written.
verify_df = spark.read.format("delta").load(f"{BRONZE_PATH}/Tables/game_catalog")

print(f"Total games in catalog: {verify_df.count():,}")

# Per-priority summary: number of games, total players and average players.
print("\nBy Priority:")
priority_summary = verify_df.groupBy("priority").agg(
    F.count("*").alias("games"),
    F.sum("player_count").alias("total_players"),
    F.avg("player_count").alias("avg_players"),
)
priority_summary.orderBy("priority").show()

# Twenty games with the highest player counts, full names shown untruncated.
print("\nTop 20 Games:")
ranked_games = verify_df.orderBy(F.col("player_count").desc())
ranked_games.select("app_id", "name", "player_count", "priority").show(20, truncate=False)

print("\n✅ Discovery complete!")
print(f"Catalog ready at: {BRONZE_PATH}/Tables/game_catalog")

In [None]:
# Estimate Sync Times
print("=" * 60)
print("SYNC TIME ESTIMATES")
print("=" * 60)

# Get counts per priority with a single aggregation instead of one
# filter + count job per priority level.
priority_counts = {
    row["priority"]: row["count"]
    for row in verify_df.groupBy("priority").count().collect()
}
# A priority with no games is absent from the aggregation, hence the 0 default.
high_count = priority_counts.get("high", 0)
medium_count = priority_counts.get("medium", 0)
low_count = priority_counts.get("low", 0)

print(f"\nHIGH priority:   {high_count:,} games")
print(f"MEDIUM priority: {medium_count:,} games")
print(f"LOW priority:    {low_count:,} games")

# Cumulative tiers: each scenario adds the next priority level.
print("\nDaily Sync Estimates (3 API sources per game):")
for name, count in [("HIGH only", high_count),
                     ("HIGH+MEDIUM", high_count + medium_count),
                     ("ALL", high_count + medium_count + low_count)]:
    est = manager.estimate_sync_time(count)
    print(f"  {name}: ~{est['estimated_minutes']:.0f} min ({est['estimated_hours']:.1f} hrs)")