In [None]:
import os
import sys

# Write the Steam / Fabric identifiers into the process environment.
# NOTE(review): the "id" values look like placeholders, and Settings() below
# is presumably what consumes these variables -- confirm before a real run.
os.environ.update(
    {
        "STEAM__API_KEY": "id",
        "FABRIC__WORKSPACE_ID": "id",
        "FABRIC__BRONZE_LAKEHOUSE_ID": "id",
        "FABRIC__SILVER_LAKEHOUSE_ID": "id",
        "FABRIC__GOLD_LAKEHOUSE_ID": "id",
    }
)

# Put the lakehouse Files directory first on the import path; presumably this
# is where the `src` package imported below lives.
sys.path.insert(0, "/lakehouse/default/Files")

from src.steam_analytics.config import Settings

# Instantiated only after the environment variables above have been set.
settings = Settings()

# The game catalog table sits under the bronze lakehouse path.
BRONZE_PATH = settings.fabric.bronze_abfss_path
CATALOG_PATH = f"{BRONZE_PATH}/Tables/game_catalog"

print(f"Catalog Path: {CATALOG_PATH}")

In [None]:
from pyspark.sql import functions as F

print("=" * 60, "STEP 1: Loading existing catalog...", "=" * 60, sep="\n")

# Read the current catalog Delta table and remember its row count; later
# cells compare against this number.
existing_catalog = spark.read.format("delta").load(CATALOG_PATH)
existing_count = existing_catalog.count()

print(f"Existing games in catalog: {existing_count:,}")

# Collect the app_id column to the driver as a Python set so membership
# checks in the new-game detection step are fast.
existing_ids = {
    row.app_id for row in existing_catalog.select("app_id").collect()
}

print(f"Loaded {len(existing_ids):,} app IDs")

In [None]:
from src.steam_analytics.ingestion.extractors.app_list import AppListExtractor

extractor = AppListExtractor()

print("=" * 60)
print("STEP 2: Fetching current app list from Steam...")
print("=" * 60)

all_apps = await extractor.get_all_apps()
filtered_apps = extractor.filter_likely_games(all_apps)

print(f"Total games from Steam: {len(filtered_apps):,}")

In [None]:
print("=" * 60, "STEP 3: Detecting new games...", "=" * 60, sep="\n")

# A game is "new" when its app_id is absent from the set of IDs loaded from
# the existing catalog.
new_apps = [app for app in filtered_apps if app.app_id not in existing_ids]

print(f"New games found: {len(new_apps):,}")

if new_apps:
    # Show at most the first ten new games, one "app_id: name" line each.
    print(
        "\nSample of new games:",
        *(f"  {app.app_id}: {app.name}" for app in new_apps[:10]),
        sep="\n",
    )

In [None]:
if new_apps:
    print("=" * 60)
    print("STEP 4: Fetching player counts for new games...")
    print("=" * 60)
    
    new_app_ids = [app.app_id for app in new_apps]
    
    player_counts = await extractor.get_player_counts_batch(
        new_app_ids,
        concurrency=5
    )
    
    player_lookup = {
        r.app_id: r.player_count 
        for r in player_counts 
        if r.success
    }
    
    print(f"Got player counts for {len(player_lookup):,} new games")
else:
    print("No new games to process!")
    player_lookup = {}

In [None]:
from collections import Counter
from datetime import datetime, timezone

from src.steam_analytics.catalog import GameCatalogManager

if new_apps:
    print("=" * 60)
    print("STEP 5: Creating catalog entries for new games...")
    print("=" * 60)

    manager = GameCatalogManager()

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and strip the tzinfo so the value handed to the catalog manager is
    # the same naive UTC datetime as before.
    now = datetime.now(timezone.utc).replace(tzinfo=None)

    # One catalog entry (as a plain dict) per new app. Apps without a
    # successful player-count lookup get player_count=None.
    new_entries = []
    for app in new_apps:
        player_count = player_lookup.get(app.app_id)
        entry = manager.create_catalog_entry(
            app_id=app.app_id,
            name=app.name,
            player_count=player_count,
            discovered_at=now,
        )
        new_entries.append(entry.to_dict())

    # Summary: how many entries fall under each "priority" value.
    priority_counts = Counter(e["priority"] for e in new_entries)

    print(f"New entries created: {len(new_entries):,}")
    print("\nBy priority:")
    for priority, count in sorted(priority_counts.items()):
        print(f"  {priority}: {count:,}")

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql import functions as F

if new_apps and new_entries:
    print("=" * 60)
    print("STEP 6: Appending new games to catalog (Explicit Schema Fixed)...")
    print("=" * 60)

    # 1. Key change: the two date fields are declared as StringType at first
    #    because, per the original author's note, the entry dicts carry them
    #    as text ("2026-...").
    catalog_schema = StructType([
        StructField("app_id", IntegerType(), False),
        StructField("name", StringType(), True),
        StructField("player_count", IntegerType(), True),
        StructField("priority", StringType(), True),
        StructField("discovered_at", StringType(), True),   # string for now
        StructField("last_synced_at", StringType(), True)   # string for now
    ])

    # 2. Build the DataFrame from the entry dicts using the explicit schema.
    new_df = spark.createDataFrame(new_entries, schema=catalog_schema)

    # 3. Cast the text dates to real timestamps BEFORE saving.
    new_df_final = (new_df
        .withColumn("discovered_at", F.col("discovered_at").cast(TimestampType()))
        .withColumn("last_synced_at", F.col("last_synced_at").cast(TimestampType()))
    )

    # 4. Make the append idempotent: drop every app_id that is already in the
    #    catalog right now. `existing_ids` was captured earlier in the session,
    #    so without this guard re-running the cell would write the same games
    #    a second time.
    catalog_ids = spark.read.format("delta").load(CATALOG_PATH).select("app_id")
    rows_to_append = new_df_final.join(catalog_ids, on="app_id", how="left_anti")
    appended_count = rows_to_append.count()

    # 5. Save only when something is left to write, and report the number of
    #    rows actually appended.
    if appended_count:
        rows_to_append.write.format("delta").mode("append").save(CATALOG_PATH)
        print(f"✅ Appended {appended_count:,} new games to catalog successfully!")
    else:
        print("Nothing to append - catalog is up to date!")

else:
    print("Nothing to append - catalog is up to date!")

In [None]:
print("=" * 60, "VERIFICATION", "=" * 60, sep="\n")

# Re-read the catalog table so the counts reflect what is stored now.
final_catalog = spark.read.format("delta").load(CATALOG_PATH)
final_count = final_catalog.count()

# Before / delta / after, relative to the count taken when the catalog was
# first loaded.
print(
    f"Previous catalog size: {existing_count:,}",
    f"New games added:       {final_count - existing_count:,}",
    f"Final catalog size:    {final_count:,}",
    sep="\n",
)

print("\nFinal Priority Distribution:")

# Per-priority row count and summed player_count, ordered by priority.
priority_summary = (
    final_catalog
    .groupBy("priority")
    .agg(
        F.count("*").alias("games"),
        F.sum("player_count").alias("total_players"),
    )
    .orderBy("priority")
)
priority_summary.show()

print("\nCatalog refresh complete!")

In [None]:
import json

from notebookutils import mssparkutils

# Run summary: printed pretty, then passed compact to notebook.exit.
output = dict(
    status="success",
    previous_count=existing_count,
    new_games_added=final_count - existing_count,
    final_count=final_count,
)

print(json.dumps(output, indent=2))

exit_payload = json.dumps(output)
mssparkutils.notebook.exit(exit_payload)