In [8]:
import pandas as pd
import requests
import time
import os
from datetime import datetime

def log(msg):
    """Print *msg* to stdout, prefixed with the current local timestamp in brackets."""
    stamp = datetime.now()
    print("[{}] {}".format(stamp, msg))

# ----------- Paths & Settings -----------
# CSV that is loaded as the master dataset and overwritten in place when
# any values get filled.
MASTER_PATH          = "steam_games_with_owners_parsed.csv"
# Text file that receives one appended missingness summary per run.
MISSING_REPORT_PATH  = "missingness_report.txt"
# Text file that receives an appended record of which App IDs/columns were filled.
LOG_PATH             = "missing_fill_log.txt"
# Total number of request attempts per App ID before giving up.
MAX_RETRIES          = 2
# Base back-off delay; the wait after a failed attempt is INITIAL_WAIT * 2**attempt.
INITIAL_WAIT         = 2      # seconds
# Pause between consecutive Steam API lookups in the fill loop.
SLEEP_BETWEEN_CALLS  = 1.5    # seconds
# Columns whose NaN values trigger a Steam lookup; names absent from the
# CSV are ignored.
CRITICAL_RAW_COLS    = ["short_description", "about_the_game"]  # adjust as needed

# ----------- Step 1: Load Master and Report Any NaNs -----------
# Loads the master CSV, counts NaNs per column, echoes the summary to the
# console and appends the same summary to the missingness report file.
log("STEP 1: Loading master dataset and computing missingness…")
if not os.path.exists(MASTER_PATH):
    raise FileNotFoundError(f"Master file not found at '{MASTER_PATH}'")

master_df = pd.read_csv(MASTER_PATH)
total_rows = len(master_df)

# One human-readable entry per column that contains at least one NaN.
nan_counts = master_df.isna().sum()
nan_report = [
    f"{col}: {count} missing ({count / total_rows * 100:.1f}%)"
    for col, count in nan_counts.items()
    if count > 0
]

if nan_report:
    log("  → Columns with missing values:")
    for entry in nan_report:
        log(f"    • {entry}")
else:
    log("  → No missing values in any column.")

# The report file is opened in append mode, so earlier runs are preserved.
if nan_report:
    report_body = "".join(f"{entry}\n" for entry in nan_report)
else:
    report_body = "No missing values found.\n"

with open(MISSING_REPORT_PATH, "a") as rep:
    rep.write(f"\n=== Missingness Report {datetime.now()} ===\n")
    rep.write(report_body)
    rep.write("-" * 50 + "\n")

# ----------- Step 2: Identify Rows Missing Critical Raw Fields -----------
# Only the raw columns named in CRITICAL_RAW_COLS are inspected (not the
# one-hot columns); names that are absent from the CSV are dropped first.
log("STEP 2: Identifying rows missing critical raw fields…")
available_raw_cols = [name for name in CRITICAL_RAW_COLS if name in master_df.columns]

# A row qualifies when at least one of the available critical columns is NaN.
critical_view = master_df[available_raw_cols]
missing_mask = critical_view.isna().any(axis=1)
ids_with_gaps = master_df.loc[missing_mask, "app_id"]
rows_to_fill = ids_with_gaps.astype(int).tolist()

if rows_to_fill:
    log(f"  → Found {len(rows_to_fill)} App IDs missing raw fields: {rows_to_fill[:10]}...")
else:
    log("  → No rows missing the specified raw fields.")

# ----------- Step 3: Fetch Missing Data from Steam & Fill -----------
def fetch_steam_info(app_id):
    """Fetch the Steam store ``appdetails`` payload for a single app.

    The request is attempted up to ``MAX_RETRIES`` times. After a failed
    attempt that will be followed by another one, the function waits
    ``INITIAL_WAIT * 2**attempt`` seconds (exponential back-off).

    Args:
        app_id: Steam application ID; its ``str()`` form is used both in
            the URL and as the key into the JSON response.

    Returns:
        The ``"data"`` dict of the response when Steam reports
        ``success``. ``None`` when Steam reports no success, when the
        response body does not have the expected shape, or when every
        attempt failed with a request/decoding error.
    """
    key = str(app_id)
    url = f"https://store.steampowered.com/api/appdetails?appids={app_id}&cc=us&l=en"

    for attempt in range(MAX_RETRIES):
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            payload = resp.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException subclass.
            if attempt + 1 < MAX_RETRIES:
                wait = INITIAL_WAIT * (2 ** attempt)
                log(f"    → Retry {attempt+1}/{MAX_RETRIES} for App ID {app_id} after {wait}s due to: {e}")
                time.sleep(wait)
            else:
                # No further attempt follows, so do not sleep; just record why.
                log(f"    → Attempt {attempt+1}/{MAX_RETRIES} for App ID {app_id} failed due to: {e}")
            continue

        # The body may be `null` or otherwise not the expected
        # {"<app_id>": {"success": ..., "data": {...}}} mapping; treat any
        # unexpected shape as "no data" instead of raising.
        entry = payload.get(key) if isinstance(payload, dict) else None
        if isinstance(entry, dict) and entry.get("success"):
            data = entry.get("data")
            return data if isinstance(data, dict) else None
        return None

    log(f"    ! Failed to fetch Steam data for App ID {app_id} after {MAX_RETRIES} tries.")
    return None

log("STEP 3: Attempting to fill missing raw fields from Steam API…")
fills = []  # list of (app_id, {col: new_value})

# Raw columns this step knows how to fill. For each of them the key in the
# Steam JSON is the same as the column name.
# Add more raw fields as needed…
fillable_cols = [
    c for c in ("short_description", "about_the_game") if c in available_raw_cols
]

# De-duplicate while keeping order so every app is fetched only once, even
# if several rows share the same App ID.
pending_ids = list(dict.fromkeys(rows_to_fill))

for idx, app_id in enumerate(pending_ids, start=1):
    log(f"  [{idx}/{len(pending_ids)}] Processing App ID {app_id}…")
    steam_data = fetch_steam_info(app_id)

    if steam_data is None:
        log(f"    ✗ No Steam data for {app_id}, skipping.")
    else:
        # All rows belonging to this app, not just the first match, so a
        # duplicated App ID whose gap sits in a later row still gets filled.
        app_rows = master_df["app_id"] == app_id
        updates = {}
        for col in fillable_cols:
            val = steam_data.get(col)
            if not val:
                continue
            # Only touch cells that are actually missing; existing values stay.
            gaps = app_rows & master_df[col].isna()
            if gaps.any():
                master_df.loc[gaps, col] = val
                updates[col] = val

        if updates:
            log(f"    ✓ Found data to fill: {list(updates.keys())}")
            fills.append((app_id, updates))
        else:
            log(f"    → No new data found to fill for {app_id}.")

    # Throttle after every lookup, including failed ones, so a run of
    # unsuccessful responses cannot hammer the API. Nothing follows the
    # last item, so no pause is needed there.
    if idx < len(pending_ids):
        time.sleep(SLEEP_BETWEEN_CALLS)

# ----------- Step 4: Save Filled Values & Log -----------
# Persists the filled DataFrame over the master CSV and appends a record of
# what was filled to LOG_PATH. Nothing is written when no fill happened.

if fills:
    log(f"STEP 4: Writing {len(fills)} updated rows back to master CSV…")
    # Write to a sibling temp file first and swap it in with os.replace, so
    # an interruption during the write cannot leave a truncated master CSV.
    tmp_path = f"{MASTER_PATH}.tmp"
    try:
        master_df.to_csv(tmp_path, index=False)
        os.replace(tmp_path, MASTER_PATH)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only cleans up a partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    log("  → Master CSV overwritten with filled values.")

    # Append mode keeps the history of earlier runs.
    with open(LOG_PATH, "a") as lf:
        lf.write(f"\n=== Missing-Fill Actions {datetime.now()} ===\n")
        for app_id, cols in fills:
            lf.write(f"App ID {app_id} filled columns: {list(cols.keys())}\n")
        lf.write("-" * 50 + "\n")
    log(f"  → Fill actions appended to '{LOG_PATH}'.")
else:
    log("STEP 4: No fills were performed. Master CSV unchanged.")


[2025-06-03 18:40:54.381172] STEP 1: Loading master dataset and computing missingness…
[2025-06-03 18:40:54.550186]   → Columns with missing values:
[2025-06-03 18:40:54.550186]     • about_the_game: 15 missing (0.6%)
[2025-06-03 18:40:54.550186]     • achievements_total: 451 missing (16.8%)
[2025-06-03 18:40:54.550186]     • background_image: 285 missing (10.6%)
[2025-06-03 18:40:54.550186]     • coming_soon: 285 missing (10.6%)
[2025-06-03 18:40:54.550186]     • content_descriptors: 502 missing (18.7%)
[2025-06-03 18:40:54.550186]     • controller_support: 492 missing (18.3%)
[2025-06-03 18:40:54.550186]     • detailed_description: 288 missing (10.7%)
[2025-06-03 18:40:54.550186]     • developer: 290 missing (10.8%)
[2025-06-03 18:40:54.550186]     • developers: 402 missing (15.0%)
[2025-06-03 18:40:54.550186]     • discount_percent: 1078 missing (40.2%)
[2025-06-03 18:40:54.550186]     • dlc_count: 285 missing (10.6%)
[2025-06-03 18:40:54.550186]     • dlc_list: 523 missing (19.5%)
