In [1]:
import requests
import json
import os
import pandas as pd
import time

In [2]:
# API_key
RAWG_API_KEY = " "  # Fill in the API Key

In [3]:
def test_rawg_api_key(URL: str):
    """
    This function tests whether the API is valid by making a GET request.
    If successful, it prints a sample game from the response.
    """

    # Send GET request
    response = requests.get(URL)

    # Check if the API request was successful
    if response.status_code == 200:
        data = response.json()  # Parse JSON response
        print("API request successful! Sample data:")

        # Display sample data (only first game for readability)
        print(json.dumps(data["results"][0], indent=4))

    else:
        print(f"API request failed. Status code: {response.status_code}, Error: {response.text}")



In [7]:
#  API Request URL (Fetching the first 5 games)
test_rawg_URL = f"https://api.rawg.io/api/games?key={RAWG_API_KEY}&page=1&page_size=5"
test_rawg_api_key(test_rawg_URL)

API request failed. Status code: 401, Error: {"error": "The API key is not found"}


In [8]:
save_file = "games_data.json"
progress_file = "progress.json"
temp_save_file = "games_data_temp.json"
temp_progress_file = "progress_temp.json"

In [9]:
# This cell is generated using ChatGPT

from datetime import datetime

# Define local file paths for persistent storage
save_file = "games_data.json"
progress_file = "progress.json"
temp_save_file = "games_data_temp.json"  # Temporary file to prevent corruption
temp_progress_file = "progress_temp.json"  # Temporary progress file

def safe_load_json(filename):
    """Safely loads JSON data from a file, handling potential corruption."""
    if os.path.exists(filename):
        try:
            with open(filename, "r", encoding="utf-8") as f:
                return json.load(f)
        except json.JSONDecodeError:
            print(f"Warning: Corrupted JSON file detected ({filename}). Starting fresh.")
            return [] if "games_data" in filename else {"last_page": 1}
    return [] if "games_data" in filename else {"last_page": 1}

def save_data_realtime(games_list, page):
    """Safely saves game data and progress to prevent corruption."""

    # Write to a temporary file first, then replace the original file (atomic operation)
    with open(temp_save_file, "w", encoding="utf-8") as f:
        json.dump(games_list, f, ensure_ascii=False, indent=4)
    os.replace(temp_save_file, save_file)

    # Save progress safely
    with open(temp_progress_file, "w", encoding="utf-8") as f:
        json.dump({"last_page": page}, f)
    os.replace(temp_progress_file, progress_file)

    print(f"Data saved successfully! Current page: {page}, Total games stored: {len(games_list)}")

def fetch_all_games(rawg_api_key, base_url: str, page_size=40, max_attempts=3, rate_limit=1.0):
    """
    Fetches all available games from the RAWG API while handling interruptions and rate limits.

    Args:
        rawg_api_key (str): Your RAWG API key.
        base_url (str): Base URL for the RAWG API.
        page_size (int): Number of games per page (default: 40, max allowed).
        max_attempts (int): Maximum retry attempts for failed requests (default: 3).
        rate_limit (float): Delay in seconds between requests (default: 1.0 sec).

    Returns:
        list: A list of all game data retrieved from the API.
    """
    # Load previously saved data and progress
    all_games = safe_load_json(save_file)
    progress = safe_load_json(progress_file)
    last_page = progress.get("last_page", 1)  # Resume from the last saved page

    print(f"Resuming from page {last_page}...")

    page = last_page
    while True:
        print(f"Fetching page {page} at {datetime.now().strftime('%H:%M:%S')}...")

        url = f"{base_url}?key={rawg_api_key}&page={page}&page_size={page_size}"

        # Exponential backoff for retries
        for attempt in range(max_attempts):
            try:
                response = requests.get(url, timeout=10)

                if response.status_code == 200:
                    data = response.json()
                    break  # Exit retry loop on success

                print(f"Attempt {attempt + 1}: Error {response.status_code}, retrying in {2 ** attempt} sec...")
                time.sleep(2 ** attempt)  # Exponential backoff
            except requests.exceptions.RequestException as e:
                print(f"Attempt {attempt + 1}: Request failed due to {e}, retrying in {2 ** attempt} sec...")
                time.sleep(2 ** attempt)
        else:
            print(f"Failed to fetch page {page} after {max_attempts} attempts. Skipping to next page...")
            page += 1
            continue

        # Extract results
        results = data.get("results", [])
        if not results:
            print("No more games available. Stopping data collection.")
            break

        # Store new data while avoiding duplicates
        all_games.extend(results)

        # Save data in real-time (avoiding excessive writes)
        if page % 5 == 0:  # Save every 5 pages to reduce I/O operations
            save_data_realtime(all_games, page)

        # Respect API rate limits
        time.sleep(rate_limit)

        # Move to the next page
        page += 1

    # Final save after fetching all pages
    save_data_realtime(all_games, page)

    print(f"Total games fetched: {len(all_games)}")
    return all_games



In [12]:
fetch_all_games(RAWG_API_KEY, "https://api.rawg.io/api/games")

Resuming from page 1...
Fetching page 1 at 01:04:25...
Attempt 1: Error 401, retrying in 1 sec...
Attempt 2: Error 401, retrying in 2 sec...
Attempt 3: Error 401, retrying in 4 sec...
Failed to fetch page 1 after 3 attempts. Skipping to next page...
Fetching page 2 at 01:04:34...
Attempt 1: Error 401, retrying in 1 sec...


KeyboardInterrupt: 

In [13]:
# This cell is assisted with ChatGPT

import json

# Load stored game data
with open("games_data.json", "r", encoding="utf-8") as f:
    all_games = json.load(f)

# Estimate stored pages
total_games_stored = len(all_games)
games_per_page = 40
estimated_pages_stored = total_games_stored // games_per_page  # ~20098 pages

# Generate a list of expected pages
expected_pages = set(range(1, estimated_pages_stored + 1))

# Extract stored pages (Assuming unique game IDs)
stored_game_ids = set(game["id"] for game in all_games)

# If we assume game IDs are sequential, we estimate missing pages
missing_pages_before_20099 = [p for p in expected_pages if p not in stored_game_ids]

print(f"Estimated pages stored: {estimated_pages_stored}")
print(f"Missing pages before 20099: {missing_pages_before_20099[:20]}... (showing first 20)")
print(f"Total missing pages before 20099: {len(missing_pages_before_20099)}")


FileNotFoundError: [Errno 2] No such file or directory: 'games_data.json'

In [14]:
# Merge missing pages from 1-20098 and 20099-22160
all_missing_pages = sorted(set(missing_pages_before_20099 + missing_pages))

print(f"Final missing pages: {all_missing_pages[:20]}... (showing first 20)")
print(f"Total pages to fetch: {len(all_missing_pages)}")


NameError: name 'missing_pages_before_20099' is not defined

In [15]:
for page in all_missing_pages:
    print(f"Fetching missing page {page}...")
    fetch_all_games(RAWG_API_KEY, "https://api.rawg.io/api/games", page_size=40, max_attempts=3, rate_limit=1.0)

    print(f"Page {page} fetched! Sleeping for 5 seconds to avoid rate limits...")
    time.sleep(5)  # Adjust based on API limits


NameError: name 'all_missing_pages' is not defined

In [16]:
import pandas as pd
import orjson

# Stream JSON file and load in chunks
def load_json_fast(file_path, max_records=None):
    with open(file_path, "r", encoding="utf-8") as f:
        data = orjson.loads(f.read())  # Faster JSON parsing

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Limit records for testing (if needed)
    if max_records:
        df = df.head(max_records)

    return df

# Load JSON
df = load_json_fast("games_data.json", max_records=500000)  # Load only 500,000 rows for speed


FileNotFoundError: [Errno 2] No such file or directory: 'games_data.json'

In [18]:
# Display DataFrame
import ace_tools as tools
tools.display_dataframe_to_user(name="Optimized Games Data", dataframe=df)
df.head()

ModuleNotFoundError: No module named 'ace_tools'