In [1]:
import requests
import json
import os
import pandas as pd
import time

In [3]:
# API_key
RAWG_API_KEY = "fef70a09ff254c2d96484a80116b01c2"  # API Key

In [44]:
def test_rawg_api_key(URL: str):
    """
    This function tests whether the API is valid by making a GET request.
    If successful, it prints a sample game from the response.
    """

    # Send GET request
    response = requests.get(URL)

    # Check if the API request was successful
    if response.status_code == 200:
        data = response.json()  # Parse JSON response
        print("API request successful! Sample data:")

        # Display sample data (only first game for readability)
        print(json.dumps(data["results"][0], indent=4))

    else:
        print(f"API request failed. Status code: {response.status_code}, Error: {response.text}")



In [None]:
#  API Request URL (Fetching the first 5 games)
test_rawg_URL = f"https://api.rawg.io/api/games?key={RAWG_API_KEY}&page=1&page_size=5"
test_rawg_api_key(test_rawg_URL)

API request successful! Sample data:
{
    "id": 3498,
    "slug": "grand-theft-auto-v",
    "name": "Grand Theft Auto V",
    "released": "2013-09-17",
    "tba": false,
    "background_image": "https://media.rawg.io/media/games/20a/20aa03a10cda45239fe22d035c0ebe64.jpg",
    "rating": 4.47,
    "rating_top": 5,
    "ratings": [
        {
            "id": 5,
            "title": "exceptional",
            "count": 4225,
            "percent": 59.02
        },
        {
            "id": 4,
            "title": "recommended",
            "count": 2342,
            "percent": 32.71
        },
        {
            "id": 3,
            "title": "meh",
            "count": 456,
            "percent": 6.37
        },
        {
            "id": 1,
            "title": "skip",
            "count": 136,
            "percent": 1.9
        }
    ],
    "ratings_count": 7046,
    "reviews_text_count": 65,
    "added": 21880,
    "added_by_status": {
        "yet": 545,
        "owned": 12657,
 

In [5]:
save_file = "games_data.json"
progress_file = "progress.json"
temp_save_file = "games_data_temp.json"  
temp_progress_file = "progress_temp.json"

In [48]:
import os
import json
import time
import requests
from datetime import datetime

# Define local file paths for persistent storage
save_file = "games_data.json"
progress_file = "progress.json"
temp_save_file = "games_data_temp.json"  # Temporary file to prevent corruption
temp_progress_file = "progress_temp.json"  # Temporary progress file

def safe_load_json(filename):
    """Safely loads JSON data from a file, handling potential corruption."""
    if os.path.exists(filename):
        try:
            with open(filename, "r", encoding="utf-8") as f:
                return json.load(f)
        except json.JSONDecodeError:
            print(f"❌ Warning: Corrupted JSON file detected ({filename}). Starting fresh.")
            return [] if "games_data" in filename else {"last_page": 1}
    return [] if "games_data" in filename else {"last_page": 1}

def save_data_realtime(games_list, page):
    """Safely saves game data and progress to prevent corruption."""
    
    # Write to a temporary file first, then replace the original file (atomic operation)
    with open(temp_save_file, "w", encoding="utf-8") as f:
        json.dump(games_list, f, ensure_ascii=False, indent=4)
    os.replace(temp_save_file, save_file)

    # Save progress safely
    with open(temp_progress_file, "w", encoding="utf-8") as f:
        json.dump({"last_page": page}, f)
    os.replace(temp_progress_file, progress_file)

    print(f"💾 Data saved successfully! Current page: {page}, Total games stored: {len(games_list)}")

def fetch_all_games(rawg_api_key, base_url: str, page_size=40, max_attempts=3, rate_limit=1.0):
    """
    Fetches all available games from the RAWG API while handling interruptions and rate limits.

    Args:
        rawg_api_key (str): Your RAWG API key.
        base_url (str): Base URL for the RAWG API.
        page_size (int): Number of games per page (default: 40, max allowed).
        max_attempts (int): Maximum retry attempts for failed requests (default: 3).
        rate_limit (float): Delay in seconds between requests (default: 1.0 sec).

    Returns:
        list: A list of all game data retrieved from the API.
    """
    # Load previously saved data and progress
    all_games = safe_load_json(save_file)
    progress = safe_load_json(progress_file)
    last_page = progress.get("last_page", 1)  # Resume from the last saved page

    print(f"📌 Resuming from page {last_page}...")

    page = last_page
    while True:
        print(f"📡 Fetching page {page} at {datetime.now().strftime('%H:%M:%S')}...")

        url = f"{base_url}?key={rawg_api_key}&page={page}&page_size={page_size}"

        # Exponential backoff for retries
        for attempt in range(max_attempts):
            try:
                response = requests.get(url, timeout=10)
                
                if response.status_code == 200:
                    data = response.json()
                    break  # Exit retry loop on success
                
                print(f"⚠️ Attempt {attempt + 1}: Error {response.status_code}, retrying in {2 ** attempt} sec...")
                time.sleep(2 ** attempt)  # Exponential backoff
            except requests.exceptions.RequestException as e:
                print(f"⚠️ Attempt {attempt + 1}: Request failed due to {e}, retrying in {2 ** attempt} sec...")
                time.sleep(2 ** attempt)
        else:
            print(f"❌ Failed to fetch page {page} after {max_attempts} attempts. Skipping to next page...")
            page += 1
            continue

        # Extract results
        results = data.get("results", [])
        if not results:
            print("✅ No more games available. Stopping data collection.")
            break

        # Store new data while avoiding duplicates
        all_games.extend(results)

        # Save data in real-time (avoiding excessive writes)
        if page % 5 == 0:  # Save every 5 pages to reduce I/O operations
            save_data_realtime(all_games, page)

        # Respect API rate limits
        time.sleep(rate_limit)

        # Move to the next page
        page += 1

    # Final save after fetching all pages
    save_data_realtime(all_games, page)

    print(f"✅ Total games fetched: {len(all_games)}")
    return all_games



In [None]:
fetch_all_games(RAWG_API_KEY, "https://api.rawg.io/api/games")

In [49]:
import json

# Load stored game data
with open("games_data.json", "r", encoding="utf-8") as f:
    all_games = json.load(f)

# Estimate stored pages
total_games_stored = len(all_games)
games_per_page = 40
estimated_pages_stored = total_games_stored // games_per_page  # ~20098 pages

# Generate a list of expected pages
expected_pages = set(range(1, estimated_pages_stored + 1))

# Extract stored pages (Assuming unique game IDs)
stored_game_ids = set(game["id"] for game in all_games)

# If we assume game IDs are sequential, we estimate missing pages
missing_pages_before_20099 = [p for p in expected_pages if p not in stored_game_ids]

print(f"✅ Estimated pages stored: {estimated_pages_stored}")
print(f"⚠️ Missing pages before 20099: {missing_pages_before_20099[:20]}... (showing first 20)")
print(f"📌 Total missing pages before 20099: {len(missing_pages_before_20099)}")


✅ Estimated pages stored: 20098
⚠️ Missing pages before 20099: [5, 18, 19, 53, 66, 68, 75, 81, 83, 96, 120, 126, 141, 145, 151, 154, 156, 166, 171, 178]... (showing first 20)
📌 Total missing pages before 20099: 1784


In [51]:
# Merge missing pages from 1-20098 and 20099-22160
all_missing_pages = sorted(set(missing_pages_before_20099 + missing_pages))

print(f"🚀 Final missing pages: {all_missing_pages[:20]}... (showing first 20)")
print(f"📌 Total pages to fetch: {len(all_missing_pages)}")


🚀 Final missing pages: [5, 18, 19, 53, 66, 68, 75, 81, 83, 96, 120, 126, 141, 145, 151, 154, 156, 166, 171, 178]... (showing first 20)
📌 Total pages to fetch: 3846


In [52]:
for page in all_missing_pages:
    print(f"🚀 Fetching missing page {page}...")
    fetch_all_games(RAWG_API_KEY, "https://api.rawg.io/api/games", page_size=40, max_attempts=3, rate_limit=1.0)

    print(f"✅ Page {page} fetched! Sleeping for 5 seconds to avoid rate limits...")
    time.sleep(5)  # Adjust based on API limits


🚀 Fetching missing page 5...
📌 Resuming from page 20140...
📡 Fetching page 20140 at 01:58:44...
💾 Data saved successfully! Current page: 20140, Total games stored: 803960
📡 Fetching page 20141 at 01:59:53...
📡 Fetching page 20142 at 01:59:56...
📡 Fetching page 20143 at 02:00:00...
📡 Fetching page 20144 at 02:00:05...
📡 Fetching page 20145 at 02:00:08...
💾 Data saved successfully! Current page: 20145, Total games stored: 804160
📡 Fetching page 20146 at 02:01:14...
📡 Fetching page 20147 at 02:01:19...
📡 Fetching page 20148 at 02:01:49...
📡 Fetching page 20149 at 02:01:56...
📡 Fetching page 20150 at 02:02:00...
💾 Data saved successfully! Current page: 20150, Total games stored: 804360
📡 Fetching page 20151 at 02:03:10...
📡 Fetching page 20152 at 02:03:13...
📡 Fetching page 20153 at 02:03:17...
📡 Fetching page 20154 at 02:03:21...
📡 Fetching page 20155 at 02:03:24...
💾 Data saved successfully! Current page: 20155, Total games stored: 804560
📡 Fetching page 20156 at 02:04:33...
📡 Fetching p

KeyboardInterrupt: 

In [11]:
import pandas as pd
import orjson

# Stream JSON file and load in chunks
def load_json_fast(file_path, max_records=None):
    with open(file_path, "r", encoding="utf-8") as f:
        data = orjson.loads(f.read())  # Faster JSON parsing

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Limit records for testing (if needed)
    if max_records:
        df = df.head(max_records)

    return df

# Load JSON
df = load_json_fast("games_data.json", max_records=500000)  # Load only 500,000 rows for speed

# Display DataFrame
import ace_tools as tools
tools.display_dataframe_to_user(name="Optimized Games Data", dataframe=df)


ModuleNotFoundError: No module named 'ace_tools'

In [19]:
!pip install ace_tools




In [23]:
df.head()

Unnamed: 0,id,slug,name,released,tba,background_image,rating,rating_top,ratings,ratings_count,...,dominant_color,platforms,parent_platforms,genres,stores,clip,tags,esrb_rating,short_screenshots,community_rating
0,3498,grand-theft-auto-v,Grand Theft Auto V,2013-09-17,False,https://media.rawg.io/media/games/20a/20aa03a1...,4.47,5,"[{'id': 5, 'title': 'exceptional', 'count': 42...",7040,...,0f0f0f,"[{'platform': {'id': 4, 'name': 'PC', 'slug': ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 4, 'name': 'Action', 'slug': 'action',...","[{'id': 290375, 'store': {'id': 3, 'name': 'Pl...",,"[{'id': 31, 'name': 'Singleplayer', 'slug': 's...","{'id': 4, 'name': 'Mature', 'slug': 'mature'}","[{'id': -1, 'image': 'https://media.rawg.io/me...",
1,3328,the-witcher-3-wild-hunt,The Witcher 3: Wild Hunt,2015-05-18,False,https://media.rawg.io/media/games/618/618c2031...,4.65,5,"[{'id': 5, 'title': 'exceptional', 'count': 53...",6816,...,0f0f0f,"[{'platform': {'id': 186, 'name': 'Xbox Series...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 4, 'name': 'Action', 'slug': 'action',...","[{'id': 354780, 'store': {'id': 5, 'name': 'GO...",,"[{'id': 31, 'name': 'Singleplayer', 'slug': 's...","{'id': 4, 'name': 'Mature', 'slug': 'mature'}","[{'id': -1, 'image': 'https://media.rawg.io/me...",
2,4200,portal-2,Portal 2,2011-04-18,False,https://media.rawg.io/media/games/2ba/2bac0e87...,4.59,5,"[{'id': 5, 'title': 'exceptional', 'count': 40...",5845,...,0f0f0f,"[{'platform': {'id': 16, 'name': 'PlayStation ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 2, 'name': 'Shooter', 'slug': 'shooter...","[{'id': 465889, 'store': {'id': 2, 'name': 'Xb...",,"[{'id': 31, 'name': 'Singleplayer', 'slug': 's...","{'id': 2, 'name': 'Everyone 10+', 'slug': 'eve...","[{'id': -1, 'image': 'https://media.rawg.io/me...",
3,4291,counter-strike-global-offensive,Counter-Strike: Global Offensive,2012-08-21,False,https://media.rawg.io/media/games/736/73619bd3...,3.57,4,"[{'id': 4, 'title': 'recommended', 'count': 16...",3554,...,0f0f0f,"[{'platform': {'id': 4, 'name': 'PC', 'slug': ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 2, 'name': 'Shooter', 'slug': 'shooter...","[{'id': 4619, 'store': {'id': 3, 'name': 'Play...",,"[{'id': 40847, 'name': 'Steam Achievements', '...","{'id': 4, 'name': 'Mature', 'slug': 'mature'}","[{'id': -1, 'image': 'https://media.rawg.io/me...",
4,5286,tomb-raider,Tomb Raider (2013),2013-03-05,False,https://media.rawg.io/media/games/021/021c4e21...,4.06,4,"[{'id': 4, 'title': 'recommended', 'count': 24...",3977,...,0f0f0f,"[{'platform': {'id': 16, 'name': 'PlayStation ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 4, 'name': 'Action', 'slug': 'action',...","[{'id': 33824, 'store': {'id': 7, 'name': 'Xbo...",,"[{'id': 31, 'name': 'Singleplayer', 'slug': 's...","{'id': 4, 'name': 'Mature', 'slug': 'mature'}","[{'id': -1, 'image': 'https://media.rawg.io/me...",


In [25]:
df.columns

Index(['id', 'slug', 'name', 'released', 'tba', 'background_image', 'rating',
       'rating_top', 'ratings', 'ratings_count', 'reviews_text_count', 'added',
       'added_by_status', 'metacritic', 'playtime', 'suggestions_count',
       'updated', 'user_game', 'reviews_count', 'saturated_color',
       'dominant_color', 'platforms', 'parent_platforms', 'genres', 'stores',
       'clip', 'tags', 'esrb_rating', 'short_screenshots', 'community_rating'],
      dtype='object')

In [27]:
df.to_csv("all_games.csv")

In [31]:
df2=df.sample(10)
df2

Unnamed: 0,id,slug,name,released,tba,background_image,rating,rating_top,ratings,ratings_count,...,dominant_color,platforms,parent_platforms,genres,stores,clip,tags,esrb_rating,short_screenshots,community_rating
243083,840634,humankind-itch,HumanKind (itch),2022-08-19,False,https://media.rawg.io/media/screenshots/ba4/ba...,0.0,0,[],0,...,0f0f0f,"[{'platform': {'id': 21, 'name': 'Android', 's...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 2, 'name': 'Shooter', 'slug': 'shooter...","[{'id': 837777, 'store': {'id': 9, 'name': 'it...",,"[{'id': 122, 'name': 'Pixel Graphics', 'slug':...",,"[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0
60636,31615,monster-bash-2,Monster Bash!,1993-04-09,False,https://media.rawg.io/media/screenshots/8b9/8b...,0.0,0,[],0,...,0f0f0f,"[{'platform': {'id': 4, 'name': 'PC', 'slug': ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 11, 'name': 'Arcade', 'slug': 'arcade'...",[],,[],,"[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0
354389,713331,xboxargumentfnf,Xboxargumentfnf,2021-12-22,False,https://media.rawg.io/media/screenshots/854/85...,0.0,0,[],0,...,0f0f0f,"[{'platform': {'id': 4, 'name': 'PC', 'slug': ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...",[],"[{'id': 709991, 'store': {'id': 9, 'name': 'it...",,"[{'id': 4779, 'name': 'poop', 'slug': 'poop', ...",,"[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0
145570,947108,the-fatalist,the fatalist,2023-03-26,False,https://media.rawg.io/media/screenshots/002/00...,0.0,0,[],0,...,0f0f0f,"[{'platform': {'id': 171, 'name': 'Web', 'slug...","[{'platform': {'id': 14, 'name': 'Web', 'slug'...",[],"[{'id': 945708, 'store': {'id': 9, 'name': 'it...",,"[{'id': 45, 'name': '2D', 'slug': '2d', 'langu...",,"[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0
23543,29666,wild-metal-country,Wild Metal Country,1999-01-01,False,https://media.rawg.io/media/games/885/8858877b...,0.0,0,"[{'id': 4, 'title': 'recommended', 'count': 2,...",4,...,0f0f0f,"[{'platform': {'id': 4, 'name': 'PC', 'slug': ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 4, 'name': 'Action', 'slug': 'action',...",[],,"[{'id': 31, 'name': 'Singleplayer', 'slug': 's...","{'id': 3, 'name': 'Teen', 'slug': 'teen'}","[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0
50323,34932,tom-clancys-politika,Tom Clancy's Politika,1997-01-01,False,https://media.rawg.io/media/screenshots/fbd/fb...,0.0,0,[],0,...,0f0f0f,"[{'platform': {'id': 4, 'name': 'PC', 'slug': ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 10, 'name': 'Strategy', 'slug': 'strat...",[],,[],,"[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0
300416,776553,bai-se-kong-bu-shi-qi-you-xi,白色恐怖時期遊戲,2022-04-28,False,https://media.rawg.io/media/screenshots/da8/da...,0.0,0,[],0,...,0f0f0f,"[{'platform': {'id': 4, 'name': 'PC', 'slug': ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...",[],"[{'id': 772948, 'store': {'id': 9, 'name': 'it...",,"[{'id': 31, 'name': 'Singleplayer', 'slug': 's...",,"[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0
230652,854673,hibernation-gilel,Hibernation (Gilel),2022-09-23,False,https://media.rawg.io/media/screenshots/489/48...,0.0,0,[],0,...,0f0f0f,"[{'platform': {'id': 171, 'name': 'Web', 'slug...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 3, 'name': 'Adventure', 'slug': 'adven...","[{'id': 851953, 'store': {'id': 9, 'name': 'it...",,"[{'id': 45, 'name': '2D', 'slug': '2d', 'langu...",,"[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0
208904,878552,abyssyda,AbyssydA,2022-11-09,False,https://media.rawg.io/media/screenshots/cf2/cf...,0.0,0,[],0,...,0f0f0f,"[{'platform': {'id': 4, 'name': 'PC', 'slug': ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 4, 'name': 'Action', 'slug': 'action',...","[{'id': 876359, 'store': {'id': 9, 'name': 'it...",,"[{'id': 7, 'name': 'Multiplayer', 'slug': 'mul...",,"[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0
286587,792113,shape-runner-android-version,Shape Runner (Android Version),2022-05-26,False,https://media.rawg.io/media/screenshots/b6d/b6...,0.0,0,[],0,...,0f0f0f,"[{'platform': {'id': 21, 'name': 'Android', 's...","[{'platform': {'id': 8, 'name': 'Android', 'sl...","[{'id': 4, 'name': 'Action', 'slug': 'action',...","[{'id': 788814, 'store': {'id': 9, 'name': 'it...",,"[{'id': 45, 'name': '2D', 'slug': '2d', 'langu...",,"[{'id': -1, 'image': 'https://media.rawg.io/me...",0.0


In [35]:
df["community_rating"].unique()

array([nan,  0.])

In [59]:
df["clip"].unique()

array([None], dtype=object)

In [77]:
df["ratings"].iloc[1],df["rating"]

([{'id': 5, 'title': 'exceptional', 'count': 5317, 'percent': 76.75},
  {'id': 4, 'title': 'recommended', 'count': 1135, 'percent': 16.38},
  {'id': 3, 'title': 'meh', 'count': 296, 'percent': 4.27},
  {'id': 1, 'title': 'skip', 'count': 180, 'percent': 2.6}],
 0         4.47
 1         4.65
 2         4.59
 3         3.57
 4         4.06
           ... 
 499995    0.00
 499996    0.00
 499997    0.00
 499998    0.00
 499999    0.00
 Name: rating, Length: 500000, dtype: float64)

In [65]:
df["genres"].iloc[2]

[{'id': 2,
  'name': 'Shooter',
  'slug': 'shooter',
  'games_count': 59549,
  'image_background': 'https://media.rawg.io/media/games/2ba/2bac0e87cf45e5b508f227d281c9252a.jpg'},
 {'id': 7,
  'name': 'Puzzle',
  'slug': 'puzzle',
  'games_count': 97329,
  'image_background': 'https://media.rawg.io/media/games/4cb/4cb855e8ef1578415a928e53c9f51867.png'}]

In [83]:
df["user_game"].unique()

array([None], dtype=object)