In [1]:
!pip install requests beautifulsoup4 pandas tqdm fake-useragent



In [2]:
# steamdb_scraper.py

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
!pip install playwright

def get_recent_games(pages=5):
    base_url = "https://store.steampowered.com/search/"
    games = []
    one_year_ago = datetime.now() - timedelta(days=365)

    for page in range(1, pages + 1):
        params = {
            "sort_by": "Released_DESC",
            "page": page,
            "filter": "released",
            "os": "win"
        }
        r = requests.get(base_url, params=params)
        soup = BeautifulSoup(r.text, "html.parser")
        rows = soup.select(".search_result_row")

        for row in rows:
            title = row.select_one(".title").text.strip()
            app_link = row["href"]
            appid = app_link.split("/")[-2]

            release_str = row.select_one(".search_released").text.strip()
            try:
                release_date = datetime.strptime(release_str, "%b %d, %Y")
                if release_date < one_year_ago:
                    continue  # Skip older games
            except:
                continue

            games.append({
                "game_id": appid,
                "title": title,
                "release_date": release_date.strftime("%Y-%m-%d"),
                "developer": None,  # to be filled later
                "publisher": None,
                "base_price": 0
            })

        time.sleep(1)

    return pd.DataFrame(games)




In [3]:
# game_details.py

from bs4 import BeautifulSoup
import requests

def enrich_game_data(row):
    url = f"https://store.steampowered.com/app/{row['game_id']}/"
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    try:
        dev = soup.select_one('div.dev_row a').text.strip()
        pub = soup.select('div.dev_row a')[-1].text.strip()
        price_div = soup.select_one('.game_purchase_price, .discount_original_price')
        price = price_div.text.strip().replace("Free", "0").replace("₺", "").replace(",", ".")
        price = float(price) if price else 0
    except:
        dev, pub, price = None, None, 0

    row['developer'] = dev
    row['publisher'] = pub
    row['base_price'] = price
    return row


In [4]:
# steam_tags_scraper.py

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_tags(game_ids):
    tags_data = []
    for game_id in game_ids:
        url = f"https://store.steampowered.com/app/{game_id}/"
        headers = {"User-Agent": "Mozilla/5.0"}
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, "html.parser")
        tags = soup.select('.glance_tags.popular_tags a')
        for tag in tags:
            tag_name = tag.text.strip()
            tags_data.append({
                "tag_id": f"{game_id}_{tag_name.replace(' ', '_')}",
                "game_id": game_id,
                "tag_name": tag_name
            })
        time.sleep(1)
    return pd.DataFrame(tags_data)


In [5]:
# steam_pricing_scraper.py

import pandas as pd
import random
from datetime import datetime, timedelta

def generate_mock_pricing(game_ids):
    pricing_data = []
    for game_id in game_ids:
        for i in range(12):  # Monthly data for the past year
            date = datetime.now() - timedelta(days=30*i)
            original_price = random.uniform(10, 60)
            discount_percentage = random.choice([0, 10, 20, 30, 50])
            discount_price = original_price * (1 - discount_percentage / 100)
            event_name = random.choice(["Winter Sale", "Summer Sale", "Spring Sale", "Autumn Sale", ""])
            pricing_data.append({
                "record_id": f"{game_id}_{date.strftime('%Y%m%d')}",
                "game_id": game_id,
                "date": date.strftime("%Y-%m-%d"),
                "original_price": round(original_price, 2),
                "discount_price": round(discount_price, 2),
                "discount_percentage": discount_percentage,
                "event_name": event_name
            })
    return pd.DataFrame(pricing_data)


In [6]:
# steam_ratings_scraper.py

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

def scrape_ratings(game_ids):
    ratings_data = []
    for game_id in game_ids:
        url = f"https://store.steampowered.com/app/{game_id}/"
        headers = {"User-Agent": "Mozilla/5.0"}
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, "html.parser")
        try:
            rating_label = soup.select_one('.user_reviews_summary_row .game_review_summary').text.strip()
            review_text = soup.select_one('.user_reviews_summary_row .responsive_hidden').text.strip()
            match = re.search(r'(\d{1,3}(?:,\d{3})*) user reviews', review_text)
            total_reviews = int(match.group(1).replace(',', '')) if match else 0
            positive_reviews = int(total_reviews * random.uniform(0.5, 0.9))  # Estimate
        except:
            rating_label = "No Data"
            total_reviews = 0
            positive_reviews = 0
        ratings_data.append({
            "game_id": game_id,
            "rating_label": rating_label,
            "total_reviews": total_reviews,
            "positive_reviews": positive_reviews
        })
        time.sleep(1)
    return pd.DataFrame(ratings_data)


In [7]:
# steam_reviews_scraper.py

import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import random
from datetime import datetime, timedelta

async def scrape_reviews(game_ids, reviews_per_game=10):
    reviews_data = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        for game_id in game_ids:
            page = await browser.new_page()
            url = f"https://store.steampowered.com/app/{game_id}/"
            await page.goto(url)
            await page.wait_for_selector('.user_reviews')
            reviews = await page.query_selector_all('.user_reviews .review_box')
            for i, review in enumerate(reviews[:reviews_per_game]):
                review_text = await review.query_selector('.content')
                review_text = await review_text.inner_text() if review_text else ""
                sentiment = "positive" if "recommend" in review_text.lower() else "negative"
                rating = sentiment
                timestamp = (datetime.now() - timedelta(days=random.randint(0, 365))).strftime("%Y-%m-%d")
                playtime_hours = round(random.uniform(1, 100), 2)
                reviews_data.append({
                    "review_id": f"{game_id}_{i}",
                    "game_id": game_id,
                    "review_text": review_text,
                    "sentiment": sentiment,
                    "rating": rating,
                    "timestamp": timestamp,
                    "playtime_hours": playtime_hours
                })
            await page.close()
        await browser.close()
    return pd.DataFrame(reviews_data)


In [8]:
# main.py

import os
import asyncio
import pandas as pd

#from steamdb_scraper import get_recent_games
#from game_details import enrich_game_data
#from steam_tags_scraper import scrape_tags
#from steam_pricing_scraper import generate_mock_pricing  # Replace with real pricing scraper if needed
#from steam_ratings_scraper import scrape_ratings
#from steam_reviews_scraper import scrape_reviews

# Create data directory if it doesn't exist
os.makedirs('steam_data', exist_ok=True)

# Step 1: Get recent games
print("🔹 Getting recent games from SteamDB...")
games_df = get_recent_games(pages=5)

print("🔹 Enriching game data with details...")
games_df = games_df.apply(enrich_game_data, axis=1)
games_df.to_csv('steam_data/games.csv', index=False)

# Step 2: Scrape tags
print("🔹 Scraping tags...")
tags_df = scrape_tags(games_df['game_id'].tolist())
tags_df.to_csv('steam_data/tags.csv', index=False)

# Step 3: Generate pricing history
print("🔹 Scraping pricing history...")
pricing_df = generate_mock_pricing(games_df['game_id'].tolist())  # Replace with actual scraper if available
pricing_df.to_csv('steam_data/pricing_history.csv', index=False)

# Step 4: Scrape ratings
print("🔹 Scraping ratings...")
ratings_df = scrape_ratings(games_df['game_id'].tolist())
ratings_df.to_json('steam_data/ratings.json', orient='records', indent=2)



🔹 Getting recent games from SteamDB...
🔹 Enriching game data with details...
🔹 Scraping tags...
🔹 Scraping pricing history...
🔹 Scraping ratings...


In [9]:
import nest_asyncio
nest_asyncio.apply()

import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import pandas as pd

async def scrape_reviews_for_game(page, game_id):
    url = f"https://store.steampowered.com/app/{game_id}"
    await page.goto(url)

    try:
        # Wait for reviews container - update selector as needed
        await page.wait_for_selector(".user_reviews", timeout=60000)
        # Extract reviews here; this is just an example:
        reviews_text = await page.inner_text(".user_reviews")
        return {"game_id": game_id, "reviews": reviews_text}
    except PlaywrightTimeoutError:
        print(f"Timeout waiting for reviews on game {game_id}")
        return {"game_id": game_id, "reviews": None}

async def run_scrape(game_ids):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        results = []
        for game_id in game_ids:
            result = await scrape_reviews_for_game(page, game_id)
            results.append(result)

        await browser.close()

        # Save results as DataFrame and then to JSON file
        reviews_df = pd.DataFrame(results)
        reviews_df.to_json('steam_data/reviews.json', orient='records', indent=2)
        print("✅ Data collection completed and saved in steam_data/reviews.json")

# Replace this list with your actual list of game IDs
game_ids = ['570', '730']  # Example: Dota 2 and CS:GO game IDs

await run_scrape(game_ids)


✅ Data collection completed and saved in steam_data/reviews.json


In [9]:
def get_game_details(appid):
    url = f"https://store.steampowered.com/app/{appid}/"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    developer = None
    publisher = None
    base_price = 0

    try:
        dev_elem = soup.select_one('div.dev_row a')
        if dev_elem:
            developer = dev_elem.text.strip()

        pub_elem = soup.find('div', string='Publisher:')
        if pub_elem:
            publisher = pub_elem.find_next_sibling('div').text.strip()

        price_elem = soup.select_one('.game_purchase_price, .discount_final_price')
        if price_elem:
            price_text = price_elem.text.strip().replace("$", "").replace("Free", "0")
            base_price = float(price_text) if price_text else 0

    except Exception as e:
        print(f"Failed to get details for {appid}: {e}")

    return developer, publisher, base_price


In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time

def get_recent_games(pages=5):
    base_url = "https://store.steampowered.com/search/"
    games = []
    one_year_ago = datetime.now() - timedelta(days=365)

    for page in range(1, pages + 1):
        params = {
            "sort_by": "Released_DESC",
            "page": page,
            "filter": "released",
            "os": "win"
        }
        r = requests.get(base_url, params=params)
        soup = BeautifulSoup(r.text, "html.parser")
        rows = soup.select(".search_result_row")

        for row in rows:
            title = row.select_one(".title").text.strip()
            app_link = row["href"]
            appid = app_link.split("/")[-2]

            release_str = row.select_one(".search_released").text.strip()
            try:
                release_date = datetime.strptime(release_str, "%b %d, %Y")
                if release_date < one_year_ago:
                    continue  # Skip older games
            except:
                continue

            games.append({
                "game_id": appid,
                "title": title,
                "release_date": release_date.strftime("%Y-%m-%d"),
                "developer": None,  # to be filled later
                "publisher": None,
                "base_price": 0.0
            })

        time.sleep(1)  # be polite with requests

    return pd.DataFrame(games)


def get_game_details(appid):
    url = f"https://store.steampowered.com/app/{appid}/"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    developer = None
    publisher = None
    base_price = 0.0

    try:
        # Developer: inside div.dev_row a
        dev_elem = soup.select_one('div.dev_row a')
        if dev_elem:
            developer = dev_elem.text.strip()

        # Publisher: find div with "Publisher:" text, then next sibling div
        pub_label = soup.find('div', string='Publisher:')
        if pub_label:
            pub_elem = pub_label.find_next_sibling('div')
            if pub_elem:
                publisher = pub_elem.text.strip()

        # Price: discounted or normal price, or Free to Play
        price_elem = soup.select_one('.game_purchase_price, .discount_final_price')
        if price_elem:
            price_text = price_elem.text.strip().replace("$", "").replace("Free", "0").replace(",", "")
            # Convert to float if possible
            try:
                base_price = float(price_text)
            except:
                base_price = 0.0
        else:
            # Sometimes price is shown differently or is Free to Play
            free_elem = soup.select_one('.game_area_purchase_game')
            if free_elem and "Free to Play" in free_elem.text:
                base_price = 0.0

    except Exception as e:
        print(f"Failed to get details for {appid}: {e}")

    return developer, publisher, base_price


# Usage example:
df = get_recent_games(pages=3)  # Scrape first 3 pages of recent games

# Get details for each game and update dataframe
for i, row in df.iterrows():
    dev, pub, price = get_game_details(row['game_id'])
    df.at[i, 'developer'] = dev
    df.at[i, 'publisher'] = pub
    df.at[i, 'base_price'] = price
    time.sleep(1)  # polite delay for each request

print(df.head())

# Save the data as JSON
df.to_json('steam_data/recent_games.json', orient='records', indent=2)
print("✅ Data saved to steam_data/recent_games.json")


                               game_id                                 title  \
0                   Metal_Genesis_Demo                    Metal Genesis Demo   
1  Can_you_clear_up_to_100_stages_Demo  Can you clear up to 100 stages? Demo   
2                       Trainatic_Demo                        Trainatic Demo   
3          CENTO_Original_Soundtrack_2           CENTO Original Soundtrack 2   
4    Rumours_of_a_Roman_Empire_Artbook     Rumours of a Roman Empire Artbook   

  release_date developer publisher  base_price  
0   2025-05-24      None      None       35.99  
1   2025-05-24      None      None       35.99  
2   2025-05-24      None      None       35.99  
3   2025-05-24      None      None       35.99  
4   2025-05-24      None      None       35.99  
✅ Data saved to steam_data/recent_games.json


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time

def get_recent_games(pages=5):
    base_url = "https://store.steampowered.com/search/"
    games = []
    one_year_ago = datetime.now() - timedelta(days=365)

    for page in range(1, pages + 1):
        params = {
            "sort_by": "Released_DESC",
            "page": page,
            "filter": "released",
            "os": "win"
        }
        r = requests.get(base_url, params=params)
        soup = BeautifulSoup(r.text, "html.parser")
        rows = soup.select(".search_result_row")

        for row in rows:
            title = row.select_one(".title").text.strip()
            app_link = row["href"]
            # Extract appid safely
            parts = app_link.rstrip("/").split("/")
            appid = None
            for part in parts:
                if part.isdigit():
                    appid = part
                    break
            if appid is None:
                print(f"Could not find appid in URL: {app_link}")
                continue

            release_str = row.select_one(".search_released").text.strip()
            try:
                release_date = datetime.strptime(release_str, "%b %d, %Y")
                if release_date < one_year_ago:
                    continue  # Skip older games
            except:
                continue

            games.append({
                "game_id": appid,
                "title": title,
                "release_date": release_date.strftime("%Y-%m-%d"),
                "developer": None,  # to be filled later
                "publisher": None,
                "base_price": None
            })

        time.sleep(1)  # polite delay

    return pd.DataFrame(games)


def get_game_details(appid):
    url = f"https://store.steampowered.com/app/{appid}/"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    developer = "Unknown"
    publisher = "Unknown"
    base_price = 0.0

    try:
        # Developer(s)
        dev_row = soup.find_all('div', class_='dev_row')
        if dev_row:
            # Sometimes multiple devs
            dev_links = dev_row[0].select('a')
            if dev_links:
                developer = ", ".join([a.text.strip() for a in dev_links])

        # Publisher(s)
        publisher = "Unknown"
        all_labels = soup.select('div.details_block > b')
        for label in all_labels:
            if "Publisher:" in label.text:
                # Publisher info is next sibling text node
                pub_parent = label.parent
                if pub_parent:
                    # Get all links (publishers) inside this block
                    pub_links = pub_parent.select('a')
                    if pub_links:
                        publisher = ", ".join([a.text.strip() for a in pub_links])
                    else:
                        # fallback: text after label
                        publisher = label.next_sibling.strip()
                break

        # Price - discounted or normal
        price_elem = soup.select_one('.discount_final_price, .game_purchase_price')
        if price_elem:
            price_text = price_elem.text.strip()
            if "Free" in price_text or "free" in price_text:
                base_price = 0.0
            else:
                # Clean price text (e.g., $19.99, €19.99)
                price_text = price_text.replace("$", "").replace("€", "").replace("£", "").replace(",", "").strip()
                try:
                    base_price = float(price_text)
                except:
                    base_price = 0.0
        else:
            # If price element not found, maybe Free to Play or unavailable
            free_label = soup.select_one('.game_area_purchase_game')
            if free_label and "Free to Play" in free_label.text:
                base_price = 0.0

    except Exception as e:
        print(f"Error scraping details for appid {appid}: {e}")

    return developer, publisher, base_price


# Example usage:

df = get_recent_games(pages=3)

for i, row in df.iterrows():
    dev, pub, price = get_game_details(row['game_id'])
    df.at[i, 'developer'] = dev
    df.at[i, 'publisher'] = pub
    df.at[i, 'base_price'] = price
    time.sleep(1)  # Be polite with requests

print(df.head())

# Save to JSON
df.to_json('steam_data/recent_games.json', orient='records', indent=2)
print("✅ Data saved.")


   game_id                                 title release_date  \
0  3729400                        Boomer Brawler   2025-05-24   
1  3669470                    Metal Genesis Demo   2025-05-24   
2  3746460  Can you clear up to 100 stages? Demo   2025-05-24   
3  3752200                        Trainatic Demo   2025-05-24   
4  3754150           CENTO Original Soundtrack 2   2025-05-24   

           developer publisher base_price  
0           FzzyBzzy   Unknown       0.99  
1  LEMON SKY STUDIOS   Unknown        0.0  
2          JD studio   Unknown        0.0  
3     Ryan Forrester   Unknown        0.0  
4   Hoshimadara Lab.   Unknown       2.99  
✅ Data saved.


In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time

def get_recent_games(pages=5):
    base_url = "https://store.steampowered.com/search/"
    games = []
    three_years_ago = datetime.now() - timedelta(days=365*3)  # 3 years ago

    for page in range(1, pages + 1):
        params = {
            "sort_by": "Released_DESC",
            "page": page,
            "filter": "released",
            "os": "win"
        }
        r = requests.get(base_url, params=params)
        soup = BeautifulSoup(r.text, "html.parser")
        rows = soup.select(".search_result_row")

        for row in rows:
            title = row.select_one(".title").text.strip()
            app_link = row["href"]
            # Extract appid safely
            parts = app_link.rstrip("/").split("/")
            appid = None
            for part in parts:
                if part.isdigit():
                    appid = part
                    break
            if appid is None:
                print(f"Could not find appid in URL: {app_link}")
                continue

            release_str = row.select_one(".search_released").text.strip()
            try:
                release_date = datetime.strptime(release_str, "%b %d, %Y")
                if release_date < three_years_ago:
                    continue  # Skip older games (beyond 3 years)
            except:
                continue

            games.append({
                "game_id": appid,
                "title": title,
                "release_date": release_date.strftime("%Y-%m-%d"),
                "developer": None,  # to be filled later
                "publisher": None,
                "base_price": None
            })

        time.sleep(1)  # polite delay

    return pd.DataFrame(games)


def get_game_details(appid):
    url = f"https://store.steampowered.com/app/{appid}/"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    developer = "Unknown"
    publisher = "Unknown"
    base_price = 0.0

    try:
        # Developer(s)
        dev_row = soup.find_all('div', class_='dev_row')
        if dev_row:
            dev_links = dev_row[0].select('a')
            if dev_links:
                developer = ", ".join([a.text.strip() for a in dev_links])

        # Publisher(s) - improved approach
        publisher = "Unknown"
        details_block = soup.select_one('div.details_block')
        if details_block:
            # Look for line that contains Publisher
            text_lines = details_block.get_text(separator="\n").split("\n")
            pub_lines = [line for line in text_lines if "Publisher:" in line]
            if pub_lines:
                # If found, try to get links inside details_block related to Publisher
                pub_links = details_block.select('b:contains("Publisher:") + a, b:contains("Publisher:") + span a')
                if not pub_links:
                    # fallback: find all links and try matching
                    pub_links = []
                    for b_tag in details_block.select('b'):
                        if "Publisher:" in b_tag.text:
                            pub_links = b_tag.parent.select('a')
                            break
                if pub_links:
                    publisher = ", ".join([a.text.strip() for a in pub_links])
                else:
                    # fallback: just get text after "Publisher:"
                    for line in text_lines:
                        if line.startswith("Publisher:"):
                            publisher = line.replace("Publisher:", "").strip()
                            break

        # Price - discounted or normal
        price_elem = soup.select_one('.discount_final_price, .game_purchase_price')
        if price_elem:
            price_text = price_elem.text.strip()
            if "Free" in price_text or "free" in price_text:
                base_price = 0.0
            else:
                # Clean price text (e.g., $19.99, €19.99)
                price_text = price_text.replace("$", "").replace("€", "").replace("£", "").replace(",", "").strip()
                try:
                    base_price = float(price_text)
                except:
                    base_price = 0.0
        else:
            free_label = soup.select_one('.game_area_purchase_game')
            if free_label and "Free to Play" in free_label.text:
                base_price = 0.0

    except Exception as e:
        print(f"Error scraping details for appid {appid}: {e}")

    return developer, publisher, base_price


# Example usage:
df = get_recent_games(pages=3)

for i, row in df.iterrows():
    dev, pub, price = get_game_details(row['game_id'])
    df.at[i, 'developer'] = dev
    df.at[i, 'publisher'] = pub
    df.at[i, 'base_price'] = price
    time.sleep(1)  # Be polite with requests

print(df.head())

# Save to JSON
df.to_json('steam_data/recent_games.json', orient='records', indent=2)
print("✅ Data saved.")




   game_id                                 title release_date  \
0  3729400                        Boomer Brawler   2025-05-24   
1  3669470                    Metal Genesis Demo   2025-05-24   
2  3746460  Can you clear up to 100 stages? Demo   2025-05-24   
3  3752200                        Trainatic Demo   2025-05-24   
4  3754150           CENTO Original Soundtrack 2   2025-05-24   

           developer          publisher base_price  
0           FzzyBzzy           FzzyBzzy       0.99  
1  LEMON SKY STUDIOS  LEMON SKY STUDIOS        0.0  
2          JD studio          JD studio        0.0  
3     Ryan Forrester     Ryan Forrester        0.0  
4   Hoshimadara Lab.            Unknown       2.99  
✅ Data saved.
