In [1]:
import requests
import time
import numpy as np
import os
import csv
import datetime as dt
import statistics
import pandas as pd

reference:
https://github.com/fpugliese/steam-crawl

In [None]:
def request(url, params=None, count = 0):
    # maximum repeat 5 times
    if count > 5:
        return {}
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error during the request: {e}")
        time.sleep(1)
        return request(url, params, count+1)
    
    if response.status_code == 200:
        return response.json()
    elif response == None:
        print("No answer, Trying again in 10 seconds...")
        time.sleep(1)
        return request(url, params, count+1)
    else:
        print(f"Error in the API response: {response.status_code} - Parameters: {params}")
        return None
    

In [3]:
def get_app_data(start, stop, parser, pause):
    """
    Return list of app data generated from parser.
    
    parser : function to handle request
    """
    app_data = []
    
    try:
        app_list = pd.read_csv('data/steam_spy/id_name.csv')
    except FileNotFoundError:
        print("File 'id_name.csv' not found. You should run 'get_steam_spy_data.py' first.")
        return 
    except pd.errors.EmptyDataError:
        print("File 'id_name.csv' is empty. Check file contents.")
        return

    # iterate through each row of app_list, confined by start and stop
    for index, row in app_list[start:stop].iterrows():
        print('Current index: {}'.format(index), end='\r')
        
        appid = row['appid']
        name = row['name']

        # retrive app data for a row, handled by supplied parser, and append to list
        data = parser(appid, name)
        app_data.append(data)

    time.sleep(pause) # prevent overloading api with requests
    
    return app_data

In [9]:
def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause = 1):
    """Process app data in batches, writing directly to file.
    
    parser : custom function to format request
    app_list : dataframe of appid and name
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file
    
    Keyword arguments:
    
    begin : starting index (get from index_filename, default 0)
    end : index to finish (defaults to end of app_list)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (defualt 1)
    
    returns: none
    """
    print('Starting at index {}:\n'.format(begin))
    
    # by default, process all apps in app_list
    if end == -1:
        end = len(app_list) + 1
    
    # generate array of batch begin and end points
    batches = np.arange(begin, end, batchsize)
    batches = np.append(batches, end)
    
    apps_written = 0
    batch_times = []
    
    for i in range(len(batches) - 1):
        start_time = time.time()
        
        start = batches[i]
        stop = batches[i+1]
        
        app_data = get_app_data(start, stop, parser, pause)
        
        rel_path = os.path.join(download_path, data_filename)
        
        # writing app data to file
        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            
            for j in range(3,0,-1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)
            
            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')
            
        apps_written += len(app_data)
        
        idx_path = os.path.join(download_path, index_filename)
        
        # writing last index to file
        with open(idx_path, 'w') as f:
            index = stop
            print(index, file=f)
            
        # logging time taken
        end_time = time.time()
        time_taken = end_time - start_time
        
        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)
        
        est_remaining = (len(batches) - i - 2) * mean_time
        
        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))
        
        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))
            
    print('\nProcessing batches complete. {} apps written'.format(apps_written))

def reset_index(download_path, index_filename):
    """Reset index in file to 0."""
    rel_path = os.path.join(download_path, index_filename)
    
    with open(rel_path, 'w') as f:
        print(0, file=f)
        

def get_index(download_path, index_filename):
    """Retrieve index from file, returning 0 if file not found."""
    try:
        rel_path = os.path.join(download_path, index_filename)

        with open(rel_path, 'r') as f:
            index = int(f.readline())
    
    except FileNotFoundError:
        index = 0
        
    return index


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0."""
    if index == 0:
        rel_path = os.path.join(download_path, filename)

        with open(rel_path, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()

def check_folder():
    if not os.path.exists("data/steam_spy"):
        print("Creating data/steam_spy directory...\n")
        os.makedirs("data/steam_spy")
    if not os.path.exists("data/steam_store"):
        print("Creating data/steam_store directory...\n")
        os.makedirs("data/steam_store")

In [14]:

def parse_steam_request(appid, name):
    """Unique parser to handle data from Steam Store API.
    
    Returns : json formatted data (dict-like)
    """
    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}
    json_data = request(url, params=parameters, count=0)
    if json_data == {}:
        with open('wrong_ids.txt', 'a') as f:
            print(appid, file=f)
        return {}
        
    json_app_data = json_data[str(appid)]
    
    if json_app_data['success']:
        data = json_app_data['data']
    else:
        data = {'name': name, 'steam_appid': appid}
        
    return data

In [None]:
# Defines Steam Spy's URL and the parameters for the requests
url = "https://steamspy.com/api.php"
page = 0
params = {"request": "all", "page": page}

all_data = []

check_folder()

while page < 100:
        data = request(url, params=params, count=0)

        all_data = pd.concat([pd.DataFrame.from_dict(data, orient='index'), pd.DataFrame(all_data)], ignore_index=True)

        page += 1
        params = {"request": "all", "page": page}

        # SteamSpy offers 1000 apps per all request, should it provide less than 1000, it means that we reached the end
        if len(data) < 1000:
            print("Less than 1000 appids found, wrapping up.")
            break

        time.sleep(1)  # 1 second delay so we don't overload the server
        print(f"Going to page {page}...")

steam_spy_all = pd.DataFrame(all_data)


games = steam_spy_all.sort_values(by = 'appid').reset_index(drop=True)
games.to_csv("data/steam_spy/all_data.csv")
print("All data saved to data/steam_spy/all_data.csv")

SyntaxError: positional argument follows keyword argument (1344372991.py, line 11)

In [10]:
games = steam_spy_all[["appid", "name"]].sort_values(by = 'appid').reset_index(drop=True)
games.to_csv("data/steam_spy/id_name.csv")

In [16]:
# Set file parameters
download_path = 'data/steam_store/'
steam_app_data = 'steam_app_data.csv'
steam_index = 'steam_index.txt'

check_folder()

steam_columns = [
        'type', 'name', 'steam_appid', 'required_age', 'is_free', 'controller_support',
        'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame',
        'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements',
        'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice',
        'developers', 'publishers', 'demos', 'price_overview', 'packages', 'package_groups',
        'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots',
        'movies', 'recommendations', 'achievements', 'release_date', 'support_info',
        'background', 'content_descriptors'
    ]


# Retrieve last index downloaded from file
index = get_index(download_path, steam_index)

# Wipe or create data file and write headers if index is 0
prepare_data_file(download_path, steam_app_data, index, steam_columns)

# Set end and chunksize for demonstration - remove to run through entire app list
process_batches(
        parser=parse_steam_request,
        app_list=pd.read_csv('data/steam_spy/id_name.csv'),
        download_path=download_path,
        data_filename=steam_app_data,
        index_filename=steam_index,
        columns=steam_columns,
        begin=index
)    

Starting at index 14400:

Exported lines 14400-14499 to steam_app_data.csv. Batch 0 time: 0:00:49 (avg: 0:00:49, remaining: 9:51:19)
Error during the request: 429 Client Error: Too Many Requests for url: https://store.steampowered.com/api/appdetails/?appids=589860
Error during the request: 429 Client Error: Too Many Requests for url: https://store.steampowered.com/api/appdetails/?appids=589860
Error during the request: 429 Client Error: Too Many Requests for url: https://store.steampowered.com/api/appdetails/?appids=589860
Error during the request: 429 Client Error: Too Many Requests for url: https://store.steampowered.com/api/appdetails/?appids=589860
Error during the request: 429 Client Error: Too Many Requests for url: https://store.steampowered.com/api/appdetails/?appids=589860
Error during the request: 429 Client Error: Too Many Requests for url: https://store.steampowered.com/api/appdetails/?appids=589860
Error during the request: 429 Client Error: Too Many Requests for url: http