In [1]:
# standard library imports
import csv
import datetime as dt
import json
import os
import statistics
import time

# third-party imports
import numpy as np
import pandas as pd
import requests
import pickle

# customisations - ensure tables show all columns
# Use the fully-qualified option name: in recent pandas releases the bare
# "max_columns" pattern also matches "styler.render.max_columns" and raises OptionError.
pd.set_option("display.max_columns", 100)

In [None]:
def get_request(url, parameters=None):
    """Return json-formatted response of a get request using optional parameters.

    Retries until a truthy response arrives: an SSL error is followed by a
    5 second countdown, a falsy response by a 10 second wait.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request

    Returns
    -------
    json_data
        json-formatted response (dict-like)
    """
    # Loop rather than recurse so a long outage cannot exhaust the recursion limit.
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        except requests.exceptions.SSLError as s:
            # Reference SSLError through `requests`: the bare name is not imported
            # by the import cell, so `except SSLError` would raise NameError.
            print('SSL Error:', s)

            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)
            continue

        if response:
            return response.json()

        # A requests Response is falsy when its status code signals an error
        # (4xx/5xx); this usually means too many requests. Wait and try again.
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

In [None]:
url = "https://steamspy.com/api.php"
parameters = {"request": "all"}
app_list_path = 'app_list.csv'

# The batch downloaders resume from a row index saved on disk, so app_list must
# stay identical across download sessions: only query the API (and export the
# result) when no stored copy exists yet.
if not os.path.exists(app_list_path):
    # request 'all' from steam spy and parse into dataframe
    json_data = get_request(url, parameters=parameters)
    steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')

    # generate sorted app_list from steamspy data and store it for later sessions
    app_list = steam_spy_all[['appid', 'name']].sort_values('appid').reset_index(drop=True)
    app_list.to_csv(app_list_path, index=False)

# always read from stored csv so every session works from the same rows
app_list = pd.read_csv(app_list_path)

# display first few rows
app_list.head()

In [None]:
def get_app_data(start, stop, parser, pause):
    """Collect parser output for the rows of the global app_list in [start, stop).

    parser : callable taking (appid, name) and returning the data for one app
    pause : seconds to sleep after each parser call
    """
    selected_rows = app_list[start:stop]
    collected = []

    for position, record in selected_rows.iterrows():
        print('Current index: {}'.format(position), end='\r')

        # delegate the request to the supplied parser and keep its result
        collected.append(parser(record['appid'], record['name']))

        # throttle so the api is not overloaded with requests
        time.sleep(pause)

    return collected

In [None]:
def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=50, pause=1):
    """Process app data in batches, writing directly to file.

    After each batch the rows are appended to the data file and the index file
    is overwritten with the batch's stop index, so a later run can resume there.

    parser : custom function to format request
    app_list : dataframe of appid and name (used here for its length only;
        get_app_data reads the module-level app_list)
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file

    Keyword arguments:

    begin : starting index (get from index_filename, default 0)
    end : index to finish, exclusive (-1, the default, means end of app_list)
    batchsize : number of apps to write in each batch (default 50)
    pause : time to wait after each api request (default 1)

    returns: none
    """
    print('Starting at index {}:\n'.format(begin))

    # by default, process all apps in app_list; `end` is an exclusive slice bound,
    # so it is exactly len(app_list) - adding 1 would store an index past the data
    if end == -1:
        end = len(app_list)

    # generate array of batch begin and end points
    batches = np.arange(begin, end, batchsize)
    batches = np.append(batches, end)

    # output paths do not change between batches
    rel_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)

    apps_written = 0
    batch_times = []

    for i in range(len(batches) - 1):
        start_time = time.time()

        start = batches[i]
        stop = batches[i+1]

        app_data = get_app_data(start, stop, parser, pause)

        # writing app data to file
        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            # keys of a row that are not listed in `columns` are silently dropped
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

            for j in range(3,0,-1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)

            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        apps_written += len(app_data)

        # writing last index to file
        with open(idx_path, 'w') as f:
            index = stop
            print(index, file=f)

        # logging time taken
        end_time = time.time()
        time_taken = end_time - start_time

        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)

        # batches still to run after this one, times the average batch duration
        est_remaining = (len(batches) - i - 2) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} apps written'.format(apps_written))

In [2]:
def reset_index(download_path, index_filename):
    """Write 0 to the index file, replacing any previous content."""
    index_file = os.path.join(download_path, index_filename)

    with open(index_file, 'w') as handle:
        handle.write('0\n')
        

def get_index(download_path, index_filename):
    """Read the stored index from the first line of the file; a missing file counts as 0."""
    index_file = os.path.join(download_path, index_filename)

    try:
        with open(index_file, 'r') as handle:
            return int(handle.readline())
    except FileNotFoundError:
        return 0


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0.

    Any existing file at that path is overwritten. Nothing happens when index
    is not 0.

    download_path : directory containing the data file
    filename : name of the data file
    index : index to resume from; only 0 triggers (re)creation
    columns : column names written as the csv header row
    """
    if index == 0:
        rel_path = os.path.join(download_path, filename)

        # utf-8 to match the encoding the batch writers use when appending rows
        with open(rel_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()

In [None]:
def parse_steam_request(appid, name):
    """Fetch one app's details from the Steam Store API.

    Returns : the 'data' entry of the response when it reports success,
    otherwise a minimal dict holding only the supplied name and appid
    """
    response_json = get_request(
        "http://store.steampowered.com/api/appdetails/",
        parameters={"appids": appid},
    )
    # the payload is keyed by the appid rendered as a string
    app_entry = response_json[str(appid)]

    if not app_entry['success']:
        return {'name': name, 'steam_appid': appid}

    return app_entry['data']


# # Set file parameters
# download_path = '../data/download'
# steam_app_data = 'steam_app_data.csv'
# steam_index = 'steam_index.txt'

# steam_columns = [
#     'name', 'steam_appid',
   
#     'pc_requirements',
    
   
#    'genres'
# ]


# # Overwrites last index for demonstration (would usually store highest index so can continue across sessions)
# reset_index(download_path, steam_index)

# # Retrieve last index downloaded from file
# index = get_index(download_path, steam_index)

# # Wipe or create data file and write headers if index is 0
# prepare_data_file(download_path, steam_app_data, index, steam_columns)

# # Set end and chunksize for demonstration - remove to run through entire app list
# process_batches(
#     parser=parse_steam_request,
#     app_list=app_list,
#     download_path=download_path,
#     data_filename=steam_app_data,
#     index_filename=steam_index,
#     columns=steam_columns,
#     begin=index,
#     end=10,
#     batchsize=5
# )

In [None]:
def parse_steamspy_request(appid, name):
    """Request the SteamSpy 'appdetails' record for appid and return it unchanged.

    name is accepted but not used by this parser.
    """
    query = {"request": "appdetails", "appid": appid}
    return get_request("https://steamspy.com/api.php", query)

# set files and columns
download_path = '../'
steamspy_data = 'steamspy_data.csv'
steamspy_index = 'steamspy_index.txt'

# columns written to the csv; the DictWriter in process_batches ignores any other keys
steamspy_columns = [
    'appid', 'name', 'developer', 'publisher', 'score_rank', 'positive',
    'negative', 'userscore', 'owners', 'average_forever', 'average_2weeks',
    'median_forever', 'median_2weeks', 'price', 'initialprice', 'discount',
    'languages', 'genre', 'ccu', 'tags'
]

# reset_index(download_path, steamspy_index)
# resume from the index stored by a previous run (0 when no index file exists)
index = get_index(download_path, steamspy_index)

# Wipe data file if index is 0
prepare_data_file(download_path, steamspy_data, index, steamspy_columns)

# download everything from `index` to the end of app_list (end=-1) in batches
# of 50, sleeping 0.3 seconds after each request
process_batches(
    parser=parse_steamspy_request,
    app_list=app_list,
    download_path=download_path, 
    data_filename=steamspy_data,
    index_filename=steamspy_index,
    columns=steamspy_columns,
    begin=index,
    end=-1,
    batchsize=50,
    pause=0.3
)

In [None]:
# number of rows in app_list
len(app_list)

In [None]:
# every row of app_list except the final 50
app_list[:-50]

In [None]:
# preview the first 50 rows of the steamspy csv
pd.read_csv('../steamspy_data.csv').head(50)

In [3]:
def process_library_batches(parser, steamid_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=50, pause=1):
    """Process library data in batches, writing directly to file.

    After each batch the rows are appended to the data file and the index file
    is overwritten with the batch's stop index, so a later run can resume there.

    parser : custom function to format request
    steamid_list : sequence of steamids (used here for its length only;
        get_library_data reads the module-level steamid_list)
    download_path : path to store data
    data_filename : filename to save library data
    index_filename : filename to store highest index written
    columns : column names for file

    Keyword arguments:

    begin : starting index (get from index_filename, default 0)
    end : index to finish, exclusive (-1, the default, means end of steamid_list)
    batchsize : number of libraries to write in each batch (default 50)
    pause : time to wait after each api request (default 1)

    returns: none
    """
    print('Starting at index {}:\n'.format(begin))

    # by default, process all steamids in steamid_list; `end` is an exclusive slice
    # bound, so it is exactly len(steamid_list) - adding 1 would store an index
    # past the data
    if end == -1:
        end = len(steamid_list)

    # generate array of batch begin and end points
    batches = np.arange(begin, end, batchsize)
    batches = np.append(batches, end)

    # output paths do not change between batches
    rel_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)

    libraries_written = 0
    batch_times = []

    for i in range(len(batches) - 1):
        start_time = time.time()

        start = batches[i]
        stop = batches[i+1]

        library_data = get_library_data(start, stop, parser, pause)

        # writing library data to file
        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            # keys of a row that are not listed in `columns` are silently dropped
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

            for j in range(3,0,-1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)

            writer.writerows(library_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        libraries_written += len(library_data)

        # writing last index to file
        with open(idx_path, 'w') as f:
            index = stop
            print(index, file=f)

        # logging time taken
        end_time = time.time()
        time_taken = end_time - start_time

        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)

        # batches still to run after this one, times the average batch duration
        est_remaining = (len(batches) - i - 2) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} libraries written'.format(libraries_written))

In [4]:
def get_library_data(start, stop, parser, pause):
    """Return list of library data generated from parser.

    Iterates over the module-level steamid_list, confined by start and stop.

    start : first position of steamid_list to process
    stop : position to stop before (exclusive)
    parser : function to handle request, called with a single steamid
    pause : seconds to sleep after each parser call
    """
    library_data = []

    # enumerate from `start` so the progress line reports the position actually
    # being processed (there is no `index` variable in this function's scope)
    for position, steamid in enumerate(steamid_list[start:stop], start=start):
        print('Current index: {}'.format(position), end='\r')

        # retrieve library data for a steamid, handled by supplied parser, and append to list
        data = parser(steamid)
        library_data.append(data)

        time.sleep(pause) # prevent overloading api with requests

    return library_data

In [5]:
def parse_steam_library(steamid):
    """Parser to handle library data.

    Returns a dict with the steamid and a 'library' entry: a list of
    {'appid', 'name', 'hours'} dicts built from the response's games, or the
    string 'hidden' when the response body is empty.
    """
    url = "https://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/"
    parameters = {'include_appinfo': True, 'steamid': steamid}

    json_data = get_library_request(url, parameters)
    response = json_data['response']

    if not response:
        return {'steamid': steamid, 'library': 'hidden'}

    # NOTE(review): 'hours' stores playtime_forever as-is; the Steam Web API
    # appears to report that field in minutes - confirm before using as hours.
    library = [
        {'appid': game['appid'], 'name': game['name'], 'hours': game['playtime_forever']}
        # a non-empty response may carry no 'games' key; treat that as an empty library
        for game in response.get('games', [])
    ]
    return {'steamid': steamid, 'library': library}

In [6]:
def get_library_request(url, parameters=None):
    """Return json-formatted response of a get request using optional parameters.

    The Steam Web API key is read from the STEAM_API_KEY environment variable
    and sent with 'format': 'json' alongside the supplied parameters. Retries
    until a truthy response arrives: an SSL error is followed by a 5 second
    countdown, a falsy response by a 10 second wait.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request

    Returns
    -------
    json_data
        json-formatted response (dict-like)

    Raises
    ------
    RuntimeError
        if STEAM_API_KEY is not set
    """
    # Credentials must not live in the notebook; take the key from the environment.
    api_key = os.environ.get('STEAM_API_KEY')
    if not api_key:
        raise RuntimeError(
            'Set the STEAM_API_KEY environment variable to your Steam Web API key.'
        )

    # `parameters` defaults to None, which cannot be unpacked with **
    query = {**(parameters or {}), 'key': api_key, 'format': 'json'}

    # Loop rather than recurse so a long outage cannot exhaust the recursion limit.
    while True:
        try:
            response = requests.get(url, params=query)
        except requests.exceptions.SSLError as s:
            # Reference SSLError through `requests`: the bare name is not imported
            # by the import cell, so `except SSLError` would raise NameError.
            print('SSL Error:', s)

            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)
            continue

        if response:
            return response.json()

        # A requests Response is falsy when its status code signals an error
        # (4xx/5xx); this usually means too many requests. Wait and try again.
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

In [7]:
# Load the pickled steamid_list from the working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only load a steamid_list.pickle that comes from a trusted source.
with open('steamid_list.pickle', 'rb') as handle:
   steamid_list = pickle.load(handle)

In [8]:
# display the loaded steamids
steamid_list

[76561198219067393,
 76561198148157441,
 76561198993539076,
 76561198247182340,
 76561198278705159,
 76561198306000904,
 76561199041871881,
 76561198398210058,
 76561198313209867,
 76561198170079242,
 76561199187853325,
 76561198424457231,
 76561199073034256,
 76561198088650778,
 76561198128726044,
 76561198886682654,
 76561198311899167,
 76561199160819745,
 76561199085551652,
 76561198401257513,
 76561199063236653,
 76561198809874484,
 76561199122907193,
 76561197992706107,
 76561198836187202,
 76561198372618308,
 76561198063386696,
 76561198998093897,
 76561198147928140,
 76561199100035151,
 76561198168342632,
 76561199221735528,
 76561198129545322,
 76561199056158837,
 76561198110376063,
 76561198894416000,
 76561198134493312,
 76561199123759235,
 76561198157955203,
 76561199096168586,
 76561198797291660,
 76561198800797837,
 76561198036582543,
 76561198332706960,
 76561198352302223,
 76561198018494613,
 76561199085518999,
 76561198330806423,
 76561199208988823,
 76561197991952543,


In [9]:
# set files and columns
download_path = '../data'
library_data = 'library_data.csv'
library_index = 'library_index.txt'
# read the Steam Web API key from the environment instead of committing it to the
# notebook (None when STEAM_API_KEY is unset)
api_key = os.environ.get('STEAM_API_KEY')

library_columns = [
    'steamid', 'library'
]

# reset_index(download_path, library_index)
# resume from the index stored by a previous run (0 when no index file exists)
index = get_index(download_path, library_index)

# Wipe data file if index is 0
prepare_data_file(download_path, library_data, index, library_columns)

# download every library from `index` to the end of steamid_list (end=-1) in
# batches of 10, sleeping 0.4 seconds after each request
process_library_batches(
    parser=parse_steam_library,
    steamid_list=steamid_list,
    download_path=download_path,
    data_filename=library_data,
    index_filename=library_index,
    columns=library_columns,
    begin=index,
    end=-1,
    batchsize=10,
    pause=.4
)

Starting at index 80:

No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...


KeyboardInterrupt: 

In [11]:
# preview the first rows of the library csv
pd.read_csv('../data/library_data.csv').head()

Unnamed: 0,steamid,library
0,76561198219067393,"[{'appid': 220, 'name': 'Half-Life 2', 'hours'..."
1,76561198148157441,"[{'appid': 17390, 'name': 'Spore', 'hours': 26..."
2,76561198993539076,hidden
3,76561198247182340,hidden
4,76561198278705159,hidden


In [12]:
# load the library csv into a dataframe
library_df = pd.read_csv('../data/library_data.csv')

In [13]:
# show up to the first 50 rows of library_df
library_df.head(50)

Unnamed: 0,steamid,library
0,76561198219067393,"[{'appid': 220, 'name': 'Half-Life 2', 'hours'..."
1,76561198148157441,"[{'appid': 17390, 'name': 'Spore', 'hours': 26..."
2,76561198993539076,hidden
3,76561198247182340,hidden
4,76561198278705159,hidden
5,76561198306000904,hidden
6,76561199041871881,hidden
7,76561198398210058,hidden
8,76561198313209867,hidden
9,76561198170079242,"[{'appid': 3830, 'name': 'Psychonauts', 'hours..."
