In [1]:
import pandas as pd
import numpy as np
import requests
import keyring
import time
from IPython.display import clear_output
from queryer import QueryApi
import json
from datetime import datetime

In [2]:
# Look up the credential stored in the OS keyring under service 'steam' for this
# user, so the secret never has to be written into the notebook itself.
password = keyring.get_password(
    service_name="steam",
    username="IcyJoseph",
)

In [3]:
# Build the API client (QueryApi is imported from the local `queryer` module) using
# the credential fetched from the keyring.
api = QueryApi(password)

# Table of games to keep when filtering users' libraries; the games loop further
# down reads its 'app_id' column.
gamesfilter = pd.read_csv(filepath_or_buffer="datafiles/gamesfilter.csv")

Successfully connected to the Steam API


In [4]:
# Target number of unique steam ids to collect. The friend-crawling loop keeps
# querying the API only while it holds fewer ids than this.
number_ids_you_want = 3_000_000

### Generate an initial list of Steam IDs by scraping https://steamcommunity.com/groups/steamuniverse/memberslistxml/

In [15]:
def query_steamcommunity(pagenum, timeout=30):
    '''
    Scrape one page of the steamuniverse group member list and return its steam ids.

    pagenum: zero-based page index. The site numbers its pages from 1, so the
             request is made for page ``pagenum + 1``.
    timeout: seconds to wait for the site before giving up on the request.

    Returns a list of the digit-only strings found inside <steamID64> tags, in
    page order. An empty list is returned when the page holds no ids or when the
    request itself fails, so callers can treat "empty" as "try again later".
    '''
    url = f'https://steamcommunity.com/groups/steamuniverse/memberslistxml/?xml=1&p={pagenum+1}'

    try:
        # Without a timeout a single stalled connection would hang the whole scrape.
        page = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failure: report "nothing found" instead of aborting.
        return []

    # Decode the payload properly rather than parsing the repr of a bytes object.
    xml_text = page.content.decode('utf-8', errors='replace')

    steam_ids = []
    # Everything before the first opening tag is header material, hence [1:].
    for chunk in xml_text.split('<steamID64>')[1:]:
        candidate = chunk.split('</steamID64>', 1)[0].strip()
        if candidate.isdigit():
            steam_ids.append(candidate)

    return steam_ids

In [34]:
# Scrape every page of the group member list and append the ids found to
# ID_LIST_PATH as one long "id,id,id," string (each id followed by a comma).
ID_LIST_PATH = 'datafiles/idlist.txt'   # the file the friend-crawling cell reads back
NUM_PAGES = 1700                        # how many member-list pages to request

# Start from an empty file so re-running this cell does not append duplicate ids.
# (Previously a different, unused file was truncated and its handle never closed.)
with open(ID_LIST_PATH, mode = 'w'):
    pass

wait_multiplier = 1

for pagenum in range(NUM_PAGES):

    steam_ids = query_steamcommunity(pagenum)

    # An empty result is treated as the site refusing us: back off for a minute
    # longer on each consecutive failure, then retry the same page.
    while len(steam_ids) == 0:
        print(f"Website refused our query - sleeping - {wait_multiplier * 60}s")
        time.sleep(wait_multiplier * 60) # the steam website will shut you out if you make requests too frequently,
                                         # so we take a rest here
        steam_ids = query_steamcommunity(pagenum)
        wait_multiplier += 1

    wait_multiplier = 1 # page succeeded, so reset the back-off for the next page
    string_ids = "".join(i+"," for i in steam_ids)

    # Append page by page so an interrupted run keeps what it has scraped so far.
    with open(ID_LIST_PATH, mode = 'a') as file:
        file.write(string_ids)

    clear_output()
    print(f"done {pagenum+1}")

    time.sleep(2) # small pause between pages to stay under the site's rate limit

done 1700 of 1,000


In [16]:
# Pull friends of the known ids, then friends of each friend, and so on, until we
# have number_ids_you_want ids or run out of ids to expand.

with open('datafiles/idlist.txt', mode = 'r') as file:
    raw_ids = file.read()

# Tolerate both the "id,id,id," layout the scraping cell writes and a pasted
# Python-list layout ("['id', 'id']"): strip quotes/brackets, split on bare commas,
# trim whitespace and drop the empty token left by a trailing comma.
for junk in ("'", "[", "]"):
    raw_ids = raw_ids.replace(junk, "")
tokens = (token.strip() for token in raw_ids.split(","))

# input_ids is the crawl queue. Its order must stay stable because counter_start
# is a positional cursor into it; de-duplicate with dict.fromkeys (keeps order).
input_ids = list(dict.fromkeys(token for token in tokens if token))
queued_ids = set(input_ids)   # fast "already in the queue?" lookups

counter_start = 0
found_ids = set()             # unique ids returned by the API so far

while len(found_ids) < number_ids_you_want and counter_start < len(input_ids):

    # the steam api can accept up to 100 steam ids at once, so we'll give it 100,
    # or as many as we have left if that is fewer
    counter_end = min(counter_start + 100, len(input_ids))

    # concatenate this batch into one string, each id followed by a comma
    ids = "".join(i+"," for i in input_ids[counter_start:counter_end])

    time.sleep(0.5)

    # presumably returns an iterable of steam id strings - confirm against queryer.QueryApi
    steam_ids = api.find_friends(ids) # query API

    found_ids.update(steam_ids)

    # Queue friends we have not seen before. Appending (rather than rebuilding the
    # list from a set) keeps already-processed ids in front of the cursor, so
    # nothing is skipped or queried twice.
    for steam_id in steam_ids:
        if steam_id not in queued_ids:
            queued_ids.add(steam_id)
            input_ids.append(steam_id)

    clear_output()
    print(f"running from {counter_start} to {counter_end}, input_ids is {len(input_ids)} items long")

    counter_start = counter_end

all_ids = list(found_ids)

id_df = pd.DataFrame(data = {'id':all_ids,'games_played':''}) # export steam ids to csv
id_df.to_csv('datafiles/all_ids.csv', index = False)

### Get Games (work in progress)

In [4]:
# Pick the next batch of ids whose games have not been fetched yet.
RESET_GAMES_PLAYED = False   # set True to wipe all saved results and start over
GAMES_BATCH_SIZE = 20        # how many ids to query per run of the cell below

all_ids = pd.read_csv('datafiles/all_ids.csv', index_col = 'id', keep_default_na = False,
                      dtype = {'games_played':str})
                      # steam will only accept 100k calls a day, and we have 1.7M, so have to do this in bits

# Clearing the column unconditionally would erase earlier batches and make the
# "still to query" filter below pointless, so the reset is opt-in.
if RESET_GAMES_PLAYED:
    all_ids['games_played'] = ''
    all_ids.to_csv('datafiles/all_ids.csv')

# Rows with an empty games_played are the ones not processed yet; take the first batch.
ids_still_to_query = all_ids[all_ids['games_played'] == ''].head(GAMES_BATCH_SIZE)

ids_to_run = ids_still_to_query.index

In [6]:
# For each id in this batch, fetch the user's games, keep only those listed in
# gamesfilter, and record their names in all_ids['games_played'].
maxruns = len(ids_to_run)

# `x in series` tests the Series *index*, not its values, so build a set of the
# app ids once and test membership against that instead.
# NOTE(review): assumes the ids returned by the API have the same type as the
# 'app_id' column (e.g. both ints) - confirm against queryer.QueryApi.
wanted_app_ids = set(gamesfilter['app_id'])

for runnum, id_num in enumerate(ids_to_run):
    try:
        newgames = api.get_users_games(id_num) # get all games
        # just the games in our filter list - otherwise it'll be too many features;
        # each entry is indexed as [0] = app id, [1] = game name
        restrictedgames = [game for game in newgames if game[0] in wanted_app_ids]
        game_names = [game[1] for game in restrictedgames]

        all_ids.at[id_num,'games_played']= game_names

    except KeyError:
        # a KeyError raised while fetching/recording is taken to mean no game data
        all_ids.at[id_num,'games_played']= 'no games'

    finally:
        clear_output()
        print(f"run {runnum+1} of {maxruns} completed")

all_ids.to_csv('datafiles/all_ids.csv')

run 1 of 20 completed
run 2 of 20 completed
run 3 of 20 completed
run 4 of 20 completed
run 5 of 20 completed
run 6 of 20 completed
run 7 of 20 completed
run 8 of 20 completed
run 9 of 20 completed
run 10 of 20 completed
run 11 of 20 completed
run 12 of 20 completed
run 13 of 20 completed
run 14 of 20 completed
run 15 of 20 completed
run 16 of 20 completed
run 17 of 20 completed
run 18 of 20 completed
run 19 of 20 completed
run 20 of 20 completed
