# Recommendation System Crawler

In [1]:
# for web crawler
import requests
import json

# used to parse HTML
from bs4 import BeautifulSoup as bs
from contextlib import closing
import urllib
import re

# for multithreading support
from threading import Thread
from collections import defaultdict
from Queue import Queue

# Steam API key: taken from the STEAM_API_KEY environment variable so that a
# real key never has to be saved in the notebook; falls back to the
# placeholder value when the variable is not set.
import os
key = os.environ.get('STEAM_API_KEY', 'xxxxxxxxxxxxxxxxxxxxxxxxxx')

## Get Member ID

First collect the Steam IDs of all online members, because the Steam Web API only accepts Steam IDs in its queries.

In [2]:
# Get a user's ID by scraping the user's profile page.
# Version 1: might miss some IDs, since the ID is only looked for in the
# item-showcase blocks that BeautifulSoup finds in the page. A second
# get_user_id defined later in this notebook replaces this one.

def get_user_id(user_profile, user_ids):
    """Look up the ID behind a profile URL and append it to user_ids.

    Fetches user_profile, walks its item-showcase <div> elements and takes
    the fourth slash-separated number of the first usable
    'data-economy-item' attribute as the user's ID. Nothing is appended
    when no showcase slot yields an ID.

    Args:
        user_profile: URL of the Steam community profile page.
        user_ids: list that the discovered ID (a string) is appended to.
    """
    resp = requests.get(user_profile)
    soup = bs(resp.text, 'html.parser')

    # NOTE(review): the class string ends with a space; presumably this has
    # to match the page markup exactly -- confirm before normalising it.
    steamid_blocks = soup.find_all("div", class_ = "showcase_slot item_showcase_item ")

    for steamid_block in steamid_blocks:
        try:
            user_id_field = steamid_block['data-economy-item'].encode("ascii")
            user_id = re.search(r"\d+\/\d+\/\d+\/(\d+)", user_id_field).group(1)
        except (KeyError, AttributeError, UnicodeError):
            # slot has no such attribute, or the attribute is not of the
            # form "n/n/n/<id>": try the next slot
            continue
        print(user_id + ' ' + user_profile)
        user_ids.append(user_id)
        # the first ID found is enough
        break

In [3]:
# Version 2: read the Steam ID from the "steamid" field embedded in the page
# source; scan the raw response with urllib instead of parsing the HTML.

def get_user_id(user_profile, user_ids):
    """Append the Steam ID found on a profile page to user_ids.

    Streams the page at user_profile line by line and stops at the first
    line holding a '"steamid":"<digits>"' pair. Nothing is appended when no
    line matches.

    Args:
        user_profile: URL of the Steam community profile page.
        user_ids: list that the discovered ID (a string) is appended to.
    """
    with closing(urllib.urlopen(user_profile)) as page:
        for line in page:
            if "steamid" not in line:
                continue
            match = re.search(r'"steamid":"(\d+)"', line)
            if match is None:
                # "steamid" occurs on this line, but not as an ID pair
                continue
            user_id = match.group(1)
            print(user_id + ' ' + user_profile)
            user_ids.append(user_id)
            # one ID per profile is all we need
            break

In [4]:
# Scan one page of the member list for users who are online or in-game.

def get_online_users(member_list_no, user_ids):
    """Collect the IDs of the online/in-game users on one member-list page.

    Downloads page member_list_no of the member list, selects the <div>
    blocks whose class matches "online" or "in-game" and whose onclick
    handler points at a steamcommunity.com/id/ profile, and passes each
    profile URL to get_user_id, which appends to user_ids.

    Args:
        member_list_no: page number of the member list to fetch.
        user_ids: list handed on to get_user_id for collecting IDs.
    """
    page_url = 'https://steamcommunity.com/games/steam/members?p=' + str(member_list_no)
    page = requests.get(page_url)
    soup = bs(page.text, 'html.parser')

    # filters for the member blocks: profile link in onclick, status in class
    profile_link = re.compile("top\.location\.href='https:\/\/steamcommunity\.com\/id\/(\w+)'")
    status = re.compile("online|in-game")
    online_blocks = soup.find_all("div", onclick = profile_link, class_ = status)

    for block in online_blocks:
        # the profile URL sits on the anchor three <div> levels down
        profile_url = block.div.div.div.a['href'].encode("ascii")
        get_user_id(profile_url, user_ids)

In [5]:
# Walk the member-list pages 1..member_list_page_no and gather the IDs of
# the online users; raise member_list_page_no to crawl more pages.

member_list_page_no = 5
user_ids = []
for idx in range(1, member_list_page_no + 1):
    print("Member List " + str(idx))
    get_online_users(idx, user_ids)

print("Total online users found:")
print(len(user_ids))

Member List 1
76561197972495328 https://steamcommunity.com/id/FireSlash
76561197960434622 https://steamcommunity.com/id/afarnsworth
76561197968459473 https://steamcommunity.com/id/drunkenf00l
76561197970323416 https://steamcommunity.com/id/tomqbui
76561197963135603 https://steamcommunity.com/id/jigoku
76561197960794555 https://steamcommunity.com/id/killahinstinct_
76561198053398526 https://steamcommunity.com/id/0x6D6178
76561197971155734 https://steamcommunity.com/id/rotNdude
76561197978607315 https://steamcommunity.com/id/Zefar
76561197995162898 https://steamcommunity.com/id/Electrosta
76561197967617980 https://steamcommunity.com/id/metal_smith
76561198005531434 https://steamcommunity.com/id/yojka
76561198021504253 https://steamcommunity.com/id/grodiusmouft
76561198118064479 https://steamcommunity.com/id/citi3en
76561198039785193 https://steamcommunity.com/id/soraefir
Member List 2
76561198039785193 https://steamcommunity.com/id/soraefir
76561198047364412 https://steamcommunity.com/id

Index the Steam user IDs, since each ID is very long; otherwise the raw IDs would cause overflow in the recommendation algorithm we use later.

In [5]:
def dump_user_id(user_ids, user_out_file):
    """Write the index -> ID mapping to user_out_file as JSON lines.

    Every ID in user_ids produces one line holding a JSON object with the
    keys 'user_idx' (position in the list, counted from 0) and 'user_id'.
    The file is overwritten.

    Args:
        user_ids: list of IDs to index.
        user_out_file: path of the file to write.
    """
    with open(user_out_file, 'w') as out:
        for position, steam_id in enumerate(user_ids):
            record = {'user_idx': position, 'user_id': steam_id}
            out.write(json.dumps(record) + '\n')

In [None]:
# Save the index -> ID mapping for the IDs collected by the single-threaded crawl.
dump_user_id(user_ids, 'user_idx_sample.json')

## User Info Summaries

Using GetPlayerSummaries (v0002) [API](https://developer.valvesoftware.com/wiki/Steam_Web_API#GetPlayerSummaries_.28v0002.29)

Note: 
1. the Steam ID is appended to each record since the original record does not hold such information
2. we need to do some data cleaning during json object extraction since some fields or json hierarchies are not quite useful

In [6]:
def process_json_obj(resp, user_out_file, user_id):
    """Reduce one API response to the record that is stored for a user.

    The kind of record is selected by a keyword contained in user_out_file:

    * 'user_summary'               -> the first entry of response.players,
                                      or {'steamid': user_id} if there is none
    * 'user_owned_games'           -> steamid, game_count, games
    * 'user_friend_list'           -> steamid, friends
    * 'user_recently_played_games' -> steamid, total_count, games

    Fields missing from the response (including a body that is not valid
    JSON) are replaced by 0 for counts and [] for lists, so one incomplete
    response does not abort the whole dump.

    Args:
        resp: response object exposing .json() (e.g. a requests response).
        user_out_file: output file name; must contain one of the keywords.
        user_id: ID the request was made for; stored under 'steamid'.

    Returns:
        dict to be written as one JSON line.

    Raises:
        ValueError: if user_out_file contains none of the known keywords.
    """
    try:
        payload = resp.json()
    except ValueError:
        # body could not be decoded as JSON
        payload = {}

    def section(name):
        # top-level object `name` of the payload, or {} when absent/malformed
        part = payload.get(name) if isinstance(payload, dict) else None
        return part if isinstance(part, dict) else {}

    if 'user_summary' in user_out_file:
        try:
            obj = payload['response']['players'][0]
        except (KeyError, IndexError, TypeError):
            # corner case: no player entry came back for this ID
            obj = {'steamid' : user_id}
    elif 'user_owned_games' in user_out_file:
        body = section('response')
        obj = {'steamid' : user_id,
               'game_count' : body.get('game_count', 0),
               'games' : body.get('games', [])}
    elif 'user_friend_list' in user_out_file:
        body = section('friendslist')
        obj = {'steamid' : user_id, 'friends' : body.get('friends', [])}
    elif 'user_recently_played_games' in user_out_file:
        body = section('response')
        # corner case: 'games' is absent when total_count is zero
        obj = {'steamid' : user_id,
               'total_count' : body.get('total_count', 0),
               'games' : body.get('games', [])}
    else:
        raise ValueError("cannot tell the record type from output file name: %r" % (user_out_file,))
    return obj

def dump_user_info(url, user_ids, user_out_file):
    """Query the API once per user and store the results as JSON lines.

    For every ID in user_ids the ID is appended to url, the resulting URL is
    fetched with requests.get, the response is reduced by process_json_obj,
    and the record is written to user_out_file as one JSON document per
    line. The file is overwritten.

    Args:
        url: request URL up to and including the ID parameter name.
        user_ids: IDs to query, in output order.
        user_out_file: path of the file to write; also passed to
            process_json_obj, which picks the record type from it.
    """
    with open(user_out_file, 'w') as out:
        for steam_id in user_ids:
            reply = requests.get(url + str(steam_id))
            record = process_json_obj(reply, user_out_file, steam_id)
            out.write(json.dumps(record) + '\n')

In [36]:
# GetPlayerSummaries, one request per collected user. HTTPS is used so the
# API key in the query string is not sent in clear text.
url = 'https://api.steampowered.com/ISteamUser/GetPlayerSummaries/v0002/?key=' + key + '&steamids='

dump_user_info(url, user_ids, 'user_summary_sample.json')

## User Owned Games

Using GetOwnedGames (v0001) [API](https://developer.valvesoftware.com/wiki/Steam_Web_API#GetOwnedGames_.28v0001.29)

In [14]:
# GetOwnedGames, one request per collected user. HTTPS is used so the API
# key in the query string is not sent in clear text.
url = 'https://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/?key=' + key + '&steamid='

dump_user_info(url, user_ids, 'user_owned_games_sample.json')

## User Friend List

Using GetFriendList (v0001) [API](https://developer.valvesoftware.com/wiki/Steam_Web_API#GetFriendList_.28v0001.29)

In [19]:
# GetFriendList, one request per collected user. HTTPS is used so the API
# key in the query string is not sent in clear text.
url = 'https://api.steampowered.com/ISteamUser/GetFriendList/v0001/?key=' + key + '&steamid='

dump_user_info(url, user_ids, 'user_friend_list_sample.json')

## User Recently Played Games

Using GetRecentlyPlayedGames (v0001) [API](https://developer.valvesoftware.com/wiki/Steam_Web_API#GetRecentlyPlayedGames_.28v0001.29)

In [32]:
# GetRecentlyPlayedGames, one request per collected user. HTTPS is used so
# the API key in the query string is not sent in clear text.
url = 'https://api.steampowered.com/IPlayerService/GetRecentlyPlayedGames/v0001/?key=' + key + '&steamid='

dump_user_info(url, user_ids, 'user_recently_played_games_sample.json')

## Get Member ID - Multithreading

Modify the member ID retrieval process to accommodate multithreading, reusing the get_user_id and get_online_users functions. Note: if the number of pages to iterate over is small, multithreading may be less efficient than a single thread.

In [None]:
# Multithreaded version of the member-ID crawl.
# Total number of member-list pages to crawl.
pages_to_iterate = 400
# Number of crawler threads to start.
concurrency = 10
# Maps thread_id -> that thread's own list of user IDs, so that no two
# threads append to the same list.
user_ids_mt = defaultdict(list)

def get_online_users_wrapper(lower_bound, upper_bound, user_ids_local):
    """Crawl member-list pages lower_bound .. upper_bound - 1 (thread body).

    The page range is taken from the arguments only. Reading the launcher's
    module-level bound variables here would be racy, because the launcher
    loop reassigns them for the next thread while earlier threads may not
    have started iterating yet.

    Args:
        lower_bound: first page number to crawl (inclusive).
        upper_bound: page number to stop at (exclusive).
        user_ids_local: list this thread's IDs are collected in; passed on
            to get_online_users.
    """
    for idx in range(lower_bound, upper_bound):
        get_online_users(idx, user_ids_local)

# Split pages 1..pages_to_iterate into one contiguous range per thread and
# crawl the ranges in parallel.
pages_per_thread = pages_to_iterate // concurrency

threadlist = []
for thread_id in range(concurrency):
    user_ids_local = user_ids_mt[thread_id]
    pages_lower_bound = pages_per_thread * thread_id + 1
    if thread_id == concurrency - 1:
        # the last thread also takes the pages left over when
        # pages_to_iterate is not a multiple of concurrency
        pages_upper_bound = pages_to_iterate + 1
    else:
        pages_upper_bound = pages_lower_bound + pages_per_thread
    thread = Thread(target = get_online_users_wrapper, args = (pages_lower_bound, pages_upper_bound, user_ids_local,))
    thread.start()
    threadlist.append(thread)

# wait until every page range has been crawled
for thread in threadlist:
    thread.join()

As in the single-threaded case, index the Steam IDs to avoid overflow.

In [8]:
# Merge the per-thread ID lists into one flat list, in thread order.
user_ids_flatten = [user_id
                    for concur in range(concurrency)
                    for user_id in user_ids_mt[concur]]

print("Total users found in the first " + str(pages_to_iterate) + " pages of online member list:")
print(len(user_ids_flatten))

Total users found in the first 400 pages of online member list:
2385


In [9]:
# Save the index -> ID mapping for the IDs collected by the multithreaded crawl.
dump_user_id(user_ids_flatten, 'user_idx_full.json')

## User Info Summaries

The biggest challenge is to ensure that writes to the file are thread-safe. There are two fundamental ways to guarantee that:

1. Acquire a lock before each write, and release the lock after the write
2. Put the objects on a queue, and have a single thread write the objects fetched from the queue to the file. The queue guarantees thread safety

Here we are using method 2, since the first one might impose potential performance overhead. Depending on OS features, the time taken to lock and unlock the file as well as rewrite it for every request may be more than you expect.

In [10]:
# NOTE(review): dump_user_info_mt_wrapper creates its own Queue and passes it
# to its threads, so this module-level queue looks unused -- confirm before
# removing it.
write_queue = Queue()

# producer: body of each enqueue thread
def dump_user_info_mt(url, user_ids_local, write_queue, out_file):
    """Fetch one record per user and put it on write_queue.

    For every ID in user_ids_local the ID is appended to url, the URL is
    fetched with requests.get, and the record built by process_json_obj is
    put on the queue. Nothing is written to out_file here; the name is only
    passed to process_json_obj, which picks the record type from it.

    Args:
        url: request URL up to and including the ID parameter name.
        user_ids_local: the IDs this thread is responsible for.
        write_queue: queue the records are put on.
        out_file: output file name, forwarded to process_json_obj.
    """
    for steam_id in user_ids_local:
        reply = requests.get(url + str(steam_id))
        write_queue.put(process_json_obj(reply, out_file, steam_id))

# consumer: body of the single thread that writes the output file
def write_from_queue(write_queue, out_file):
    """Drain write_queue into out_file, one JSON document per line.

    Waits on write_queue.get() for the next item and stops as soon as the
    string 'kill' is received; anything queued after that is left on the
    queue. The file is overwritten and is closed when the function returns.

    Args:
        write_queue: queue of JSON-serialisable records, ended by 'kill'.
        out_file: path of the file to write.
    """
    with open(out_file, 'w') as f:
        while True:
            # get() blocks until an item is available, so there is no need
            # to spin on empty() while the queue is idle
            obj = write_queue.get()
            if obj == 'kill':
                break
            json.dump(obj, f)
            f.write('\n')
        
def dump_user_info_mt_wrapper(url, user_ids_mt, out_file):
    """Fetch user records with one thread per ID list and write them to out_file.

    Starts one dump_user_info_mt thread for every list stored in
    user_ids_mt plus a single write_from_queue thread; the threads exchange
    records through a Queue created here. Returns once all records have
    been fetched and the writer thread has finished.

    Args:
        url: request URL up to and including the ID parameter name.
        user_ids_mt: mapping whose values are the per-thread lists of IDs.
        out_file: path of the JSON-lines file to write.
    """
    write_queue = Queue()

    # spawn enqueue threads, one per ID list; iterating the values avoids
    # assuming that the keys are exactly 0..n-1
    threadlist = []
    for user_ids_local in user_ids_mt.values():
        thread = Thread(target = dump_user_info_mt, args = (url, user_ids_local, write_queue, out_file,))
        thread.start()
        threadlist.append(thread)

    # spawn dequeue thread
    dequeue_thread = Thread(target = write_from_queue, args = (write_queue, out_file,))
    dequeue_thread.start()

    # wait for every enqueue thread, so that all records are on the queue
    for thread in threadlist:
        thread.join()

    # the sentinel goes in last: the writer processes everything queued
    # before it and then stops
    write_queue.put('kill')
    dequeue_thread.join()

In [11]:
# GetPlayerSummaries for every user found by the multithreaded crawl. HTTPS
# is used so the API key in the query string is not sent in clear text.
url = 'https://api.steampowered.com/ISteamUser/GetPlayerSummaries/v0002/?key=' + key + '&steamids='

dump_user_info_mt_wrapper(url, user_ids_mt, 'user_summary_full.json')

## Other Information

By reusing the functions above, we can get the other information we need as well.

In [12]:
# GetOwnedGames for every user found by the multithreaded crawl. HTTPS is
# used so the API key in the query string is not sent in clear text.
url = 'https://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/?key=' + key + '&steamid='

dump_user_info_mt_wrapper(url, user_ids_mt, 'user_owned_games_full.json')

In [13]:
# GetFriendList for every user found by the multithreaded crawl. HTTPS is
# used so the API key in the query string is not sent in clear text.
url = 'https://api.steampowered.com/ISteamUser/GetFriendList/v0001/?key=' + key + '&steamid='

dump_user_info_mt_wrapper(url, user_ids_mt, 'user_friend_list_full.json')

In [14]:
# GetRecentlyPlayedGames for every user found by the multithreaded crawl.
# HTTPS is used so the API key in the query string is not sent in clear text.
url = 'https://api.steampowered.com/IPlayerService/GetRecentlyPlayedGames/v0001/?key=' + key + '&steamid='

dump_user_info_mt_wrapper(url, user_ids_mt, 'user_recently_played_games_full.json')