In [12]:
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlencode

import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

In [5]:
# Load the raw interaction log. Column names are supplied explicitly, so every
# line of the file (including the first) is read as data.
d = pd.read_csv('steam-200k.csv', encoding='latin2', names=['user', 'game', 'behavior', 'hours'])

# Remove messed up rows: keep only the two known behaviors.
# The explicit .copy() makes `d` an independent frame, so the assignments below
# do not write into a slice of the raw frame (avoids SettingWithCopyWarning).
d = d[d['behavior'].isin(['purchase', 'play'])].copy()

# Purchased means 0 hours.
d.loc[d['behavior'] == 'purchase', 'hours'] = 0
d = d.drop('behavior', axis=1)
d['hours'] = d['hours'].astype(float)

# Collapse to one row per (user, game) pair, summing the hours of its rows.
d = d.groupby(['user', 'game'], as_index=False).sum()

def normalize(title):
    """Return a simplified form of a game title for fuzzy name matching.

    Every character that is not an ASCII letter, an ASCII digit or whitespace
    is removed, and the remainder is lower-cased. Whitespace is kept as-is
    (runs of spaces are not collapsed).

    Args:
        title: The title string to simplify.

    Returns:
        The stripped, lower-cased string.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence and
    # emits a warning on recent Python versions. The pattern itself is unchanged.
    return re.sub(r'[^0-9a-zA-Z\s]', '', title).lower()

# Get official game IDs
url = 'http://api.steampowered.com/ISteamApps/GetAppList/v0002/'
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of failing later
# with a confusing JSON/KeyError.
response = requests.get(url, timeout=60)
response.raise_for_status()
game_data = response.json()['applist']['apps']

# Map normalized title -> appid. Titles that normalize to the same string
# collide; the entry appearing last in the app list wins.
all_game_ids = {normalize(g['name']): g['appid'] for g in game_data}
d['game_id'] = d['game'].apply(lambda g: all_game_ids.get(normalize(g)))

# Keep only games that matched an appid. The .copy() makes the filtered frame
# independent so the dtype conversion below does not write into a slice
# (avoids SettingWithCopyWarning).
d = d[d['game_id'].notnull()].copy()
d['game_id'] = d['game_id'].astype(int)

In [6]:
steamspy_url = 'http://steamspy.com/app/{}'
def get_game_info(game_id, timeout=30):
    """Scrape the steamspy.com page of one game into a dict of attributes.

    The page is fetched and parsed with BeautifulSoup's 'html.parser'. Parsing
    starts at the contents of the first <br> inside the first <img> inside the
    first <div class="panel-body">, and proceeds element by element:

    * a <br> element restarts the walk inside that element's own contents;
    * a <strong> element sets the current key (its text with ':' stripped);
    * an <a> element contributes its text to the current key: appended to a
      list for 'Tags', 'Languages' and 'Genre', stored as a plain value
      (overwriting) for any other key;
    * remaining non-blank nodes are handled as text: 'Category' is split on
      commas, 'Free' sets 'Price' to 0.0, 'Price' is parsed as a float after
      stripping '$', and any other key keeps its first value (a value ending
      in '%' is converted to int).

    NOTE(review): the walk relies on the parser nesting following content
    inside <br>/<img> elements -- confirm against the current page markup and
    the installed bs4 version.

    Args:
        game_id: Value substituted into `steamspy_url`.
        timeout: Seconds passed to `requests.get` so a stalled connection
            cannot block the calling thread indefinitely.

    Returns:
        dict mapping attribute names to the parsed values.

    Raises:
        AttributeError: if the expected div/img/br chain is not found
            (`find` returns None).
        ValueError: if a price or percentage cannot be converted to a number.
        requests.RequestException: on network failure or timeout.
    """
    r = requests.get(steamspy_url.format(game_id), timeout=timeout).text
    info = {}
    soup = BeautifulSoup(r, 'html.parser')
    elems = soup.find('div', attrs={'class': 'panel-body'}).find('img').find('br').contents
    # No label seen yet. Initialising this avoids an UnboundLocalError when the
    # content starts with text or a link rather than a <strong> label.
    curr_key = None
    i = 0
    while i < len(elems):
        e = elems[i]
        if isinstance(e, str) and not e.strip():
            # Whitespace-only text node: nothing to record.
            pass
        elif e.name == 'br':
            # Descend: continue the walk from the start of this <br>'s contents.
            i = 0
            elems = e.contents
            continue
        elif e.name == 'strong':
            curr_key = e.text.strip(':')
        elif curr_key is None:
            # Content before the first label has no key to attach to; skip it.
            pass
        elif e.name == 'a':
            if curr_key in ['Tags', 'Languages', 'Genre']:
                # Multi-valued attributes accumulate one entry per link.
                if curr_key not in info:
                    info[curr_key] = []
                info[curr_key].append(e.text)
            else:
                info[curr_key] = e.text
        # From here on `e` is handled as a text node (string methods are used).
        elif curr_key == 'Category' and 'Category' not in info:
            info[curr_key] = [c.strip() for c in e.split(',')]
        elif curr_key == 'Free' and 'Price' not in info:
            info['Price'] = 0.0
        elif curr_key == 'Price' and 'Price' not in info:
            info[curr_key] = float(e.strip().strip('$'))
        elif curr_key not in info:
            # Only the first text value following a label is kept.
            val = e.strip(':').strip()
            if val and val[-1] == '%':
                val = int(val.rstrip('%'))
            info[curr_key] = val
        i += 1
    return info

In [10]:
# Distinct game ids, in order of first appearance in `d`.
game_ids = [*d['game_id'].unique()]

In [48]:
# Bookkeeping names (not consumed within this cell).
total = len(game_ids)
count = 0
futures = []

# game_id -> Future for that game's scrape.
results = {}

# Fan the lookups out over a pool of 30 worker threads. Leaving the `with`
# block waits for every submitted task to finish.
with ThreadPoolExecutor(max_workers=30) as executor:
    for game_id in game_ids:
        pending = executor.submit(get_game_info, game_id)
        results[game_id] = pending

In [31]:
# Replace each pending future in `results` with its return value, or None if
# the task raised.
for game_id, outcome in list(results.items()):
    # Values without a `.result` attribute were already resolved by an earlier
    # run of this cell. Leave them alone so re-running does not wipe them out.
    if not hasattr(outcome, 'result'):
        continue
    try:
        results[game_id] = outcome.result()
    except Exception:
        # Best-effort scrape: a failed lookup is recorded as missing. Catching
        # Exception (not a bare except) lets KeyboardInterrupt through.
        results[game_id] = None

In [33]:
# Keys of `results` whose value is falsy (None or empty).
missing = [game_id for game_id, info in results.items() if not info]

In [46]:
# Number of entries collected in `missing`.
len(missing)

4496

In [None]:
# Display the whole `results` mapping. NOTE(review): presumably one entry per
# game id, so this output may be very large -- consider showing a sample.
results