# GetGameData

This notebook executes the code to collect and store relevant Steam game data.

In [1]:
%matplotlib inline

from bs4 import BeautifulSoup
import datetime
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy as sp
import seaborn as sns
import time

In [2]:
# Lookup table from a lower-case, three-letter English month abbreviation
# to its calendar month number (1-12).
month_map = {
    "jan": 1,
    "feb": 2,
    "mar": 3,
    "apr": 4,
    "may": 5,
    "jun": 6,
    "jul": 7,
    "aug": 8,
    "sep": 9,
    "oct": 10,
    "nov": 11,
    "dec": 12,
}

In [3]:
def IsValidDateWordList(wordlist):
    """Check that ``wordlist`` describes a real calendar date as [month, day, year].

    Parameters
    ----------
    wordlist : sequence of str
        Tokens of a date string, e.g. ``["apr", "18", "2011"]``. Only the
        first three letters of the month token are compared, ignoring case.

    Returns
    -------
    bool
        True when there are exactly three tokens, the month token is
        alphabetic and starts with an English month abbreviation, the day and
        year tokens are numeric, the year is 1970 or later, and the day
        actually exists in that month and year. False otherwise.
    """
    # Kept local so the check does not depend on any other notebook cell.
    month_abbreviations = ("jan", "feb", "mar", "apr", "may", "jun",
                           "jul", "aug", "sep", "oct", "nov", "dec")

    if len(wordlist) != 3:
        return False

    month_word, day_word, year_word = wordlist

    if not month_word.isalpha():
        return False

    if not (day_word.isdigit() and year_word.isdigit()):
        return False

    # Reject words such as "xyz" that would otherwise fail a month lookup later
    month_key = month_word[:3].lower()
    if month_key not in month_abbreviations:
        return False

    try:
        day = int(day_word)
        year = int(year_word)

        if year < 1970:
            return False

        # Let datetime decide whether the day exists (rejects day 0, Feb 30,
        # Apr 31, and years beyond what datetime.date supports)
        datetime.date(year, month_abbreviations.index(month_key) + 1, day)
    except ValueError:
        return False

    return True

In [23]:
# Column names of one game record. The order must match the value list that
# is zipped against it at the bottom of the loop.
fields = ["game_name", "release_date", "lifetime", "tags",
          "platform_windows", "platform_mac", "platform_linux",
          "price_discount", "price_original", "metascore",
          "good_review_percentage_recent", "n_reviews_recent",
          "good_review_percentage", "n_reviews"]

# Seconds to wait for the store to answer before giving up on one request
REQUEST_TIMEOUT_SECONDS = 30


def _price_from_entries(entries):
    """Return the price shown by the first entry as a float.

    Returns None when ``entries`` is empty or the text is not a plain
    decimal number.
    """
    if len(entries) == 0:
        return None

    # Extract price text, remove the leading currency symbol
    price_text = entries[0].get_text().strip()[1:]

    # Drop at most one decimal point, so text such as "1.2.3" is rejected
    # here instead of raising inside float()
    if not price_text.replace('.', '', 1).isdigit():
        return None
    try:
        return float(price_text)
    except ValueError:
        return None


def _parse_review_summary(appnumber, summary):
    """Extract (good review percentage, number of reviews) from a tooltip text.

    The percentage is read from the first word (minus its trailing '%') and
    the review count from the fourth word (commas removed). A value that
    cannot be read keeps its default (-1 for the percentage, 0 for the count)
    and a diagnostic carrying ``appnumber`` is printed.
    """
    percentage = -1
    n_total = 0
    words = summary.split()

    # The review percentage should be the first word, ignore the '%'
    percentage_text = words[0][:-1] if len(words) > 0 else ''
    if percentage_text.isdigit():
        percentage = int(percentage_text)
    else:
        print(appnumber, "Non-numeric review percentage?")

    # The total number of reviews should be the fourth word, remove commas
    count_text = words[3].replace(',', '') if len(words) > 3 else ''
    if count_text.isdigit():
        n_total = int(count_text)
    else:
        print(appnumber, "Non-numeric number of reviews?")

    return percentage, n_total


# Scraped records, keyed by Steam app number
game_dict = {}

for appnumber in [620, 245810, 230050, 391220, 391680, 1]:
    # Define variables and defaults
    game_name = ''
    release_date = datetime.date(1970, 1, 1)  # Default
    lifetime = 0  # Default of age 0
    tags = []
    platform_windows = 0
    platform_mac = 0
    platform_linux = 0
    price_discount = -1.00
    price_original = -1.00
    metascore = -1
    good_review_percentage_recent = -1
    n_reviews_recent = 0
    good_review_percentage = -1
    n_reviews = 0

    # A failed or stalled request must not abort the whole scrape: report it
    # and move on to the next app number.
    try:
        game_page_html_test = requests.get("http://store.steampowered.com/app/%d/" % appnumber,
                                           timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as request_error:
        print(appnumber, "Request failed?", request_error)
        time.sleep(1.5)
        continue
    soup = BeautifulSoup(game_page_html_test.text, "html.parser")
    # Pause between requests
    time.sleep(1.5)

    ##--- Check if the page redirected to the Steam homepage ---##
    # A page without a <title> is treated as "not the homepage"
    title = soup.title.get_text() if soup.title is not None else ''
    if "Welcome to Steam" in title:
        continue

    ##--- Look for entry indicating a DLC item ---##
    html_entries = soup.findAll("div", attrs={"class": "game_area_dlc_bubble game_area_bubble"})
    if len(html_entries) > 0:
        continue

    ##--- Get the game name ---##
    # Find entries that will include the game name
    html_entries = soup.findAll("div", attrs={"class": "apphub_AppName"})

    # Should only have one entry
    if len(html_entries) > 1:
        print(appnumber, "More than 1 game name?")
    if len(html_entries) == 0:
        print(appnumber, "No game name?")
    else:
        # Extract the game name
        game_name = html_entries[0].get_text()
    print(game_name)

    ##--- Get the release date and age ---##
    # Find entries that will include the release date
    html_entries = soup.findAll("span", attrs={"class": "date"})

    # Should only have one entry
    if len(html_entries) > 1:
        print(appnumber, "More than 1 release date?")
    if len(html_entries) == 0:
        print(appnumber, "No release date?")
    else:
        # Extract text from date entry, remove comma
        release_date_text = html_entries[0].get_text().replace(',', '').strip()

        # Tokenize the string, should be Mmm DD YYYY
        words = release_date_text.split()
        # Make month a lower-case three-letter abbreviation (the text may be empty)
        if len(words) > 0:
            words[0] = words[0][:3].lower()

        # Extract release date; unknown months and impossible days count as invalid
        parsed_date = None
        if IsValidDateWordList(words) and words[0] in month_map:
            try:
                parsed_date = datetime.date(int(words[2]), month_map[words[0]], int(words[1]))
            except ValueError:
                parsed_date = None

        if parsed_date is None:
            print(appnumber, "Invalid release date?")
        else:
            # Age is the number of days between release and today
            release_date = parsed_date
            lifetime = (datetime.date.today() - release_date).days
    print(release_date)
    print(lifetime)

    ##--- Get the game tags ---##
    # Find entries that will include the tags
    html_entries = soup.findAll("a", attrs={"class": "app_tag"})
    # Extract each tag and strip trailing/leading white space from them
    tags = [entry.get_text().strip() for entry in html_entries]
    print(tags)

    ##--- Get the supported operating systems ---##
    # Find entries that will include the supported operating systems
    html_entries = soup.findAll("div", attrs={"class": "game_area_purchase_platform"})
    # The "entries" are just the appearance of an image
    # There is no actual text, so simply use (length > 0) as indication of a supported OS
    if len(html_entries) == 0:
        print(appnumber, "No supported OS?")
    else:
        platform_block = html_entries[0]
        platform_windows = int(len(platform_block.findAll("span", attrs={"class": "platform_img win"})) > 0)
        platform_mac = int(len(platform_block.findAll("span", attrs={"class": "platform_img mac"})) > 0)
        platform_linux = int(len(platform_block.findAll("span", attrs={"class": "platform_img linux"})) > 0)
    # Formatted explicitly so the line reads "1 1 0" under Python 2 and Python 3
    print("%d %d %d" % (platform_windows, platform_mac, platform_linux))

    ##--- Get the current and original prices ---##
    # Find entries that will include the game prices
    html_entries = soup.findAll("div", attrs={"class": "game_purchase_action_bg"})
    # The tag to find is different based on whether the game is currently discounted
    # Search for both
    if len(html_entries) == 0:
        print(appnumber, "No price block?")
    else:
        price_original_block = html_entries[0].findAll("div", attrs={"class": "game_purchase_price"})
        price_discount_block = html_entries[0].findAll("div", attrs={"class": "discount_prices"})

        if len(price_original_block) == 0 and len(price_discount_block) == 0:
            print(appnumber, "No price?")
        if len(price_original_block) > 0:
            # Try evaluating the 'no discount' entries first
            # Without a discount, both prices are the same
            price = _price_from_entries(price_original_block)
            if price is None:
                print(appnumber, "Non-numeric word for price?")
            else:
                price_discount = price
                price_original = price
        if len(price_discount_block) > 0:
            # Evaluate 'discount' entry second
            # This will overwrite 'no discount' results if both were found
            price = _price_from_entries(
                price_discount_block[0].findAll("div", attrs={"class": "discount_final_price"}))
            if price is None:
                print(appnumber, "Non-numeric word for price?")
            else:
                price_discount = price

            price = _price_from_entries(
                price_discount_block[0].findAll("div", attrs={"class": "discount_original_price"}))
            if price is None:
                print(appnumber, "Non-numeric word for price?")
            else:
                price_original = price
    print(price_discount)
    print(price_original)

    ##--- Get the Metacritic score ---##
    # Find entry that will be the parent of the entry that includes the score
    html_entries = soup.findAll("div", attrs={"id": "game_area_metascore"})
    if len(html_entries) > 1:
        print(appnumber, "More than 1 Metascore?")
    if len(html_entries) == 0:
        print(appnumber, "No Metascore?")
    else:
        # The entry that includes the score should be the first
        score_entry = html_entries[0].findAll("div")

        if len(score_entry) == 0:
            print(appnumber, "No score in the Metascore area?")
        else:
            score_text = score_entry[0].get_text().strip()
            if score_text.isdigit():
                metascore = int(score_text)
            else:
                print(appnumber, "Non-numeric Metascore?")
    print(metascore)

    ##--- Get the user review information ---##
    # Find entries that will include the user review information
    html_entries = soup.findAll("div", attrs={"class": "user_reviews_summary_row"})
    # There should be an entry for overall and recent information
    if len(html_entries) > 2:
        print(appnumber, "More than 1 user review summary?")

    if len(html_entries) == 0:
        print(appnumber, "No user review information?")
    else:
        # Extract tooltip text from entries, strip trailing/leading white space
        # A row without the tooltip attribute is treated as empty text
        summary_recent = ''
        summary_overall = html_entries[0].attrs.get("data-store-tooltip", '').strip()
        if len(html_entries) == 2:
            # With two rows, the first is the recent one and the second the overall one
            summary_recent = html_entries[0].attrs.get("data-store-tooltip", '').strip()
            summary_overall = html_entries[1].attrs.get("data-store-tooltip", '').strip()

        # Evaluate recent entry information
        if "in the last 30 days" in summary_recent:
            good_review_percentage_recent, n_reviews_recent = _parse_review_summary(appnumber, summary_recent)

        # Evaluate overall entry information
        if "in the last 30 days" not in summary_overall:
            good_review_percentage, n_reviews = _parse_review_summary(appnumber, summary_overall)
    print(good_review_percentage_recent)
    print(n_reviews_recent)
    print(good_review_percentage)
    print(n_reviews)
    print("\n")

    # Store the record under its app number; value order matches `fields`
    game_dict[appnumber] = dict(zip(fields, [game_name, release_date, lifetime, tags,
                                             platform_windows, platform_mac, platform_linux,
                                             price_discount, price_original, metascore,
                                             good_review_percentage_recent, n_reviews_recent,
                                             good_review_percentage, n_reviews]))

Portal 2
2011-04-18
2076
[u'Puzzle', u'Co-op', u'First-Person', u'Comedy', u'Sci-fi', u'Singleplayer', u'Adventure', u'Online Co-Op', u'Funny', u'Female Protagonist', u'Science', u'Action', u'Story Rich', u'Multiplayer', u'FPS', u'Atmospheric', u'Local Co-Op', u'Strategy', u'Space', u'Platformer']
1 1 1
3.99
19.99
95
99
8918
98
84003


DLC Quest
2013-03-18
1376
[u'Platformer', u'Indie', u'Satire', u'Comedy', u'Short', u'Parody', u'Pixel Graphics', u'2D', u'Singleplayer', u'Funny', u'Retro', u'Adventure', u'Exploration', u'Action', u'Casual', u'Side Scroller', u'Metroidvania', u'Zombies']
1 1 0
0.74
2.99
(230050, 'No Metascore?')
-1
93
141
88
5812


(391220, 'No game name?')

(391220, 'No release date?')
1970-01-01
0
[]
(391220, 'No supported OS?')
0 0 0
(391220, 'No price block?')
-1.0
-1.0
(391220, 'No Metascore?')
-1
(391220, 'No user review information?')
-1
0
-1
0




Now that the data has been scraped, I can save it as a JSON file. This is the last part of the GetGameData notebook. The data will be loaded in the SteamVis notebook for visualization!

In [24]:
# Writing the scraped data to disk is currently disabled (lines commented out).
# NOTE(review): each record stores "release_date" as a datetime.date, which
# json.dump cannot serialize by default; re-enabling this will need e.g.
# json.dump(..., default=str) or a string release date.
#file_game_data = open("game_data.json", "w");
#json.dump(game_dict, file_game_data);
#file_game_data.close();

# Display the scraped records, keyed by app number, for inspection
game_dict

{620: {'game_name': u'Portal 2',
  'good_review_percentage': 98,
  'good_review_percentage_recent': 99,
  'lifetime': 2076,
  'metascore': 95,
  'n_reviews': 84003,
  'n_reviews_recent': 8918,
  'platform_linux': 1,
  'platform_mac': 1,
  'platform_windows': 1,
  'price_discount': 3.99,
  'price_original': 19.99,
  'release_date': datetime.date(2011, 4, 18),
  'tags': [u'Puzzle',
   u'Co-op',
   u'First-Person',
   u'Comedy',
   u'Sci-fi',
   u'Singleplayer',
   u'Adventure',
   u'Online Co-Op',
   u'Funny',
   u'Female Protagonist',
   u'Science',
   u'Action',
   u'Story Rich',
   u'Multiplayer',
   u'FPS',
   u'Atmospheric',
   u'Local Co-Op',
   u'Strategy',
   u'Space',
   u'Platformer']},
 230050: {'game_name': u'DLC Quest',
  'good_review_percentage': 88,
  'good_review_percentage_recent': 93,
  'lifetime': 1376,
  'metascore': -1,
  'n_reviews': 5812,
  'n_reviews_recent': 141,
  'platform_linux': 0,
  'platform_mac': 1,
  'platform_windows': 1,
  'price_discount': 0.74,
  'pri

In [18]:
# Debugging aid: fetch the store page for app number 1 on its own.
# The timeout keeps the cell from hanging forever if the store does not answer.
game_page_html_test = requests.get("http://store.steampowered.com/app/1/", timeout=30);

In [19]:
# Parse the body of the response fetched above with the built-in HTML parser
soup = BeautifulSoup(game_page_html_test.text, features="html.parser")

In [22]:
# Show the page title; the scrape loop skips pages whose title contains
# "Welcome to Steam".
print(soup.title.get_text())
# Dump the complete pretty-printed markup for manual inspection (long output)
print(soup.prettify())

Welcome to Steam
<!DOCTYPE html>
<html class=" responsive" lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
   <meta content="width=device-width,initial-scale=1" name="viewport">
    <meta content="#171a21" name="theme-color">
     <title>
      Welcome to Steam
     </title>
     <link href="/favicon.ico" rel="shortcut icon" type="image/x-icon">
      <link href="http://store.akamai.steamstatic.com/public/shared/css/motiva_sans.css?v=Sd0odMs2NjL1" rel="stylesheet" type="text/css">
       <link href="http://store.akamai.steamstatic.com/public/shared/css/shared_global.css?v=qUauRjAB6F_h" rel="stylesheet" type="text/css">
        <link href="http://store.akamai.steamstatic.com/public/shared/css/buttons.css?v=FMXZx9fv9yp_" rel="stylesheet" type="text/css">
         <link href="http://store.akamai.steamstatic.com/public/css/v6/store.css?v=WMjWukom2M23" rel="stylesheet" type="text/css">
          <link href="http://store.akamai.steamstatic.com/public/