# Get Aggregated Data

In [6]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [11]:
import os
import re
import time
from glob import glob
from random import choice, uniform

import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options

In [7]:
%aimport src.page_helpers
from src.page_helpers import load_games_search_page

%aimport src.selenium_helpers
from src.selenium_helpers import scroll_up_down_page

%aimport src.utils
from src.utils import save_to_parquet_file, show_df, show_df_dtypes_nans

In [4]:
options = Options()
options.add_argument("--window-size=1920,1080")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--incognito")
options.add_argument("--disable-extensions")
options.add_argument("--proxy-server='direct://'")
options.add_argument("--proxy-bypass-list=*")
options.add_experimental_option(
    "prefs",
    {"profile.default_content_setting_values.notifications": 2},
)
options.add_argument(f"user-agent={UserAgent()}")
options.arguments

['--window-size=1920,1080',
 '--no-sandbox',
 '--disable-gpu',
 'start-maximized',
 'disable-infobars',
 '--incognito',
 '--disable-extensions',
 "--proxy-server='direct://'",
 '--proxy-bypass-list=*',
 'user-agent=<fake_useragent.fake.FakeUserAgent object at 0x7f9fbf48cbb0>']

<a href="table-of-contents"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Launch Browser](#launch-browser)
3. [Get Aggregated Data for Listings](#get-aggregated-data-for-listings)
   - 3.1. [Display all listing tags](#display-all-listing-tags)
   - 3.2. [Filtered by Operating System](#filtered-by-operating-system)
   - 3.3. [Filtered by Language](#filtered-by-language)
   - 3.4. [Filtered by Supported Number of Players](#filtered-by-supported-number-of-players)
   - 3.5. [Filtered by Feature](#filtered-by-supported-number-of-feature)
   - 3.6. [Filtered by Tag](#filtered-by-tag)
   - 3.7. [Filtered by Virtual Reality Support](#filtered-by-virtual-reality-support)
   - 3.8. [Filtered by Price](#filtered-by-price)
   - 3.9. [Vertically concatenate the individual filtered datasets](#vertically-concatenate-the-individual-filtered-datasets)
   - 3.10. [Post-Processing](#Post-Processing)
4. [Export to Disk](#export-to-disk)
5. [Close Browser](#close-browser)

<a id="about"></a>

## 0. [About](#about)

Scrape aggregated data from the [Steam Powered web store](https://store.steampowered.com/) about PC game listings available on the [Steam platform](https://en.wikipedia.org/wiki/Steam_(service)).

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

In [12]:
PROJ_ROOT_DIR = os.getcwd()

In [29]:
url = "https://store.steampowered.com/"

# Specify index of web elements of interest to access when filtering
filter_indexes_dict = {
    "feature": [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
}

# Specify range slider moves to filter listings by price
price_slider_range_filters = {
    85: 72,
    65: 66,
    55: 60,
    35: 54,
    25: 48,
    7: 42,
    # 0: 42,
    -5: 36,
    -15: 30,
    -30: 24,
    -45: 18,
    -60: 12,
    -75: 6,
    -90: 0,
}

In [15]:
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir = os.path.join(data_dir, "raw")
webdriver_path = os.path.join(
    os.path.expanduser("~"), "chromedriver_linux64", "chromedriver"
)

tag_counts_filtered_filepath = os.path.join(
    raw_data_dir, "aggregated_counts_by_tag.parquet"
)
num_listings_filtered_filepath = os.path.join(
    raw_data_dir, "aggregated_num_listings.parquet"
)
num_listings_country_os_filtered_filepath = os.path.join(
    raw_data_dir, "aggregated_num_listings_country_os.parquet"
)

In [203]:
def get_counts_by_tag(webdriver, show_more_tags=False):
    """Get number of listings by tag, for the top 16 tags."""
    # Click to show all ("SEE ALL") pre-provided tags
    if show_more_tags:
        webdriver.find_element_by_xpath('.//a[@class="see_all_expander"]').click()
    tags_dict = {}
    # Get list of tag section web elements
    narrow_by_tag = driver.find_element_by_xpath('.//div[@data-collapse-name="tags"]')
    tags = narrow_by_tag.find_element_by_xpath(
        './/div[@class="block_content block_content_inner"]/div'
    ).find_elements_by_tag_name("div")
    # Specify index of web elements of interest (top 16 tags)
    tag_indexes = range(0, 16)
    # Scrape aggregated counts by tag
    for tag_index in tag_indexes:
        single_tag = (
            tags[tag_index]
            .find_element_by_tag_name("span")
            .find_elements_by_tag_name("span")
        )
        print(f"{tag_index}. {single_tag[2].text} = {single_tag[3].text}")
        value = int(single_tag[3].text.replace(",", "")) if single_tag[3].text else 0
        tags_dict[single_tag[2].text] = value
    return tags_dict


def filter_and_count_by_tag(
    driver,
    filter_indexes,
    min_wait_to_update=2.5,
    max_wait_to_update=6.4,
    filter_type="OS",
):
    """Filter listings and get number of listings by tag and overall."""
    all_tags_by_filter = []
    num_listings_by_filter = []
    for filter_index in filter_indexes:
        single_filter = (
            filters[filter_index]
            .find_element_by_tag_name("span")
            .find_elements_by_tag_name("span")
        )
        print(f"Filter Counter = {filter_index}, Filtered by = {single_filter[0].text}")
        # Apply filter to listings (will also update tag counts)
        single_filter[0].click()
        # Pause for counts to update
        time.sleep(uniform(2.5, 6.4))
        # Scroll down
        scroll_up_down_page(
            driver,
            by_how_much=22,
            min_num_pauses=1,
            max_num_pauses=4,
            min_pause=0.1,
            max_pause=2.4,
            scroll_method="slow",
            scroll_direction="down",
        )
        # Get count of tags for filtered listings
        tags_by_filter = get_counts_by_tag(driver, show_more_tags=False)
        # Scroll up
        scroll_up_down_page(
            driver,
            by_how_much=22,
            min_num_pauses=1,
            max_num_pauses=4,
            min_pause=0.1,
            max_pause=2.4,
            scroll_method="slow",
            scroll_direction="up",
        )
        # Pause after scrolling up
        time.sleep(uniform(1, 2))
        # Get count of filtered listings
        try:
            num_listings = driver.find_element_by_xpath(
                './/div[@class="search_results_filtered_warning"]/div'
            ).text
        except Exception:
            num_listings = driver.find_element_by_xpath(
                './/div[@class="search_results"]/div'
            ).text
        print(f"After filtering by {single_filter[0].text}, {num_listings}\n")
        num_listings_by_filter.append(
            {
                "filtered_by": single_filter[0].text,
                "filter_type": filter_type,
                "num_listings": num_listings,
            }
        )
        all_tags_by_filter.append({single_filter[0].text: tags_by_filter})
        # Remove filter to listings (will also restore tag counts)
        single_filter[0].click()
        # Pause for counts to update
        time.sleep(uniform(min_wait_to_update, max_wait_to_update))
    # Convert list of nested dictionaries to a dictionary of DataFrames
    # - produces one DataFrame (value) per unique filter applied (key)
    d_filter = {}
    for k, _ in enumerate(all_tags_by_filter):
        filter_value = list(all_tags_by_filter[k].keys())[0]
        df_tags_by_filter = (
            pd.DataFrame.from_dict(
                list(all_tags_by_filter[k].values())[0], orient="index"
            )
            .assign(filtered_by=filter_value)
            .assign(filter_type=filter_type)
            .reset_index()
            .rename(columns={0: "num_listings", "index": "tag"})
        )
        d_filter[filter_value] = df_tags_by_filter
    df_filtered_tag_counts = pd.concat(list(d_filter.values()), ignore_index=True)
    df_filtered_listings_counts = pd.DataFrame.from_records(num_listings_by_filter)
    return [df_filtered_tag_counts, df_filtered_listings_counts]


def filter_by_tag_and_count_by_tag(
    driver,
    filters,
    filter_indexes,
    min_wait_to_update=2.5,
    max_wait_to_update=6.4,
    filter_type="OS",
):
    """Get number of listings by tag."""
    num_listings_by_filter = []
    for filter_index in filter_indexes:
        single_filter = (
            filters[filter_index]
            .find_element_by_tag_name("span")
            .find_elements_by_tag_name("span")
        )
        print(f"Filter Counter = {filter_index}, Filtered by = {single_filter[0].text}")
        # Apply filter to listings (will also update tag counts)
        single_filter[0].click()
        # Pause for counts to update
        time.sleep(uniform(2.5, 6.4))
        # Scroll down
        scroll_up_down_page(
            driver,
            by_how_much=22,
            min_num_pauses=1,
            max_num_pauses=4,
            min_pause=0.1,
            max_pause=2.4,
            scroll_method="slow",
            scroll_direction="down",
        )
        # Get count of filtered listings
        try:
            num_listings = driver.find_element_by_xpath(
                './/div[@class="search_results_filtered_warning"]/div'
            ).text
        except Exception:
            num_listings = driver.find_element_by_xpath(
                './/div[@class="search_results"]/div'
            ).text
        # Scroll up
        scroll_up_down_page(
            driver,
            by_how_much=22,
            min_num_pauses=1,
            max_num_pauses=4,
            min_pause=0.1,
            max_pause=2.4,
            scroll_method="slow",
            scroll_direction="up",
        )
        # Pause after scrolling up
        print(f"After filtering by {single_filter[0].text}, {num_listings}\n")
        num_listings_by_filter.append(
            {
                "filtered_by": single_filter[0].text,
                "filter_type": filter_type,
                "num_listings": num_listings,
            }
        )
        # Remove filter to listings (will also restore tag counts)
        single_filter[0].click()
        # Pause for counts to update
        time.sleep(uniform(min_wait_to_update, max_wait_to_update))
    df_filtered_listings_counts = pd.DataFrame.from_records(num_listings_by_filter)
    return df_filtered_listings_counts


def filter_and_count_by_tag_with_slider(
    driver,
    move,
    slider,
    price_filter_slider_offset_moves,
    min_wait_to_update=2.5,
    max_wait_to_update=6.4,
    filter_type="OS",
):
    """Filter listings with a range slider and get number of listings by tag and overall."""
    all_tags_by_filter = []
    num_listings_by_filter = []
    for filter_index, (price_slider_filter, price_slider_max_price) in enumerate(
        price_filter_slider_offset_moves.items()
    ):
        # Apply filter to listings (will also update tag counts)
        move.click_and_hold(slider).move_by_offset(
            price_slider_filter, 0
        ).release().perform()
        # Pause for counts to update
        time.sleep(uniform(2.5, 6.4))
        print(
            f"Filter Counter = {filter_index}, Filtered by = {price_slider_max_price}"
        )
        # Scroll down
        scroll_up_down_page(
            driver,
            by_how_much=22,
            min_num_pauses=1,
            max_num_pauses=4,
            min_pause=0.1,
            max_pause=2.4,
            scroll_method="slow",
            scroll_direction="down",
        )
        # Get count of tags for filtered listings
        tags_by_filter = get_counts_by_tag(driver, show_more_tags=False)
        # Scroll up
        scroll_up_down_page(
            driver,
            by_how_much=22,
            min_num_pauses=1,
            max_num_pauses=4,
            min_pause=0.1,
            max_pause=2.4,
            scroll_method="slow",
            scroll_direction="up",
        )
        # Pause after scrolling up
        time.sleep(uniform(1, 2))
        # Get count of filtered listings
        try:
            num_listings = driver.find_element_by_xpath(
                './/div[@class="search_results_filtered_warning"]/div'
            ).text
        except Exception:
            num_listings = driver.find_element_by_xpath(
                './/div[@class="search_results"]/div'
            ).text
        print(f"After filtering by {price_slider_max_price}, {num_listings}\n")
        num_listings_by_filter.append(
            {
                "filtered_by": price_slider_max_price,
                "filter_type": filter_type,
                "num_listings": num_listings,
            }
        )
        all_tags_by_filter.append({price_slider_max_price: tags_by_filter})
        # Pause for counts to update
        time.sleep(uniform(min_wait_to_update, max_wait_to_update))
    # Reset the slider
    move.click_and_hold(price_slider).move_by_offset(100, 0).release().perform()
    # Convert list of nested dictionaries to a dictionary of DataFrames
    # - produces one DataFrame (value) per unique filter applied (key)
    d_filter = {}
    for k, _ in enumerate(all_tags_by_filter):
        filter_value = list(all_tags_by_filter[k].keys())[0]
        df_tags_by_filter = (
            pd.DataFrame.from_dict(
                list(all_tags_by_filter[k].values())[0], orient="index"
            )
            .assign(filtered_by=filter_value)
            .assign(filter_type=filter_type)
            .reset_index()
            .rename(columns={0: "num_listings", "index": "tag"})
        )
        d_filter[filter_value] = df_tags_by_filter
    df_filtered_tag_counts = pd.concat(list(d_filter.values()), ignore_index=True)
    df_filtered_listings_counts = pd.DataFrame.from_records(num_listings_by_filter)
    return [df_filtered_tag_counts, df_filtered_listings_counts]

<a id="launch-browser"></a>

## 2. [Launch Browser](#launch-browser)

In [9]:
driver = webdriver.Chrome(executable_path=webdriver_path, options=options)
driver.delete_all_cookies()

In [61]:
driver = load_games_search_page(driver, url)
# soup = BeautifulSoup(driver.page_source, "html.parser")
# print(soup.prettify())

<html class="responsive" lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="#171a21" name="theme-color"/>
  <title>
   Steam Search
  </title>
  <link href="/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="https://store.cloudflare.steamstatic.com/public/shared/css/motiva_sans.css?v=2C1Oh9QFVTyK&amp;l=english&amp;_cdn=cloudflare" rel="stylesheet" type="text/css"/>
  <link href="https://store.cloudflare.steamstatic.com/public/shared/css/shared_global.css?v=Xn8C75dyhnl5&amp;l=english&amp;_cdn=cloudflare" rel="stylesheet" type="text/css"/>
  <link href="https://store.cloudflare.steamstatic.com/public/shared/css/buttons.css?v=hFJKQ6HV7IKT&amp;l=english&amp;_cdn=cloudflare" rel="stylesheet" type="text/css"/>
  <link href="https://store.cloudflare.steamstatic.com/public/css/v6/store.css?v=N61SSgrXjikp&amp;l=english&amp;_cdn=cloudflare" rel="styles

<a id="get-aggregated-data-for-listings"></a>

## 3. [Get Aggregated Data for Listings](#get-aggregated-data-for-listings)

<a id="display-all-listing-tags"></a>

### 3.1. [Display all listing tags](#display-all-listing-tags)

Only five tags are displayed in the *Narrow by tag* section. We'll want to gather aggregated data for all tags available since they are sorted in descending order of most assignments by users (i.e. descending order of popularity). So, we'll start by clicking the *SEE ALL* link to display all the these top tags

In [197]:
driver.find_element_by_xpath('.//a[@class="see_all_expander"]').click()

<a id="filtered-by-operating-system"></a>

### 3.2. [Filtered by Operating System](#filtered-by-operating-system)

In [16]:
%%time
filter_type = "OS"
# Get list of filter section web elements
narrow_by_filter = driver.find_element_by_xpath('.//div[@data-collapse-name="os"]')
filters = narrow_by_filter.find_element_by_xpath(
    './/div[@class="block_content block_content_inner"]'
).find_elements_by_tag_name("div")
# Scrape filtered counts
df_tag_counts_by_os, df_listings_by_os = filter_and_count_by_tag(
    driver, range(len(filters)), 1.4, 2, filter_type
)
show_df(df_tag_counts_by_os)
show_df_dtypes_nans(df_tag_counts_by_os)

Filter Counter = 0, Filtered by = Windows
0. Indie = 38,213
1. Action = 26,908
2. Singleplayer = 26,792
3. Adventure = 24,861
4. Casual = 23,941
5. 2D = 13,942
6. Strategy = 12,517
7. Simulation = 12,206
8. RPG = 11,113
9. Atmospheric = 9,782
10. Puzzle = 9,181
11. Story Rich = 7,933
12. 3D = 7,719
13. Pixel Graphics = 7,217
14. Multiplayer = 7,193
15. Colorful = 7,051
After filtering by OS, 57,608 results match your search. 1,890 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.

Filter Counter = 1, Filtered by = macOS
0. Indie = 9,907
1. Singleplayer = 6,238
2. Adventure = 5,956
3. Casual = 5,832
4. Action = 5,288
5. 2D = 4,091
6. Strategy = 3,340
7. Puzzle = 2,769
8. Simulation = 2,699
9. RPG = 2,653
10. Atmospheric = 2,496
11. Story Rich = 2,428
12. Pixel Graphics = 1,947
13. Colorful = 1,752
14. Great Soundtrack = 1,747
15. Multiplayer = 1,741
After filtering by OS, 13,376 results match your search. 324 ti

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Indie,38213,Windows,OS
1,Action,26908,Windows,OS
2,Singleplayer,26792,Windows,OS
3,Adventure,24861,Windows,OS
4,Casual,23941,Windows,OS
5,2D,13942,Windows,OS
6,Strategy,12517,Windows,OS
7,Simulation,12206,Windows,OS
8,RPG,11113,Windows,OS
9,Atmospheric,9782,Windows,OS


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


CPU times: user 2.13 s, sys: 75.7 ms, total: 2.2 s
Wall time: 46.1 s


In [18]:
show_df(df_listings_by_os, None)
show_df_dtypes_nans(df_listings_by_os)

Unnamed: 0,filtered_by,filter_type,num_listings
0,Windows,OS,"57,608 results match your search. 1,890 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
1,macOS,OS,"13,376 results match your search. 324 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
2,SteamOS + Linux,OS,"8,692 results match your search. 260 titles have been excluded based on your preferences."


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


<a id="filtered-by-language"></a>

### 3.3. [Filtered by Language](#filtered-by-language)

In [19]:
%%time
filter_type = "Language"
# (OPTIONAL) Expand filter section
driver.find_element_by_xpath(
    './/div[@id="narrow_language"]/following-sibling::a[@class="see_all_expander"]'
).click()
# Get list of filter section web elements
narrow_by_filter = driver.find_element_by_xpath('.//div[@id="narrow_language"]')
filters = narrow_by_filter.find_elements_by_tag_name("div")
# Scrape filtered counts
df_tag_counts_by_language, df_listings_by_language = filter_and_count_by_tag(
    driver, range(len(filters)), 0.7, 3.8, filter_type
)

Filter Counter = 0, Filtered by = English
0. Indie = 37,001
1. Action = 26,502
2. Singleplayer = 25,958
3. Adventure = 23,866
4. Casual = 23,107
5. 2D = 13,421
6. Strategy = 11,956
7. Simulation = 11,732
8. RPG = 10,116
9. Atmospheric = 9,670
10. Puzzle = 8,981
11. 3D = 7,525
12. Story Rich = 7,504
13. Multiplayer = 7,090
14. Pixel Graphics = 6,985
15. Colorful = 6,955
After filtering by Language, 55,628 results match your search. 1,833 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.

Filter Counter = 1, Filtered by = Simplified Chinese
0. Indie = 8,093
1. Singleplayer = 6,810
2. Adventure = 5,851
3. Casual = 5,491
4. Action = 5,421
5. 2D = 3,313
6. RPG = 3,239
7. Strategy = 3,216
8. Simulation = 3,103
9. Atmospheric = 2,725
10. Puzzle = 2,444
11. Story Rich = 2,231
12. Multiplayer = 1,933
13. Cute = 1,902
14. Anime = 1,764
15. Colorful = 1,721
After filtering by Language, 12,155 results match your search. 58

In [21]:
show_df(df_tag_counts_by_language, None)
show_df_dtypes_nans(df_tag_counts_by_language)

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Indie,37001,English,Language
1,Action,26502,English,Language
2,Singleplayer,25958,English,Language
3,Adventure,23866,English,Language
4,Casual,23107,English,Language
5,2D,13421,English,Language
6,Strategy,11956,English,Language
7,Simulation,11732,English,Language
8,RPG,10116,English,Language
9,Atmospheric,9670,English,Language


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


In [22]:
show_df(df_listings_by_language, None)
show_df_dtypes_nans(df_listings_by_language)

Unnamed: 0,filtered_by,filter_type,num_listings
0,English,Language,"55,628 results match your search. 1,833 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
1,Simplified Chinese,Language,"12,155 results match your search. 580 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
2,Traditional Chinese,Language,"5,586 results match your search. 418 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
3,Japanese,Language,"9,385 results match your search. 575 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
4,Korean,Language,"5,862 results match your search. 357 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
5,Thai,Language,"1,390 results match your search. 247 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
6,Bulgarian,Language,"1,019 results match your search. 215 titles have been excluded based on your preferences."
7,Czech,Language,"2,108 results match your search. 215 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
8,Danish,Language,"1,433 results match your search. 220 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
9,German,Language,"13,801 results match your search. 381 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


<a id="filtered-by-supported-number-of-players"></a>

### 3.4. [Filtered by Supported Number of Players](#filtered-by-supported-number-of-players)

In [25]:
%%time
filter_type = "number of players"
# (OPTIONAL) Expand filter section
driver.find_element_by_xpath('.//div[@data-collapse-name="category3"]').click()
# Get list of filter section web elements
narrow_by_filter = driver.find_element_by_xpath('.//div[@data-collapse-name="category3"]')
filters = narrow_by_filter.find_element_by_xpath(
    './/div[@class="block_content block_content_inner"]'
).find_elements_by_tag_name("div")
# Scrape filtered counts
df_tag_counts_by_num_players, df_listings_by_num_players = filter_and_count_by_tag(
    driver, range(len(filters)), 2.4, 5, filter_type
)
show_df(df_tag_counts_by_num_players)
show_df_dtypes_nans(df_tag_counts_by_num_players)

Filter Counter = 0, Filtered by = Single-player
0. Indie = 36,470
1. Singleplayer = 26,538
2. Action = 24,861
3. Adventure = 24,189
4. Casual = 22,852
5. 2D = 13,549
6. Simulation = 11,767
7. Strategy = 11,737
8. RPG = 10,499
9. Atmospheric = 9,552
10. Puzzle = 9,079
11. Story Rich = 7,848
12. 3D = 7,143
13. Pixel Graphics = 7,018
14. Colorful = 6,717
15. Exploration = 6,339
After filtering by Single-player, 54,609 results match your search. 1,880 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.

Filter Counter = 1, Filtered by = Multi-player
0. Action = 8,335
1. Indie = 7,377
2. Multiplayer = 6,376
3. Singleplayer = 4,563
4. Casual = 4,418
5. Strategy = 3,574
6. Adventure = 3,550
7. Co-op = 3,371
8. PvP = 3,044
9. Simulation = 2,593
10. Early Access = 2,521
11. 2D = 2,381
12. Local Multiplayer = 2,280
13. RPG = 2,253
14. Shooter = 2,088
15. Arcade = 2,042
After filtering by Multi-player, 12,860 results match 

In [26]:
show_df(df_listings_by_num_players, None)
show_df_dtypes_nans(df_listings_by_num_players)

Unnamed: 0,filtered_by,filter_type,num_listings
0,Single-player,number of players,"54,609 results match your search. 1,880 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
1,Multi-player,number of players,"12,860 results match your search. 40 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
2,PvP,number of players,"8,128 results match your search. 18 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
3,Online PvP,number of players,"5,793 results match your search. 15 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
4,LAN PvP,number of players,"566 results match your search. 2 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
5,Shared/Split Screen PvP,number of players,"3,449 results match your search. 8 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
6,Co-op,number of players,"6,417 results match your search. 26 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
7,Online Co-op,number of players,"3,531 results match your search. 13 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
8,LAN Co-op,number of players,"530 results match your search. 1 title has been excluded based on your preferences. However, this title would not have appeared on the first page of results."
9,Shared/Split Screen Co-op,number of players,"2,721 results match your search. 15 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


<a id="filtered-by-supported-number-of-feature"></a>

### 3.5. [Filtered by Feature](#filtered-by-supported-number-of-feature)

In [37]:
%%time
filter_type = "feature"
# (OPTIONAL) Expand filter section
driver.find_element_by_xpath('.//div[@data-collapse-name="category2"]').click()
# Get list of filter section web elements
narrow_by_filter = driver.find_element_by_xpath('.//div[@data-collapse-name="category2"]')
filters = narrow_by_filter.find_element_by_xpath(
    './/div[@class="block_content block_content_inner"]'
).find_elements_by_tag_name("div")
# Scrape filtered counts
df_tag_counts_by_feature, df_listings_by_feature = filter_and_count_by_tag(
    driver, filter_indexes_dict[filter_type], 2.4, 5.4, filter_type
)
show_df(df_tag_counts_by_feature)
show_df_dtypes_nans(df_tag_counts_by_feature)

Filter Counter = 0, Filtered by = Played with Steam Controller
0. Singleplayer = 1,877
1. Action = 1,798
2. Adventure = 1,369
3. Multiplayer = 1,227
4. Indie = 925
5. Great Soundtrack = 859
6. Atmospheric = 813
7. RPG = 782
8. Open World = 747
9. Co-op = 742
10. Story Rich = 732
11. Third Person = 623
12. 2D = 571
13. Simulation = 544
14. Funny = 527
15. Anime = 515
After filtering by Played with Steam Controller, 2,436 results match your search. 15 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.

Filter Counter = 2, Filtered by = Steam Achievements
0. Indie = 19,819
1. Singleplayer = 13,985
2. Action = 13,442
3. Adventure = 12,250
4. Casual = 11,657
5. 2D = 7,806
6. Strategy = 5,878
7. Atmospheric = 5,547
8. Puzzle = 5,450
9. Simulation = 5,250
10. RPG = 5,193
11. Story Rich = 4,495
12. Pixel Graphics = 4,116
13. Colorful = 3,812
14. Multiplayer = 3,574
15. Arcade = 3,532
After filtering by Steam Achievement

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Singleplayer,1877,Played with Steam Controller,feature
1,Action,1798,Played with Steam Controller,feature
2,Adventure,1369,Played with Steam Controller,feature
3,Multiplayer,1227,Played with Steam Controller,feature
4,Indie,925,Played with Steam Controller,feature
5,Great Soundtrack,859,Played with Steam Controller,feature
6,Atmospheric,813,Played with Steam Controller,feature
7,RPG,782,Played with Steam Controller,feature
8,Open World,747,Played with Steam Controller,feature
9,Co-op,742,Played with Steam Controller,feature


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


CPU times: user 7.71 s, sys: 259 ms, total: 7.97 s
Wall time: 5min 42s


In [39]:
show_df(df_listings_by_feature, None)
show_df_dtypes_nans(df_listings_by_feature)

Unnamed: 0,filtered_by,filter_type,num_listings
0,Played with Steam Controller,feature,"2,436 results match your search. 15 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
1,Steam Achievements,feature,"27,344 results match your search. 956 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
2,Full controller support,feature,"13,099 results match your search. 97 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
3,Steam Trading Cards,feature,"8,927 results match your search. 133 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
4,Captions available,feature,"1,301 results match your search. 21 titles have been excluded based on your preferences."
5,Steam Workshop,feature,"1,555 results match your search. 12 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
6,SteamVR Collectibles,feature,42 results match your search.
7,Partial Controller Support,feature,"8,362 results match your search. 151 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
8,Steam Cloud,feature,"13,822 results match your search. 326 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
9,Valve Anti-Cheat enabled,feature,112 results match your search.


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


<a id="filtered-by-tag"></a>

### 3.6. [Filtered by Tag](#filtered-by-tag)

In [59]:
%%time
filter_type = "tag"
# Get list of filter section web elements
narrow_by_filter = driver.find_element_by_xpath('.//div[@data-collapse-name="tags"]')
filters = narrow_by_filter.find_element_by_xpath(
    './/div[@class="block_content block_content_inner"]/div'
).find_elements_by_tag_name("div")
# Scrape filtered counts
df_listings_by_tag = filter_by_tag_and_count_by_tag(
    driver,
    filters,
    range(0, 16),
    min_wait_to_update=1.5,
    max_wait_to_update=3.4,
    filter_type="tag",
)
show_df(df_listings_by_tag)
show_df_dtypes_nans(df_listings_by_tag)

Filter Counter = 0, Filtered by = Indie 38,221
After filtering by Indie, 38,220 results match your search. 1,137 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.

Filter Counter = 1, Filtered by = Action 26,922
After filtering by Action, 26,922 results match your search. 391 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.

Filter Counter = 2, Filtered by = Singleplayer 26,805
After filtering by Singleplayer, 26,805 results match your search. 1,119 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.

Filter Counter = 3, Filtered by = Adventure 24,870
After filtering by Adventure, 24,870 results match your search. 698 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.

Filter Counter = 4, 

Unnamed: 0,filtered_by,filter_type,num_listings
0,Indie,tag,"38,220 results match your search. 1,137 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
1,Action,tag,"26,922 results match your search. 391 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
2,Singleplayer,tag,"26,805 results match your search. 1,119 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
3,Adventure,tag,"24,870 results match your search. 698 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
4,Casual,tag,"23,948 results match your search. 1,286 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
5,2D,tag,"13,950 results match your search. 680 titles have been excluded based on your preferences."
6,Strategy,tag,"12,522 results match your search. 217 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
7,Simulation,tag,"12,210 results match your search. 590 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
8,RPG,tag,"11,117 results match your search. 500 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
9,Atmospheric,tag,"9,786 results match your search. 177 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


CPU times: user 214 ms, sys: 3.09 ms, total: 217 ms
Wall time: 1min 51s


<a id="filtered-by-virtual-reality-support"></a>

### 3.7. [Filtered by Virtual Reality Support](#filtered-by-virtual-reality-support)

In [113]:
%%time
filter_type = "VR Support"
# (OPTIONAL) Expand filter section
driver.find_element_by_xpath('.//div[@data-collapse-name="vrsupport"]').click()
# Get list of filter section web elements
narrow_by_filter = driver.find_element_by_xpath('.//div[@data-collapse-name="vrsupport"]')
filters = narrow_by_filter.find_element_by_xpath(
    './/div[@class="block_content block_content_inner"]'
).find_elements_by_class_name("tab_filter_control_row")
# Scrape filtered counts
df_tag_counts_by_vr_support, df_listings_by_vr_support = filter_and_count_by_tag(
    driver, range(len(filters)), 1.4, 2, filter_type
)
show_df(df_tag_counts_by_vr_support)
show_df_dtypes_nans(df_tag_counts_by_vr_support)

Filter Counter = 0, Filtered by = VR Only
0. VR = 3,841
1. Indie = 2,509
2. Action = 2,167
3. Casual = 1,879
4. Adventure = 1,422
5. Simulation = 1,401
6. Singleplayer = 1,084
7. Early Access = 923
8. First-Person = 648
9. Sports = 550
10. Strategy = 435
11. Shooter = 432
12. 3D = 416
13. FPS = 390
14. Free to Play = 382
15. Atmospheric = 376
After filtering by VR Only, 4,104 results match your search. 57 titles have been excluded based on your preferences.

Filter Counter = 1, Filtered by = VR Supported
0. VR = 4,456
1. Indie = 3,029
2. Action = 2,566
3. Casual = 2,163
4. Simulation = 1,712
5. Adventure = 1,685
6. Singleplayer = 1,368
7. Early Access = 1,094
8. First-Person = 886
9. Sports = 650
10. Strategy = 553
11. 3D = 533
12. Shooter = 512
13. Atmospheric = 508
14. FPS = 474
15. Free to Play = 448
After filtering by VR Supported, 4,865 results match your search. 88 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,VR,3841,VR Only,VR Support
1,Indie,2509,VR Only,VR Support
2,Action,2167,VR Only,VR Support
3,Casual,1879,VR Only,VR Support
4,Adventure,1422,VR Only,VR Support
5,Simulation,1401,VR Only,VR Support
6,Singleplayer,1084,VR Only,VR Support
7,Early Access,923,VR Only,VR Support
8,First-Person,648,VR Only,VR Support
9,Sports,550,VR Only,VR Support


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


CPU times: user 6.55 s, sys: 261 ms, total: 6.81 s
Wall time: 3min 58s


In [114]:
show_df(df_listings_by_vr_support, None)
show_df_dtypes_nans(df_listings_by_vr_support)

Unnamed: 0,filtered_by,filter_type,num_listings
0,VR Only,VR Support,"4,104 results match your search. 57 titles have been excluded based on your preferences."
1,VR Supported,VR Support,"4,865 results match your search. 88 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
2,Valve Index,VR Support,"4,295 results match your search. 85 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
3,HTC Vive,VR Support,"4,635 results match your search. 86 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
4,Oculus Rift,VR Support,"3,598 results match your search. 83 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
5,Windows Mixed Reality,VR Support,"1,415 results match your search. 19 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
6,Tracked Motion Controllers,VR Support,"4,584 results match your search. 84 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
7,Gamepad,VR Support,570 results match your search.
8,Keyboard / Mouse,VR Support,"672 results match your search. 14 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
9,Seated,VR Support,"2,827 results match your search. 77 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


<a id="filtered-by-price"></a>

### 3.8. [Filtered by Price](#filtered-by-price)

Filtering by price requires use of one-way slider that controls the maximum listing price to the displayed. Before filtering listings by price, we'll refresh the browser before manipulating this slider. Through trial-and-error, this was found to help reduce/remove stickyness in the slider movement (automated by the Selenium webdriver) when changing the maximum price filters.

In [None]:
driver.refresh()

In [199]:
%%time
narrow_by_filter = driver.find_element_by_xpath('.//div[@data-collapse-name="price"]')
move = ActionChains(driver)
price_slider = narrow_by_filter.find_element_by_xpath(
    './/div[@class="block_content block_content_inner"]/div/div/input'
)

The code below can be used to undo a slider movement and could be useful if the helper function (called next) errors out leaving the web driver in an undesirable/unusable state (eg. with the wrong type of filter applied to one of the checkboxes)

In [199]:
# move.click_and_hold(price_slider).move_by_offset(100, 0).release().perform()
# time.sleep(3)

CPU times: user 0 ns, sys: 3.65 ms, total: 3.65 ms
Wall time: 3.59 s


In [201]:
%%time
df_tag_counts_by_price, df_listings_by_price = filter_and_count_by_tag_with_slider(
    driver,
    move,
    price_slider,
    price_slider_range_filters,
    min_wait_to_update=2.5,
    max_wait_to_update=6.4,
    filter_type="Price",
)
show_df(df_tag_counts_by_price)
show_df_dtypes_nans(df_tag_counts_by_price)

Filter Counter = 0, Filtered by = 72
0. Indie = 29,452
1. Action = 19,508
2. Singleplayer = 19,076
3. Casual = 18,633
4. Adventure = 18,042
5. 2D = 10,054
6. Strategy = 9,160
7. Simulation = 8,862
8. RPG = 7,495
9. Puzzle = 7,152
10. Atmospheric = 6,622
11. Story Rich = 5,150
12. Pixel Graphics = 5,102
13. Arcade = 4,818
14. Early Access = 4,762
15. Colorful = 4,605
After filtering by 72, 42,115 results match your search. 1,549 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.

Filter Counter = 1, Filtered by = 66
0. Indie = 29,440
1. Action = 19,454
2. Singleplayer = 19,011
3. Casual = 18,626
4. Adventure = 18,007
5. 2D = 10,045
6. Strategy = 9,134
7. Simulation = 8,828
8. RPG = 7,464
9. Puzzle = 7,149
10. Atmospheric = 6,597
11. Story Rich = 5,122
12. Pixel Graphics = 5,099
13. Arcade = 4,809
14. Early Access = 4,760
15. Colorful = 4,601
After filtering by 66, 42,025 results match your search. 1,547 titles ha

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Indie,29452,72,Price
1,Action,19508,72,Price
2,Singleplayer,19076,72,Price
3,Casual,18633,72,Price
4,Adventure,18042,72,Price
5,2D,10054,72,Price
6,Strategy,9160,72,Price
7,Simulation,8862,72,Price
8,RPG,7495,72,Price
9,Puzzle,7152,72,Price


Unnamed: 0,num_missing,dtype
filtered_by,0,int64
filter_type,0,object
num_listings,0,object


CPU times: user 5.51 s, sys: 247 ms, total: 5.75 s
Wall time: 5min 23s


In [202]:
show_df(df_listings_by_price, None)
show_df_dtypes_nans(df_listings_by_price)

Unnamed: 0,filtered_by,filter_type,num_listings
0,72,Price,"42,115 results match your search. 1,549 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
1,66,Price,"42,025 results match your search. 1,547 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
2,60,Price,"41,990 results match your search. 1,547 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
3,54,Price,"41,871 results match your search. 1,546 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
4,48,Price,"41,763 results match your search. 1,539 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
5,42,Price,"41,473 results match your search. 1,532 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
6,36,Price,"41,206 results match your search. 1,524 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
7,30,Price,"40,603 results match your search. 1,506 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
8,24,Price,"39,701 results match your search. 1,488 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
9,18,Price,"36,653 results match your search. 1,400 titles have been excluded based on your preferences."


Unnamed: 0,num_missing,dtype
filtered_by,0,int64
filter_type,0,object
num_listings,0,object


<a id="vertically-concatenate-the-individual-filtered-datasets"></a>

### 3.9. [Vertically concatenate the individual filtered datasets](#vertically-concatenate-the-individual-filtered-datasets)

In [207]:
dfs_tag_counts = [
    df_tag_counts_by_os,
    df_tag_counts_by_language,
    df_tag_counts_by_num_players,
    df_tag_counts_by_feature,
    df_tag_counts_by_vr_support,
    df_tag_counts_by_price,
]
dfs_listings = [
    df_listings_by_os,
    df_listings_by_language,
    df_listings_by_num_players,
    df_listings_by_feature,
    df_listings_by_vr_support,
    df_listings_by_price,
    df_listings_by_tag,
]

In [214]:
df_tag_counts_all_filters = pd.concat(dfs_tag_counts, ignore_index=True)
assert df_tag_counts_all_filters["filter_type"].nunique() == len(dfs_tag_counts)
show_df(df_tag_counts_all_filters)
show_df_dtypes_nans(df_tag_counts_all_filters)

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Indie,38213,Windows,OS
1,Action,26908,Windows,OS
2,Singleplayer,26792,Windows,OS
3,Adventure,24861,Windows,OS
4,Casual,23941,Windows,OS
5,2D,13942,Windows,OS
6,Strategy,12517,Windows,OS
7,Simulation,12206,Windows,OS
8,RPG,11113,Windows,OS
9,Atmospheric,9782,Windows,OS


Unnamed: 0,num_missing,dtype
tag,0,object
num_listings,0,int64
filtered_by,0,object
filter_type,0,object


In [218]:
df_listings_all_filters = pd.concat(dfs_listings, ignore_index=True)
assert len(df_listings_all_filters) == len(df_listings_all_filters)
show_df(df_listings_all_filters)
show_df_dtypes_nans(df_listings_all_filters)

Unnamed: 0,filtered_by,filter_type,num_listings
0,Windows,OS,"57,608 results match your search. 1,890 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
1,macOS,OS,"13,376 results match your search. 324 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
2,SteamOS + Linux,OS,"8,692 results match your search. 260 titles have been excluded based on your preferences."
3,English,Language,"55,628 results match your search. 1,833 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
4,Simplified Chinese,Language,"12,155 results match your search. 580 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
5,Traditional Chinese,Language,"5,586 results match your search. 418 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
6,Japanese,Language,"9,385 results match your search. 575 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
7,Korean,Language,"5,862 results match your search. 357 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
8,Thai,Language,"1,390 results match your search. 247 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results."
9,Bulgarian,Language,"1,019 results match your search. 215 titles have been excluded based on your preferences."


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object


<a id="post-processing"></a>

### 3.10. [Post-Processing](#Post-Processing)

We'll apply a single [post-processing](https://link.springer.com/chapter/10.1007/3-540-44673-7_13) step to extract the number of displayed and excluded listings in the listings dataset. Currently, these are embedded in the string that was scraped from the search results page. Here, we'll extract them into separate columns. We'll also convert the datatype of each of these extracted columns from a string to an integer.

In [265]:
listings_cols = ["included", "excluded"]
df_listings_all_filters[listings_cols] = df_listings_all_filters[
    "num_listings"
].str.split("match your search. ", expand=True)
for col in listings_cols:
    df_listings_all_filters[col] = (
        df_listings_all_filters[col].str.replace(",", "").str.extract("(\d+)")
    ).fillna(0)
df_listings_all_filters = df_listings_all_filters.astype(
    {"included": int, "excluded": int}
)
show_df(df_listings_all_filters)
show_df_dtypes_nans(df_listings_all_filters)

Unnamed: 0,filtered_by,filter_type,num_listings,included,excluded
0,Windows,OS,"57,608 results match your search. 1,890 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",57608,1890
1,macOS,OS,"13,376 results match your search. 324 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",13376,324
2,SteamOS + Linux,OS,"8,692 results match your search. 260 titles have been excluded based on your preferences.",8692,260
3,English,Language,"55,628 results match your search. 1,833 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",55628,1833
4,Simplified Chinese,Language,"12,155 results match your search. 580 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",12155,580
5,Traditional Chinese,Language,"5,586 results match your search. 418 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",5586,418
6,Japanese,Language,"9,385 results match your search. 575 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",9385,575
7,Korean,Language,"5,862 results match your search. 357 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",5862,357
8,Thai,Language,"1,390 results match your search. 247 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",1390,247
9,Bulgarian,Language,"1,019 results match your search. 215 titles have been excluded based on your preferences.",1019,215


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object
included,0,int64
excluded,0,int64


<a id="export-to-disk"></a>

## 4. [Export to Disk](#export-to-disk)

We'll save the loaded data to a parquet file

We'll first change the datatype of the `filtered_by` column which is mixed with strings and integers since, when filtering by price, the price column appears as an integer (in the `filtered_by` column) as the associated web element was retrieved from an integer-based [range slider](https://freefrontend.com/css-range-sliders/).

In [266]:
for df in [df_listings_all_filters, df_tag_counts_all_filters]:
    df["filtered_by"] = df["filtered_by"].astype("str")
    show_df_dtypes_nans(df)

Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object
included,0,int64
excluded,0,int64


Unnamed: 0,num_missing,dtype
tag,0,object
num_listings,0,int64
filtered_by,0,object
filter_type,0,object


We'll now export the `DataFrame`s with the scraped data to disk

In [None]:
save_to_parquet_file(
    [df_tag_counts_all_filters, df_listings_all_filters],
    [tag_counts_filtered_filepath, num_listings_filtered_filepath],
)

As a sanity check, we'll re-load the files and show the datatypes

In [275]:
df_tag_counts_all_filters_reloaded = pd.read_parquet(
    glob(os.path.join(raw_data_dir, "*_by_tag.parquet.gzip"))[0]
)
show_df(df_tag_counts_all_filters_reloaded, 5)
show_df_dtypes_nans(df_tag_counts_all_filters_reloaded)

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Indie,38213,Windows,OS
1,Action,26908,Windows,OS
2,Singleplayer,26792,Windows,OS
3,Adventure,24861,Windows,OS
4,Casual,23941,Windows,OS
1323,3D,1009,0,Price
1324,Early Access,914,0,Price
1325,VR,848,0,Price
1326,Massively Multiplayer,841,0,Price
1327,Atmospheric,838,0,Price


Unnamed: 0,num_missing,dtype
tag,0,object
num_listings,0,int64
filtered_by,0,object
filter_type,0,object


In [277]:
df_listings_all_filters_reloaded = pd.read_parquet(
    glob(os.path.join(raw_data_dir, "*_listings.parquet.gzip"))[0]
)
show_df(df_listings_all_filters_reloaded, 5)
show_df_dtypes_nans(df_listings_all_filters_reloaded)

Unnamed: 0,filtered_by,filter_type,num_listings,included,excluded
0,Windows,OS,"57,608 results match your search. 1,890 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",57608,1890
1,macOS,OS,"13,376 results match your search. 324 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",13376,324
2,SteamOS + Linux,OS,"8,692 results match your search. 260 titles have been excluded based on your preferences.",8692,260
3,English,Language,"55,628 results match your search. 1,833 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",55628,1833
4,Simplified Chinese,Language,"12,155 results match your search. 580 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",12155,580
94,Story Rich,tag,"7,938 results match your search. 373 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",7938,373
95,3D,tag,"7,723 results match your search. 194 titles have been excluded based on your preferences.",7723,194
96,Pixel Graphics,tag,"7,220 results match your search. 92 titles have been excluded based on your preferences.",7220,92
97,Multiplayer,tag,"7,199 results match your search. 13 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",7199,13
98,Colorful,tag,"7,056 results match your search. 258 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",7056,258


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object
included,0,int64
excluded,0,int64


**Observations**
1. From visual inspection, we can see that the column datatypes are the same as those from the corresponding version of each `DataFrame` holding the raw data from the scraping code.

<a id="close-browser"></a>

## 5. [Close Browser](#close-browser)

In [278]:
driver.quit()

<a id="manually-retrieved-statistics"></a>

## 6. [Manually Retrieved Statistics](#manually-retrieved-statistics)

For each country, collect the number of listings by operating system

In [16]:
# 2021-10-13 23:00:00
country_os_dict = {
    "English": {"Win": 55629, "MacOS": 13196, "SteamOS+Linux": 8654},
    "Simplified Chinese": {"Win": 12159, "MacOS": 3184, "SteamOS+Linux": 1881},
    "Traditional Chinese": {"Win": 5586, "MacOS": 1424, "SteamOS+Linux": 808},
    "Japanese": {"Win": 9386, "MacOS": 2656, "SteamOS+Linux": 1589},
    "Korean": {"Win": 5863, "MacOS": 1883, "SteamOS+Linux": 1141},
    "Thai": {"Win": 1390, "MacOS": 416, "SteamOS+Linux": 278},
    "Bulgarian": {"Win": 1019, "MacOS": 307, "SteamOS+Linux": 247},
    "Czech": {"Win": 2108, "MacOS": 710, "SteamOS+Linux": 500},
    "Danish": {"Win": 1432, "MacOS": 466, "SteamOS+Linux": 325},
    "German": {"Win": 13799, "MacOS": 4424, "SteamOS+Linux": 2758},
    "Spanish (Spain)": {"Win": 12141, "MacOS": 3898, "SteamOS+Linux": 2492},
    "Spanish (Latin America)": {"Win": 2419, "MacOS": 656, "SteamOS+Linux": 429},
    "Greek": {"Win": 1172, "MacOS": 368, "SteamOS+Linux": 287},
    "French": {"Win": 13281, "MacOS": 4213, "SteamOS+Linux": 2661},
    "Italian": {"Win": 9310, "MacOS": 3019, "SteamOS+Linux": 1892},
    "Hungarian": {"Win": 1645, "MacOS": 522, "SteamOS+Linux": 417},
    "Dutch": {"Win": 2675, "MacOS": 1021, "SteamOS+Linux": 637},
    "Norwegian": {"Win": 1359, "MacOS": 419, "SteamOS+Linux": 316},
    "Polish": {"Win": 5441, "MacOS": 1664, "SteamOS+Linux": 1195},
    "Portuguese": {"Win": 3761, "MacOS": 1263, "SteamOS+Linux": 809},
    "Portuguese (Brazil)": {"Win": 6364, "MacOS": 2141, "SteamOS+Linux": 1460},
    "Romanian": {"Win": 1169, "MacOS": 367, "SteamOS+Linux": 277},
    "Russian": {"Win": 12887, "MacOS": 3909, "SteamOS+Linux": 2570},
    "Finnish": {"Win": 1347, "MacOS": 425, "SteamOS+Linux": 326},
    "Swedish": {"Win": 1750, "MacOS": 598, "SteamOS+Linux": 412},
    "Turkish": {"Win": 3325, "MacOS": 989, "SteamOS+Linux": 662},
    "Vietnamese": {"Win": 897, "MacOS": 245, "SteamOS+Linux": 163},
    "Ukranian": {"Win": 1651, "MacOS": 559, "SteamOS+Linux": 425},
}
dfs_country_os = []
for country, listings in country_os_dict.items():
    df_country_os = (pd.DataFrame.from_dict(listings, orient="index").T).assign(
        country=country
    )
    dfs_country_os.append(df_country_os)
df_country_os = pd.concat(dfs_country_os, ignore_index=True)
show_df(df_country_os)
show_df_dtypes_nans(df_country_os)

Unnamed: 0,Win,MacOS,SteamOS+Linux,country
0,55629,13196,8654,English
1,12159,3184,1881,Simplified Chinese
2,5586,1424,808,Traditional Chinese
3,9386,2656,1589,Japanese
4,5863,1883,1141,Korean
5,1390,416,278,Thai
6,1019,307,247,Bulgarian
7,2108,710,500,Czech
8,1432,466,325,Danish
9,13799,4424,2758,German


Unnamed: 0,num_missing,dtype
Win,0,int64
MacOS,0,int64
SteamOS+Linux,0,int64
country,0,object


Save to disk

In [None]:
save_to_parquet_file([df_country_os], [num_listings_country_os_filtered_filepath])

---

<span style="float:left">
    2021 | <a href="https://github.com/elsdes3/web-scraping">@elsdes3</a> (MIT)
</span>

<span style="float:right">
    <a href="./1_eda_aggregated.ipynb">1 - Exploratory Data Analysis of Aggregated data >></a>
</span>