# Exploratory Data Analysis - Aggregated Data

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import concurrent.futures
import os
import re
import time
from datetime import datetime
from glob import glob
from itertools import repeat

import pandas as pd
import requests

In [3]:
%aimport src.utils
from src.utils import show_df, show_df_dtypes_nans

<a href="table-of-contents"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Load Aggregated Datasets](#load-aggregated-datasets)
3. [Exploratory Data Analysis](#exploratory-data-analysis)
   - 3.1. [Tags](#tags)
   - 3.2. [Operating Systems and Languages](#operating-systems-and-languages)
   - 3.3. [Number of Players](#number-of-players)
   - 3.4. [Feature](#feature)
   - 3.5. [Tag](#tag)
4. [Exploring listing Filters](#exploring-listing-filters)
   - 4.1. [Prices](#prices)
   - 4.2. [Tags](#tags)
   - 4.3. [Filters](#filters)
5. [Conclusion](#conclusion)

<a id="about"></a>

## 0. [About](#about)

Explore the aggregated data, with and without various filters, about listings offered by Steam.

The objective of this notebook is to determine if we can apply any filters to the listings before scraping them. Since there tens of thousands of listings, filtering them would help reduce the number of pages of search results and the number of listings themselves that need to be scraped.

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

In [4]:
PROJ_ROOT_DIR = os.getcwd()

In [5]:
# PC marketshare
# # Global
url_global_pc_marketshare = (
    "https://gs.statcounter.com/os-market-share/desktop/worldwide/chart.php?device="
    "Desktop&device_hidden=desktop&statType_hidden=os_combined&region_hidden=ww"
    "&granularity=monthly&statType=Operating%20System&region=Worldwide&fromInt=201312"
    "&toInt=202109&fromMonthYear=2013-12&toMonthYear=2021-09&csv=1"
)
# # Per Country
countries = [
    "china",
    "japan",
    "south-korea",
    "thailand",
    "bulgaria",
    "czech-republic",
    "denmark",
    "germany",
    "south-america",
    "spain",
    "greece",
    "france",
    "italy",
    "hungary",
    "netherlands",
    "norway",
    "poland",
    "portugal",
    "romania",
    "russian-federation",
    "finland",
    "turkey",
    "viet-nam",
    "ukraine",
]

# Genres and Sub-Genres of listings on Steam store
list_of_genres = [
    "Action",
    "Role-Playing",
    "Strategy",
    "Adventure & Casual",
    "Simulation",
    "Sports & Racing",
]
list_of_sub_genres = [
    "Action Rogue-Like",
    "Arcade & Rhythm",
    "Beat'Em Up",
    "Fighting & Martial Arts",
    "First-Person Shooter",
    "Platformer & Runner",
    "Third-Person Shooter",
    "Adventure",
    "Adventure RPG",
    "Casual",
    "Metroidvania",
    "Puzzle",
    "Story-Rich",
    "Visual Novel",
    "Action RPG",
    "Adventure RPG",
    "JRPG",
    "Party-Based",
    "Rogue-Like",
    "Strategy RPG",
    "Turn-Based",
    "Building & Automation",
    "Business & Tycoon",
    "Dating",
    "Farming & Crafting",
    "Life & Immersive",
    "Sandbox & Physics",
    "Space & Flight",
    "Card & Board",
    "City & Settlement",
    "Grand & 4X",
    "Military",
    "Real-Time Strategy",
    "Tower Defense",
    "Turn-Bsed Strategy",
    "All Sports",
    "Fishing & Hunting",
    "Individual Sports",
    "Racing",
    "Racing Sim",
    "Sports Sim",
    "Team Sports",
]

In [6]:
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir = os.path.join(data_dir, "raw")

tag_counts_filtered_filename_no_ext = "aggregated_counts_by_tag"
num_listings_filtered_filename_no_ext = "aggregated_num_listings"
num_listings_country_os_filtered_filename_no_ext = "aggregated_num_listings_country_os"

filtered_tag_counts_filepath = glob(
    os.path.join(raw_data_dir, f"{tag_counts_filtered_filename_no_ext}.parquet.gzip")
)[0]
filtered_listing_counts_filepath = glob(
    os.path.join(raw_data_dir, f"{num_listings_filtered_filename_no_ext}.parquet.gzip")
)[0]
filtered_listing_counts_country_os_filepath = glob(
    os.path.join(
        raw_data_dir, f"{num_listings_country_os_filtered_filename_no_ext}.parquet.gzip"
    )
)[0]
# print(filtered_tag_counts_filepath)
# print(filtered_listing_counts_filepath)
# print(filtered_listing_counts_country_os_filepath)

pc_marketshare_dir = os.path.join(raw_data_dir, "pc-marketshare")
if not os.path.exists(pc_marketshare_dir):
    os.mkdir(pc_marketshare_dir)
global_pc_marketshare_filepath = os.path.join(pc_marketshare_dir, "pc_marketshare.csv")
# print(global_pc_marketshare_filepath)

In [7]:
def get_rolling(df, period=5):
    """Get rolling statistics for all columns."""
    return df.set_index("Date").rolling(period).mean().add_suffix(f"_{period}m")


def summarize_tags_per_group(df):
    """Get top five listings, number of unique filters and tags per filter."""
    return pd.DataFrame.from_dict(
        dict(
            top_five=",".join(
                [str(q) for q in df["num_listings"].nlargest(5).tolist()]
            ),
            num_sub_filters=df["filtered_by"].nunique(),
            num_tags_per_filter=df["tag"].nunique(),
        ),
        orient="index",
    ).T


def get_single_country_pc_marketshare(country, pc_marketshare_dir):
    """Get monthly historical PC marketshare stats for single country."""
    if not os.path.isdir():
        os.mkdir("pc-marketshare")
    country_filepath = os.path.join(pc_marketshare_dir, f"pc_marketshare_{country}.csv")
    if not os.path.exists(country_filepath):
        country_url = (
            "https://gs.statcounter.com/os-market-share/desktop/chart.php?device=Desktop&"
            "device_hidden=desktop&statType_hidden=os_combined&region_hidden=TD&"
            f"granularity=monthly&statType=Operating%20System&region={country}&"
            "fromInt=201312&toInt=202109&fromMonthYear=2013-12&toMonthYear=2021-09&csv=1"
        )
        try:
            r = requests.get(country_url)
            print(f"Downloaded data for {country}")
        except Exception:
            print(f"Could not download data for {country}")
        with open(country_filepath, "wb") as file:
            file.write(r.content)
    else:
        print(f"Skipped previously downloaded data for {country}")


def parallelize_processing(custom_func, countries):
    """Parallelize retrieval of PC marketshare stats for multiple countries."""
    MAX_THREADS = len(countries)
    args = [countries, repeat(pc_marketshare_dir)]
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        return executor.map(custom_func, *args)

<a id="load-aggregated-datasets"></a>

## 2. [Load Aggregated Datasets](#load-aggregated-datasets)

We'll start by loading the aggregated data for Steam game listings.

Loading the aggregated data showing the number of tag assignments by Steam users, after applying various types of filters

In [8]:
%%time
df_tags = pd.read_parquet(filtered_tag_counts_filepath, engine="auto")
show_df(df_tags, 5)
show_df_dtypes_nans(df_tags)

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Indie,38213,Windows,OS
1,Action,26908,Windows,OS
2,Singleplayer,26792,Windows,OS
3,Adventure,24861,Windows,OS
4,Casual,23941,Windows,OS
1323,3D,1009,0,Price
1324,Early Access,914,0,Price
1325,VR,848,0,Price
1326,Massively Multiplayer,841,0,Price
1327,Atmospheric,838,0,Price


Unnamed: 0,num_missing,dtype
tag,0,object
num_listings,0,int64
filtered_by,0,object
filter_type,0,object


CPU times: user 192 ms, sys: 2.3 ms, total: 195 ms
Wall time: 191 ms


Loading the aggregated data showing the number of game listings, after applying various types of filters

In [9]:
%%time
df_listings = pd.read_parquet(filtered_listing_counts_filepath, engine="auto")
show_df(df_listings, 5)
show_df_dtypes_nans(df_listings)

Unnamed: 0,filtered_by,filter_type,num_listings,included,excluded
0,Windows,OS,"57,608 results match your search. 1,890 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",57608,1890
1,macOS,OS,"13,376 results match your search. 324 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",13376,324
2,SteamOS + Linux,OS,"8,692 results match your search. 260 titles have been excluded based on your preferences.",8692,260
3,English,Language,"55,628 results match your search. 1,833 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",55628,1833
4,Simplified Chinese,Language,"12,155 results match your search. 580 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",12155,580
94,Story Rich,tag,"7,938 results match your search. 373 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",7938,373
95,3D,tag,"7,723 results match your search. 194 titles have been excluded based on your preferences.",7723,194
96,Pixel Graphics,tag,"7,220 results match your search. 92 titles have been excluded based on your preferences.",7220,92
97,Multiplayer,tag,"7,199 results match your search. 13 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",7199,13
98,Colorful,tag,"7,056 results match your search. 258 titles have been excluded based on your preferences. However, none of these titles would appear on the first page of results.",7056,258


Unnamed: 0,num_missing,dtype
filtered_by,0,object
filter_type,0,object
num_listings,0,object
included,0,int64
excluded,0,int64


CPU times: user 12.7 ms, sys: 656 µs, total: 13.4 ms
Wall time: 10.1 ms


In [10]:
%%time
df_listings_country_os = pd.read_parquet(filtered_listing_counts_country_os_filepath, engine="auto")
show_df(df_listings_country_os, 5)
show_df_dtypes_nans(df_listings_country_os)

Unnamed: 0,Win,MacOS,SteamOS+Linux,country
0,55629,13196,8654,English
1,12159,3184,1881,Simplified Chinese
2,5586,1424,808,Traditional Chinese
3,9386,2656,1589,Japanese
4,5863,1883,1141,Korean
23,1347,425,326,Finnish
24,1750,598,412,Swedish
25,3325,989,662,Turkish
26,897,245,163,Vietnamese
27,1651,559,425,Ukranian


Unnamed: 0,num_missing,dtype
Win,0,int64
MacOS,0,int64
SteamOS+Linux,0,int64
country,0,object


CPU times: user 8.6 ms, sys: 535 µs, total: 9.14 ms
Wall time: 7.13 ms


<a id="exploratory-data-analysis"></a>

## 3. [Exploratory Data Analysis](#exploratory-data-analysis)

<a id="tags"></a>

### 3.1. [Tags](#tags)

[Steam user tags](https://store.steampowered.com/news/app/593110/view/1714119088658959583). This is further explained in this [tutorial](https://www.howtogeek.com/659678/how-to-customize-steam-search-settings/).

There are a large number of user-defined tags available to filter the listings

In [11]:
df_tags["tag"].value_counts().reset_index().rename(
    columns={"tag": "num_counts", "index": "tag"}
)

Unnamed: 0,tag,num_counts
0,Indie,83
1,Adventure,83
2,Action,83
3,Singleplayer,83
4,Casual,81
5,Strategy,76
6,Simulation,74
7,Atmospheric,69
8,2D,64
9,Multiplayer,54


The top 10 most popular tags (by number of listings they are assigned to), after applying each other type of filter, is shown below

In [12]:
df_most_popular_tag_by_filter = (
    df_tags.groupby(by=["filter_type", "filtered_by"])["num_listings"]
    .nlargest(10)
    .reset_index(level=[0, 1])
    .reset_index(drop=True)
)
df_most_popular_tag_by_filter["rank"] = (
    df_most_popular_tag_by_filter.groupby(["filter_type", "filtered_by"])[
        "num_listings"
    ]
    .rank(ascending=False, method="dense", na_option="keep")
    .astype(int)
)
show_df(
    df_most_popular_tag_by_filter.merge(
        df_tags, how="inner", on=["filter_type", "filtered_by", "num_listings"]
    ),
    5,
)

Unnamed: 0,filter_type,filtered_by,num_listings,rank,tag
0,Language,Bulgarian,771,1,Indie
1,Language,Bulgarian,711,2,Casual
2,Language,Bulgarian,598,3,Singleplayer
3,Language,Bulgarian,463,4,Puzzle
4,Language,Bulgarian,451,5,2D
840,number of players,Single-player,13549,6,2D
841,number of players,Single-player,11767,7,Simulation
842,number of players,Single-player,11737,8,Strategy
843,number of players,Single-player,10499,9,RPG
844,number of players,Single-player,9552,10,Atmospheric


Within each filter, the number of times a tag is the most popular one (assigned to the most listings) is shown below

In [13]:
show_df(
    df_most_popular_tag_by_filter.merge(
        df_tags, how="inner", on=["filter_type", "filtered_by", "num_listings"]
    )
    .groupby(["tag", "rank"], as_index=False)["filter_type"]
    .count()
    .rename(columns={"filter_type": "number_of_occurrences"})
    .sort_values(by=["number_of_occurrences"], ascending=False, ignore_index=True)
)

Unnamed: 0,tag,rank,number_of_occurrences
0,Indie,1,50
1,2D,6,31
2,Action,3,30
3,Adventure,5,29
4,Casual,5,25
5,Indie,2,23
6,Singleplayer,3,22
7,Casual,4,22
8,Adventure,4,20
9,Action,2,20


The top 3 tags by operating system are shown below

In [14]:
df_tags_os_top_tags = (
    df_tags.query("filter_type == 'OS'")
    .groupby(["filtered_by"])["num_listings"]
    .nlargest(3)
    .reset_index(level=[0])
    .reset_index(drop=True)
)
df_tags_os_top_tags["rank"] = (
    df_tags_os_top_tags.groupby(["filtered_by"])["num_listings"]
    .rank(ascending=False, method="dense", na_option="keep")
    .astype(int)
)
df_tags_os_top_tags.merge(df_tags, how="inner", on=["filtered_by", "num_listings"])

Unnamed: 0,filtered_by,num_listings,rank,tag,filter_type
0,SteamOS + Linux,6698,1,Indie,OS
1,SteamOS + Linux,4258,2,Singleplayer,OS
2,SteamOS + Linux,3779,3,Adventure,OS
3,Windows,38213,1,Indie,OS
4,Windows,26908,2,Action,OS
5,Windows,26792,3,Singleplayer,OS
6,macOS,9907,1,Indie,OS
7,macOS,6238,2,Singleplayer,OS
8,macOS,5956,3,Adventure,OS


<a id="operating-systems-and-languages"></a>

### 3.2. [Operating Systems and Languages](#operating-systems-and-languages)

The [global PC marketshare by desktop operating systems](https://gs.statcounter.com/os-market-share/desktop/worldwide/#monthly-201312-202109) is shown below

In [15]:
%%time
if not os.path.exists(global_pc_marketshare_filepath):
    r = requests.get(global_pc_marketshare_filepath)
    with open(global_pc_marketshare_filepath, "wb") as file:
        file.write(r.content)

CPU times: user 28 µs, sys: 0 ns, total: 28 µs
Wall time: 56 µs


The 3-month rolling average of this global data is shown below

In [16]:
dateparser = lambda x: datetime.strptime(x, "%Y-%m")
df_pc_marketshare = pd.read_csv(
    global_pc_marketshare_filepath,
    index_col="Date",
    date_parser=dateparser,
    parse_dates=["Date"],
    usecols=["Date", "Windows", "OS X", "Linux", "Chrome OS", "Unknown"],
).rename(columns={"OS X": "MacOS"})
df_pc_marketshare.index = df_pc_marketshare.index.strftime("%Y-%m")
df_pc_marketshare["Linux"] += df_pc_marketshare["Chrome OS"]
df_pc_marketshare = df_pc_marketshare.drop(columns=["Chrome OS"])
df_pc_marketshare = pd.concat(
    [df_pc_marketshare, df_pc_marketshare.rolling(3).mean().add_suffix("_3m")], axis=1
)
show_df(df_pc_marketshare, 10)

Unnamed: 0_level_0,Windows,MacOS,Unknown,Linux,Windows_3m,MacOS_3m,Unknown_3m,Linux_3m
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-12,89.55,7.83,0.08,1.23,,,,
2014-01,88.87,8.35,0.05,1.27,,,,
2014-02,89.65,8.39,0.26,1.3,89.356667,8.19,0.13,1.266667
2014-03,89.61,8.58,0.47,1.34,89.376667,8.44,0.26,1.303333
2014-04,89.18,8.85,0.44,1.53,89.48,8.606667,0.39,1.39
2014-05,88.83,8.85,0.54,1.77,89.206667,8.76,0.483333,1.546667
2014-06,89.27,8.56,0.61,1.56,89.093333,8.753333,0.53,1.62
2014-07,89.25,8.59,0.63,1.53,89.116667,8.666667,0.593333,1.62
2014-08,89.15,8.65,0.61,1.59,89.223333,8.6,0.616667,1.56
2014-09,88.49,9.15,0.72,1.64,88.963333,8.796667,0.653333,1.586667


By comparison, the overall number of game listings by operating system is shown below

In [17]:
df_listings_os = (
    df_listings.query("filter_type == 'OS'")
    .drop(columns=["num_listings", "filter_type", "excluded"])
    .set_index("filtered_by")
    .T
)
df_listings_os = pd.concat(
    [
        df_listings_os,
        df_listings_os.apply(lambda x: x.div(x.sum()).mul(100), axis=1).add_suffix(
            "_pct"
        ),
    ],
    axis=1,
)
show_df(df_listings_os)

filtered_by,Windows,macOS,SteamOS + Linux,Windows_pct,macOS_pct,SteamOS + Linux_pct
included,57608,13376,8692,72.302826,16.787991,10.909182


**Observations**
1. MacBooks are built for efficient and reliable computing, not gaming ([1](https://www.gamingscan.com/mac-good-for-gaming/), [2](https://www.gamedesigning.org/tech/gaming-on-mac/#The-Verdict), [3](https://www.quora.com/How-efficient-is-a-MacBook-Pro-for-gaming)). So, Windows systems hold the majority of the market share over the PC gaming industry. It is not surprising to see this reflected in the games offered on Steam, with Windows support for nearly four times as many games as those supported by a Mac OS.
2. SteamOS is an open-source OS provided by Valve. It was intended to support Steam Machines (pre-built systems). It has specific hardware requirements and does not include a beginner-friendly installation procedure ([1](https://store.steampowered.com/steamos/)). It is also based on Linux. Unfortunately, it hasn't been supported much since [2019](https://happygamer.com/steamos-isnt-dead-just-sidelined-valve-has-plans-to-go-back-to-their-linux-based-os-60428/). The Steam client can also run on other Linux distributions. In general, Linux is far behind Windows and MacOS in [overall PC market share](https://gs.statcounter.com/os-market-share/desktop/worldwide). This too is reflected in the number of listings compatible with a Linux-based OS.

As an approximate comparison against the distribution of game listings by language, the corresponding PC marketshare on a per-country basis is shown below (excluding English-speaking countries, which could cover multiple countries)

In [18]:
%%time
_ = parallelize_processing(get_single_country_pc_marketshare, countries)

Skipped previously downloaded data for china
Skipped previously downloaded data for south-korea
Skipped previously downloaded data for japan
Skipped previously downloaded data for thailand
Skipped previously downloaded data for bulgaria
Skipped previously downloaded data for czech-republic
Skipped previously downloaded data for south-america
Skipped previously downloaded data for france
Skipped previously downloaded data for spain
Skipped previously downloaded data for greece
Skipped previously downloaded data for denmark
Skipped previously downloaded data for italy
Skipped previously downloaded data for hungary
Skipped previously downloaded data for netherlands
Skipped previously downloaded data for norway
Skipped previously downloaded data for germany
Skipped previously downloaded data for poland
Skipped previously downloaded data for romaniaSkipped previously downloaded data for portugal

Skipped previously downloaded data for turkey
Skipped previously downloaded data for finland
Sk

In [19]:
%%time
per_country_marketshare_files = glob(os.path.join(pc_marketshare_dir, "pc_marketshare_*.csv"))
# Specify datetime format for pd.read_csv()
dateparser = lambda x: datetime.strptime(x, "%Y-%m")
df_countrywise_pc_marketshare = pd.concat(
    [
        pd.read_csv(
            f,
            usecols=["Date", "Windows", "OS X", "Linux", "Chrome OS", "Unknown"],
            date_parser=dateparser,
            parse_dates=["Date"]
        ).rename(columns={"OS X": "MacOS"}).assign(
            country=os.path.basename(f).split("_")[-1].replace(".csv", "")
        )
        for f in per_country_marketshare_files
    ],
    ignore_index=True
).sort_values(by=["country", "Date"]).reset_index(drop=True)
# Combine Chrome OS and Linux, since Chrome OS has always been Linux-based
df_countrywise_pc_marketshare["Linux"] += df_countrywise_pc_marketshare["Chrome OS"]
df_countrywise_pc_marketshare = df_countrywise_pc_marketshare.drop(columns=["Chrome OS"])
# Change date format
df_countrywise_pc_marketshare["Date"] = df_countrywise_pc_marketshare["Date"].dt.strftime("%Y-%m")
show_df(df_countrywise_pc_marketshare, 5)
show_df_dtypes_nans(df_countrywise_pc_marketshare)

Unnamed: 0,Date,Windows,Unknown,MacOS,Linux,country
0,2013-12,97.84,0.0,1.75,0.1,bulgaria
1,2014-01,95.56,0.0,4.35,0.08,bulgaria
2,2014-02,95.14,0.28,4.03,0.55,bulgaria
3,2014-03,93.66,0.11,5.46,0.77,bulgaria
4,2014-04,94.01,0.39,5.02,0.58,bulgaria
2251,2021-05,73.67,23.91,1.42,0.99,viet-nam
2252,2021-06,70.58,26.38,2.03,1.0,viet-nam
2253,2021-07,77.06,18.67,3.53,0.74,viet-nam
2254,2021-08,79.49,15.92,3.47,1.12,viet-nam
2255,2021-09,69.74,25.87,3.72,0.67,viet-nam


Unnamed: 0,num_missing,dtype
Date,0,object
Windows,0,float64
Unknown,0,float64
MacOS,0,float64
Linux,0,float64
country,0,object


CPU times: user 72.5 ms, sys: 4.09 ms, total: 76.6 ms
Wall time: 76.6 ms


and (the rolling-average) on a per-country basis

In [20]:
df_countrywise_pc_marketshare = df_countrywise_pc_marketshare.merge(
    df_countrywise_pc_marketshare.groupby("country")
    .apply(get_rolling, period=3)
    .reset_index(),
    how="inner",
    on=["country", "Date"],
).sort_values(by=["country"])
show_df(df_countrywise_pc_marketshare, 4)

Unnamed: 0,Date,Windows,Unknown,MacOS,Linux,country,Windows_3m,Unknown_3m,MacOS_3m,Linux_3m
0,2013-12,97.84,0.0,1.75,0.1,bulgaria,,,,
68,2019-08,81.24,12.59,5.4,0.77,bulgaria,84.293333,9.223333,5.8,0.683333
67,2019-07,83.96,10.31,5.08,0.65,bulgaria,86.273333,6.47,6.533333,0.723333
66,2019-06,87.68,4.77,6.92,0.63,bulgaria,87.02,4.206667,8.01,0.763333
2187,2016-01,80.17,10.85,7.1,1.88,viet-nam,84.563333,6.953333,6.946667,1.54
2186,2015-12,85.93,5.28,7.79,1.0,viet-nam,86.893333,4.4,7.1,1.606667
2195,2016-09,81.61,6.24,11.58,0.57,viet-nam,82.073333,7.933333,7.78,2.213333
2255,2021-09,69.74,25.87,3.72,0.67,viet-nam,75.43,20.153333,3.573333,0.843333


The number of listings by language and operating system is shown below

In [21]:
df_listings_country_os = pd.concat(
    [
        df_listings_country_os,
        df_listings_country_os.iloc[:, :-1]
        .apply(lambda x: x.div(x.sum()).mul(100), axis=1)
        .add_suffix("_pct"),
    ],
    axis=1,
)
show_df(df_listings_country_os)

Unnamed: 0,Win,MacOS,SteamOS+Linux,country,Win_pct,MacOS_pct,SteamOS+Linux_pct
0,55629,13196,8654,English,71.79881,17.031712,11.169478
1,12159,3184,1881,Simplified Chinese,70.593358,18.485834,10.920808
2,5586,1424,808,Traditional Chinese,71.450499,18.214377,10.335124
3,9386,2656,1589,Japanese,68.857751,19.484997,11.657252
4,5863,1883,1141,Korean,65.972769,21.188253,12.838978
5,1390,416,278,Thai,66.698656,19.961612,13.339731
6,1019,307,247,Bulgarian,64.780674,19.516847,15.702479
7,2108,710,500,Czech,63.532248,21.398433,15.069319
8,1432,466,325,Danish,64.417454,20.962663,14.619883
9,13799,4424,2758,German,65.769029,21.085744,13.145227


Verify that there is no month in any country where Linux marketshare was greater than that of MacOS

In [22]:
assert (
    df_countrywise_pc_marketshare.loc[
        df_countrywise_pc_marketshare["Linux"] > df_countrywise_pc_marketshare["MacOS"]
    ]
).empty

**Notes**
1. Since there might be speakers of a language not living in their native country, we can't directly associate the absolute percentages in these two datasets (game listings and PC marketshare) since one is only showing language while the other only shows country.

**Observations**
1. The Windows OS dominates the game listings by language, while MacOS is consistently ahead of the combination of SteamOS and Linux. By PC marketshare, we can generally see that (if the OS is known) Windows is again the dominant OS, with MacOS second, but there are many row where the OS was unknown and this entry comes in second in terms of marketshare for many combinations of country and month.

<a id="number-of-players"></a>

### 3.3. [Number of Players](#number-of-players)

Showing the listings by supported number of players

In [23]:
df_listings.query("filter_type == 'number of players'").sort_values(
    by=["included"], ascending=False
).reset_index(drop=True)

Unnamed: 0,filtered_by,filter_type,num_listings,included,excluded
0,Single-player,number of players,"54,609 results match your search. 1,880 titles...",54609,1880
1,Multi-player,number of players,"12,860 results match your search. 40 titles ha...",12860,40
2,PvP,number of players,"8,128 results match your search. 18 titles hav...",8128,18
3,Co-op,number of players,"6,417 results match your search. 26 titles hav...",6417,26
4,Online PvP,number of players,"5,793 results match your search. 15 titles hav...",5793,15
5,Shared/Split Screen,number of players,"4,669 results match your search. 17 titles hav...",4669,17
6,Online Co-op,number of players,"3,531 results match your search. 13 titles hav...",3531,13
7,Shared/Split Screen PvP,number of players,"3,449 results match your search. 8 titles have...",3449,8
8,Shared/Split Screen Co-op,number of players,"2,721 results match your search. 15 titles hav...",2721,15
9,Cross-Platform Multiplayer,number of players,"1,904 results match your search. 4 titles have...",1904,4


**Observations**
1. Single- and multi-player games dominate the listings on the Steam platform store, but the majority of games are single-player games. [Co-Op games](https://www.reviewgeek.com/38827/play-couch-co-op-games-online-through-steam-remote-play/) are a sub-category of [multi-player games, but multi-player games don't have to be Co-Op games](https://gaming.stackexchange.com/a/123758/280783). There are different flavors of Co-Op games on Steam.
2. There are a very small number of multi-player games that support a different platform for each player.
3. Local multi-player games can be played over the internet ([1](https://www.theverge.com/2019/10/10/20907701/steam-remote-play-together-local-multiplayer-internet-beta-valve)).
4. Peer-vs-peer multi-player games are where one player must be competing against, and not teaming up with, another ([1](https://en.wikipedia.org/wiki/Player_versus_player)). Again, these are a sub-category of multi-player games.

Multi-player games are a key part of the Steam platform, so on the Steam search, we won't filter out games by the number of players.

<a id="feature"></a>

### 3.4. [Feature](#feature)

In [24]:
df_listings.query("filter_type == 'feature'").sort_values(
    by=["included"], ascending=False
).reset_index(drop=True)

Unnamed: 0,filtered_by,filter_type,num_listings,included,excluded
0,Steam Achievements,feature,"27,344 results match your search. 956 titles h...",27344,956
1,Steam Cloud,feature,"13,822 results match your search. 326 titles h...",13822,326
2,Full controller support,feature,"13,099 results match your search. 97 titles ha...",13099,97
3,Steam Trading Cards,feature,"8,927 results match your search. 133 titles ha...",8927,133
4,Partial Controller Support,feature,"8,362 results match your search. 151 titles ha...",8362,151
5,Remote Play Together,feature,"5,225 results match your search. 20 titles hav...",5225,20
6,Played with Steam Controller,feature,"2,436 results match your search. 15 titles hav...",2436,15
7,Remote Play on TV,feature,"1,964 results match your search. 12 titles hav...",1964,12
8,Steam Workshop,feature,"1,555 results match your search. 12 titles hav...",1555,12
9,Captions available,feature,"1,301 results match your search. 21 titles hav...",1301,21


Unlike filtering by tag, there isn't a way to filter out games by feature. So, we'll keep all features in the search results to be scraped.

<a id="tag"></a>

### 3.5. [Tag](#tag)

In [25]:
df_listings.query("filter_type == 'tag'").sort_values(
    by=["included"], ascending=False
).reset_index(drop=True)

Unnamed: 0,filtered_by,filter_type,num_listings,included,excluded
0,Indie,tag,"38,220 results match your search. 1,137 titles...",38220,1137
1,Action,tag,"26,922 results match your search. 391 titles h...",26922,391
2,Singleplayer,tag,"26,805 results match your search. 1,119 titles...",26805,1119
3,Adventure,tag,"24,870 results match your search. 698 titles h...",24870,698
4,Casual,tag,"23,948 results match your search. 1,286 titles...",23948,1286
5,2D,tag,"13,950 results match your search. 680 titles h...",13950,680
6,Strategy,tag,"12,522 results match your search. 217 titles h...",12522,217
7,Simulation,tag,"12,210 results match your search. 590 titles h...",12210,590
8,RPG,tag,"11,117 results match your search. 500 titles h...",11117,500
9,Atmospheric,tag,"9,786 results match your search. 177 titles ha...",9786,177


Helpful links related to tags
1. All about user-defined tags on the Steam platform ([1](https://www.makeuseof.com/tag/new-steam-search-find-games/), [2](https://store.steampowered.com/news/app/593110/view/1714119088658959583), [3](https://store.steampowered.com/tag/))
2. Benefits of filtering Steam listings by a user-defined tag ([1](https://www.theverge.com/2020/2/25/21153257/steam-labs-new-search-features-tag-filter-price-vr))

**Observations**
1. There is a wide range of tags assigned by users to games on the Steam platform. Among the top 16 most popular tags, there are tags assigned to less than 10,000 games and there also tags assigned to more than 20,000 games.
2. Some tags are the same as the name of a *Genre* (like Adventure & Casual) but other tags are more granular and cover a sub-genre (like *Casual*, *Sports Sim*).
3. Tags help filter the large (and growing) game listings library so that users don't have to sift through games they aren't interested in. This could also help eliminate games that are are not very relevant, possibly if they aren't assigned any tags at all.
4. From the output displayed above, the bottom five tags (row indexes from 11 to 15 inclusive) are those that are assigned to fewer than 10,000 games.

<a id="exploring-listing-filters"></a>

## 4. [Exploring listing Filters](#exploring-listing-filters)

<a id="prices"></a>

### 4.1. [Prices](#prices)

**Steam's business model is to take a commission from all games sold on its platform. Steam collects the commission from game developers who sell their games on the Steam store. But, some games are offered for free. Explore the price of the games**

**What are the five most popular user-defined tags of the free games?**

In [26]:
df_tags.query("filtered_by == '0'")

Unnamed: 0,tag,num_listings,filtered_by,filter_type
1312,Indie,3984,0,Price
1313,Free to Play,3681,0,Price
1314,Action,2931,0,Price
1315,Casual,2734,0,Price
1316,Adventure,2411,0,Price
1317,Singleplayer,2382,0,Price
1318,Multiplayer,1562,0,Price
1319,Strategy,1432,0,Price
1320,RPG,1393,0,Price
1321,2D,1291,0,Price


The five most popular user-defined tags of the free games are shown below

In [27]:
df_top_tags = df_tags.query(
    "filtered_by == '0' and tag not in ['Free to Play']"
).nlargest(5, "num_listings")
show_df(df_top_tags)

Unnamed: 0,tag,num_listings,filtered_by,filter_type
1312,Indie,3984,0,Price
1314,Action,2931,0,Price
1315,Casual,2734,0,Price
1316,Adventure,2411,0,Price
1317,Singleplayer,2382,0,Price


The five most assigned tags for all other filters (i.e. excluding the `Price` filter) are shown below

In [28]:
(
    df_tags.query("filter_type not in ['Price']")
    .groupby(["filter_type"])["num_listings"]
    .nlargest(5)
    .reset_index(level=[0])
    .reset_index(drop=True)
)

Unnamed: 0,filter_type,num_listings
0,Language,37001
1,Language,26502
2,Language,25958
3,Language,23866
4,Language,23107
5,OS,38213
6,OS,26908
7,OS,26792
8,OS,24861
9,OS,23941


All tags for *English* listings when filtering by every other filter (excluding *Price*) are shown below. First the unique tags within a single filter (Operating System) are shown below

In [29]:
df_tags.query("filter_type == 'OS'")

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Indie,38213,Windows,OS
1,Action,26908,Windows,OS
2,Singleplayer,26792,Windows,OS
3,Adventure,24861,Windows,OS
4,Casual,23941,Windows,OS
5,2D,13942,Windows,OS
6,Strategy,12517,Windows,OS
7,Simulation,12206,Windows,OS
8,RPG,11113,Windows,OS
9,Atmospheric,9782,Windows,OS


**Observations**
1. As we can see, there are always 16 tags displayed. However, these don't have to be the same 16 tags for every filter that is applied.

The number of listings and number of unique tags for each filter are shown below

In [30]:
df_top_five = (
    df_tags.query("filter_type not in ['Price']")
    .groupby(["filter_type"])
    .apply(summarize_tags_per_group)
    .reset_index(level=[0])
)
df_top_five[[f"tag_{t}" for t in range(1, 5 + 1)]] = df_top_five["top_five"].str.split(
    ",", expand=True
)
df_top_five = (
    df_top_five.drop(columns=["top_five"])
    .set_index(["filter_type", "num_sub_filters", "num_tags_per_filter"])
    .stack()
    .reset_index()
    .rename(columns={0: "num_listings"})
    .drop(columns=["level_3"])
    .astype({"num_listings": int})
)
show_df(df_top_five)

Unnamed: 0,filter_type,num_sub_filters,num_tags_per_filter,num_listings
0,Language,28,27,37001
1,Language,28,27,26502
2,Language,28,27,25958
3,Language,28,27,23866
4,Language,28,27,23107
5,OS,3,18,38213
6,OS,3,18,26908
7,OS,3,18,26792
8,OS,3,18,24861
9,OS,3,18,23941


**Notes**
1. The `num_tags_per_filter` column refers to the number of tags within each grouping of `filter_type`.

The above groupby is re-displayed, but now with the tag name and the number of unique tags included

In [31]:
show_df(
    df_top_five.merge(df_tags, on=["filter_type", "num_listings"])[
        [
            "filter_type",
            "num_sub_filters",
            "filtered_by",
            "tag",
            "num_listings",
            "num_tags_per_filter",
        ]
    ]
)

Unnamed: 0,filter_type,num_sub_filters,filtered_by,tag,num_listings,num_tags_per_filter
0,Language,28,English,Indie,37001,27
1,Language,28,English,Action,26502,27
2,Language,28,English,Singleplayer,25958,27
3,Language,28,English,Adventure,23866,27
4,Language,28,English,Casual,23107,27
5,OS,3,Windows,Indie,38213,18
6,OS,3,Windows,Action,26908,18
7,OS,3,Windows,Singleplayer,26792,18
8,OS,3,Windows,Adventure,24861,18
9,OS,3,Windows,Casual,23941,18


**Observations**
1. The most assigned tags after using the other filters are also among the most assigned ones for free listings. However, for all the other filtered search results (i.e. the non-free ones), the subject of the filter also appears as one of the top five tags when listings are filtered by
   - `number of players` (*Singleplayer*)
   - `VR Support` (*VR*)

   A list of at most the top 16 tags can be used to filter listings from the search results page. If attempting to filter listings by the top five most popular tags, it may not be particularly interesting if these tags are the same for when listings are filtered by *number of players* or by *operating system* because these tags are too general. For users looking to filter listings using niche-tags, that would not be inside the top five most common ones or even in the top 16 displayed tags, they will need to manually start entering the tag name in the *Narrow by Tag* search box and rely on the auto-completion feature to help isolate that tag. This is also true for the *VR support* section where the *VR* tag is dominant for every sub-filter in the *Narrow by VR Support* filter section but is too generic to be useful.

   The benefit of allowing users to add custom tags is to have to the ability to filter listings based on tags defined by other gamers. A disadvantage is that, if the most common tags are too widespread across the different types of filters that can be applied (from the search results page) then a new user will need to manually search for the tag they're interested in.

**For each available price range (starting from zero USD, or free listings) that can be used to filter listings on the websore, what are the two most commonly assigned user-defined tags?**

The two most common tags in each price range are below

In [32]:
df_tags.query("filter_type == 'Price'").groupby(["filter_type", "filtered_by"])[
    "num_listings"
].nlargest(2).reset_index(level=[0, 1]).merge(
    df_tags.query("filter_type == 'Price'"),
    on=["filter_type", "filtered_by", "num_listings"],
).rename(
    columns={"filtered_by": "Max Price"}
).drop(
    columns=["filter_type"]
)

Unnamed: 0,Max Price,num_listings,tag
0,0,3984,Indie
1,0,3681,Free to Play
2,12,23537,Indie
3,12,15912,Casual
4,18,26779,Indie
5,18,17439,Casual
6,24,28597,Indie
7,24,18268,Action
8,30,29032,Indie
9,30,18734,Action


The most common tag in each price range is below

In [33]:
df_tags.query("filter_type == 'Price'").groupby(["filter_type", "filtered_by"])[
    "num_listings"
].nlargest(1).reset_index(level=[0, 1]).merge(
    df_tags.query("filter_type == 'Price'"),
    on=["filter_type", "filtered_by", "num_listings"],
).rename(
    columns={"filtered_by": "Max Price"}
).drop(
    columns=["filter_type"]
)

Unnamed: 0,Max Price,num_listings,tag
0,0,3984,Indie
1,12,23537,Indie
2,18,26779,Indie
3,24,28597,Indie
4,30,29032,Indie
5,36,29273,Indie
6,42,29346,Indie
7,48,29414,Indie
8,54,29427,Indie
9,6,15934,Indie


**Observations**
1. Again, the most assigned tags overall are also among the most assigned ones when listings are filtered.
2. For free games, the tag *Free to Play* appears, which isn't particularly insightful.
3. The number of games for which the tag is assigned drops significantly for Free games or games sold for 6 USD or less. It is possible that for these two price ranges, users are less interested in the tags assigned by other users or in the tags they could assign themselves and more interested in the fact that the games are the among the cheapest on the platform.

<a id="tags"></a>

### 4.2. [Tags](#tags)

**As mentioned above, Steam users are allowed to post tags for games. A valid tag can be a theme, genre, gaming attribute or any other phrase (no profanity is allowed). As the Steam store catalogue grows over time, this tagging system makes it easy for customers to find a particular type(s) of game they are looking for. It is possible that users will simply assign tags that match the Genre or sub-genre of the game. In this case, tags are not heplful to new customers to the platform since they don't have the ability to efficiently filter listings based on these tags.**

**What are the 25 most-common user-assigned tags on the Steam store?**

All tags and the number of times they are assigned to a listing are shown below

In [34]:
show_df(
    df_tags[["tag", "num_listings"]]
    .sort_values(by="num_listings", ascending=False)
    .reset_index(drop=True),
    10,
)

Unnamed: 0,tag,num_listings
0,Indie,38213
1,Indie,37001
2,Indie,36470
3,Indie,29452
4,Indie,29440
5,Indie,29438
6,Indie,29427
7,Indie,29414
8,Indie,29346
9,Indie,29273


The maximum number of times a given tag is assigned is shown below

In [35]:
show_df(
    df_tags.groupby("tag", as_index=False)["num_listings"]
    .max()
    .sort_values(by="num_listings", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,tag,num_listings
0,Indie,38213
1,Action,26908
2,Singleplayer,26792
3,Adventure,24861
4,Casual,23941
5,2D,13942
6,Strategy,12517
7,Simulation,12206
8,RPG,11113
9,Atmospheric,9782


**Notes**
1. Repeat occurrences of a tag have not been show here. If a tag is repeated (which is possible after applying different types of filters) then only the occurrence of this tag with the maximum number of listings that it was was assigned to has been taken.

Finally, the top 25 most common tags are shown below

In [36]:
show_df(
    df_tags.groupby("tag", as_index=False)["num_listings"]
    .max()
    .nlargest(25, "num_listings")
    .reset_index(drop=True)
)

Unnamed: 0,tag,num_listings
0,Indie,38213
1,Action,26908
2,Singleplayer,26792
3,Adventure,24861
4,Casual,23941
5,2D,13942
6,Strategy,12517
7,Simulation,12206
8,RPG,11113
9,Atmospheric,9782


**Excluding Genres, what are the most popular tags assigned to games by users? Show the listings with such tags.**

A list of game genres is available in `list_of_genres` and its contents are shown below

In [37]:
print(list_of_genres)

['Action', 'Role-Playing', 'Strategy', 'Adventure & Casual', 'Simulation', 'Sports & Racing']


A list of tags that do not overlap with listing Genres is shown below

In [38]:
non_genre_tags = list(set(df_tags["tag"].unique().tolist()) - set(list_of_genres))

This list is used to filter listing tags below (and the top and bottom 10 most occurrences are shown)

In [39]:
show_df(
    df_tags.query("tag in @non_genre_tags")
    .reset_index(drop=True)
    .sort_values(by="num_listings", ascending=False)
    .reset_index(drop=True),
    10,
)

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Indie,38213,Windows,OS
1,Indie,37001,English,Language
2,Indie,36470,Single-player,number of players
3,Indie,29452,72,Price
4,Indie,29440,66,Price
5,Indie,29438,60,Price
6,Indie,29427,54,Price
7,Indie,29414,48,Price
8,Indie,29346,42,Price
9,Indie,29273,36,Price


In [40]:
print(
    f"There are {df_tags['tag'].nunique()} unique user-defined tags that are displayed. "
    f"{len(non_genre_tags)} of them are not also Genres."
)

There are 55 unique user-defined tags that are displayed. 52 of them are not also Genres.


**How many of the top 25 most-assigned tags are not Genres or sub-genres?**

In addition to the list game genres, a similar list of sub-genres is available in `list_of_sub_genres` and its contents are shown below

In [41]:
print(list_of_sub_genres)

['Action Rogue-Like', 'Arcade & Rhythm', "Beat'Em Up", 'Fighting & Martial Arts', 'First-Person Shooter', 'Platformer & Runner', 'Third-Person Shooter', 'Adventure', 'Adventure RPG', 'Casual', 'Metroidvania', 'Puzzle', 'Story-Rich', 'Visual Novel', 'Action RPG', 'Adventure RPG', 'JRPG', 'Party-Based', 'Rogue-Like', 'Strategy RPG', 'Turn-Based', 'Building & Automation', 'Business & Tycoon', 'Dating', 'Farming & Crafting', 'Life & Immersive', 'Sandbox & Physics', 'Space & Flight', 'Card & Board', 'City & Settlement', 'Grand & 4X', 'Military', 'Real-Time Strategy', 'Tower Defense', 'Turn-Bsed Strategy', 'All Sports', 'Fishing & Hunting', 'Individual Sports', 'Racing', 'Racing Sim', 'Sports Sim', 'Team Sports']


Similar to the query required to exclude tags that matched the name of a Genre, the query to get the tags that don't overlap with the combined list of genres and sub-genres is below

In [42]:
non_genre_subgenre_tags = list(
    set(df_tags["tag"].unique().tolist()) - set(list_of_genres + list_of_sub_genres)
)
show_df(
    df_tags[
        (df_tags["tag"].isin(non_genre_subgenre_tags))
        & (~df_tags["tag"].str.contains("RPG"))
    ]
    .reset_index(drop=True)
    .sort_values(by="num_listings", ascending=False)
    .reset_index(drop=True),
    10,
)

Unnamed: 0,tag,num_listings,filtered_by,filter_type
0,Indie,38213,Windows,OS
1,Indie,37001,English,Language
2,Indie,36470,Single-player,number of players
3,Indie,29452,72,Price
4,Indie,29440,66,Price
5,Indie,29438,60,Price
6,Indie,29427,54,Price
7,Indie,29414,48,Price
8,Indie,29346,42,Price
9,Indie,29273,36,Price


**Notes**
1. One of the sub-genres contains an abbreviation and a modification to the filters is required to separately exclud this from the list of tags.

In [43]:
print(
    f"There are {df_tags['tag'].nunique()} unique user-defined tags that are displayed. "
    f"{len(non_genre_subgenre_tags)} of them are not also Genres or Sub-Genres."
)

There are 55 unique user-defined tags that are displayed. 48 of them are not also Genres or Sub-Genres.


**Notes**
1. The total number of unique user-defined tags refers to a unique set from the top 16 most popular tags that are displayed at a time. Depending on the filter applied, a different set of 16 tags will be displayed. The total number of tags counted above refers to all unique tags from the combined list of tags that ever get displayed by selecting various filters individually.

   There are many more tags that fall outside the top-16 most popular ones and are not displayed. These have not been included in this count.

**Observations**
1. It is re-assuring that there is not much overlap between the tags and Genres and Sub-Genres. this makes filtering by tags an alternative to filtering from the *Categories* dropdown on the Steam store page.

<a id="filters"></a>

### 4.3. [Filters](#filters)

**What fraction of games are listed as single-player or multi-player?**

As shown below, *Singleplayer* and *Multiplayer* appear as tags.

In [44]:
df_listings[df_listings["filtered_by"].str.contains("Single|Multi")]

Unnamed: 0,filtered_by,filter_type,num_listings,included,excluded
31,Single-player,number of players,"54,609 results match your search. 1,880 titles...",54609,1880
32,Multi-player,number of players,"12,860 results match your search. 40 titles ha...",12860,40
42,Cross-Platform Multiplayer,number of players,"1,904 results match your search. 4 titles have...",1904,4
85,Singleplayer,tag,"26,805 results match your search. 1,119 titles...",26805,1119
97,Multiplayer,tag,"7,199 results match your search. 13 titles hav...",7199,13


We don't want to count listings based on these filters since there could be overlap between the number of listings. Instead, we'll focus only on the the *Narrow by VR Support* section

In [45]:
df_single_multi_player = df_listings[
    (df_listings["filtered_by"].str.contains("Single|Multi"))
    & (df_listings["filter_type"] != "tag")
][["filtered_by", "included", "excluded"]].reset_index(drop=True)
for c in ["included", "excluded"]:
    df_single_multi_player[f"{c}_pct"] = (
        df_single_multi_player[c] / df_single_multi_player[c].sum()
    ) * 100
show_df(df_single_multi_player)

Unnamed: 0,filtered_by,included,excluded,included_pct,excluded_pct
0,Single-player,54609,1880,78.717945,97.713098
1,Multi-player,12860,40,18.537471,2.079002
2,Cross-Platform Multiplayer,1904,4,2.744584,0.2079


**Observations**
1. The majority of listings are single-player games. Approximately 20 percent of all listings offer multi-player support.
2. There are games that can support single-player and multi-player modes, however the scraped data was filtered by one value in the *Narrow by number of players* section at a time and so does not support counting listings that support multiple numbers of players.

**What fraction of games requires a virtual reality headset?**

Again, we'll exclude listings filtered by tag that might contain a a term related to a VR headset

In [46]:
df_single_multi_player = df_listings[
    (df_listings["filtered_by"].str.contains("Valve Index|HTC|Oculus|Reality"))
    & (df_listings["filter_type"] != "tag")
][["filtered_by", "included", "excluded"]].reset_index(drop=True)
for c in ["included", "excluded"]:
    df_single_multi_player[f"{c}_pct"] = (
        df_single_multi_player[c] / df_single_multi_player[c].sum()
    ) * 100
show_df(df_single_multi_player)

Unnamed: 0,filtered_by,included,excluded,included_pct,excluded_pct
0,Valve Index,4295,85,30.803988,31.135531
1,HTC Vive,4635,86,33.242487,31.501832
2,Oculus Rift,3598,83,25.805063,30.40293
3,Windows Mixed Reality,1415,19,10.148462,6.959707


**Observations**
1. There are four different VR headsets available to Steam users. [Valve Index](https://en.wikipedia.org/wiki/Valve_Index) (developed by Valve) and [HTC's offering](https://en.wikipedia.org/wiki/HTC_Vive) are supported by the most listings on the platform. The Oculus Rift headset is discontinued. Microsoft's [VR headset](https://en.wikipedia.org/wiki/Windows_Mixed_Reality) supports the least number of offerings on the platform.

<a id="conclusion"></a>

## 5. [Conclusion](#conclusion)

The objective of this notebook is to explore the aggregated data scraped from the Steam webstore in order to determine if filters can be applied in order to reduce the number of listings to be scraped.

Some options for filters that may be considered are
- filter by VR headset
  - the listings that support the discontinued Oculus headset could be excluded, but these take up nearly a third of the VR-compatible listings
- filter by number of players
  - there are a minor number of listings that support cross-platform multi-player games, so these could be excluded
  - unfortunately, these are a subset of the overall multi-player group of listings; excluding them would mean that we don't get the full picture if we want to compare single- to multi-player games
- filtering by price, genre or tag is not a good option since we'll definitely want to explore the listing by each of those filters

With the exception of language (English games only), it seems there are no clear filters that can be applied to reduce the size of the data to be scraped. Note that many games that are supported in English also enjoy multi-language support, so only including English games will include some of the listings in other languages that Steam supports. It is also worth pointing out that the a listing offered in a particular language is not the same as a listing offered in the home country where that language is spoken since those listings might be played speakers of that language but who reside in other countries. Also, filtering out listings means we won't be able to explore the data based on language. Given that the majority of listings are offered in English, this should not compromise the data qualitatively when it comes to exploring other attributes such as Genre, Tag, Platform, etc.

So, we'll proceed to only filtering all games on the Steam store by language (selecting English games only) and then scrape these listings.

---

<span style="float:left">
    <a href="./0_get_agg_data.ipynb"><< 0 - Scrape Aggregated data from Steam webstore</a>
</span>

<span style="float:right">
    <a href="./2_selenium.ipynb">2 - Navigating webstore with Selenium webdriver >></a>
</span>