# Scraping search results with the `selenium` library

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import math
import os
import re
import time
from collections import OrderedDict
from glob import glob
from random import choice, randint, sample, shuffle, uniform

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

In [3]:
%aimport src.bs4_helpers
import src.bs4_helpers as bsh

%aimport src.failure_records
from src.failure_records import dict_failed_extraction_from_listing_page

%aimport src.page_helpers
from src.page_helpers import check_movement, get_pages

%aimport src.page_scrapers
from src.page_scrapers import scrape_listing, scrape_single_page_search_results

%aimport src.selenium_helpers
from src.selenium_helpers import (
    enter_age,
    scroll_up_down_page,
    smooth_scroll_until_element_in_view,
    sort_search_results,
)

%aimport src.single_page_navigation_helpers
from src.single_page_navigation_helpers import (
    perform_random_navigation_on_page,
    randomly_interact_with_tag_based_filters,
    randomly_interact_with_feature_based_filters,
)

%aimport src.utils
from src.utils import save_to_parquet_file, show_df, show_df_dtypes_nans

%aimport src.webscraping_utils
from src.webscraping_utils import get_random_user_agent

In [4]:
options = Options()
# options.add_argument("--headless")  # Runs Chrome in headless mode.
options.add_argument("--window-size=1920,1080")
options.add_argument("--no-sandbox")  # Bypass OS security model
options.add_argument("--disable-gpu")  # applicable to windows os only
options.add_argument("start-maximized")  #
options.add_argument("disable-infobars")
options.add_argument("--incognito")
options.add_argument("--disable-extensions")
options.add_argument("--proxy-server='direct://'")
options.add_argument("--proxy-bypass-list=*")
options.add_experimental_option(
    "prefs",
    {"profile.default_content_setting_values.notifications": 2},
)
options.add_argument(f"user-agent={get_random_user_agent()}")
options.arguments

Selected user-agent from: chrome-incognito


['--window-size=1920,1080',
 '--no-sandbox',
 '--disable-gpu',
 'start-maximized',
 'disable-infobars',
 '--incognito',
 '--disable-extensions',
 "--proxy-server='direct://'",
 '--proxy-bypass-list=*',
 'user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36']

<a href="table-of-contents"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Launch Browser](#launch-browser)
3. [Load (Blank) Search Results Page](#load-(blank)-search-results-page)
4. [Specify Language](#specify-language)
5. [Navigation Around the (Blank) Search Listings Page](#navigation-around-the-(blank)-search-listings-page)
   - 5.1. [Randomly Navigate around a Page](#randomly-navigate-around-a-page)
   - 5.2. [Randomly Select and Un-Select Tags](#randomly-select-and-un-select-tags)
   - 5.3. [Randomly Select and Un-Select Features Filters](#randomly-select-and-un-select-features-filters)
6. [Sort Listings](#sort-listings)
7. [Navigate to starting page](#navigate-to-starting-page)
8. [Scrape](#scrape)
9. [Examine collected outputs](#examine-collected-outputs)
10. [(Optional) Convert single-page CSV files to `parquet` file](#convert-single-page-csv-files-to-`parquet`-file)
11. [Close Browser](#close-browser)

<a id="about"></a>

## 0. [About](#about)

Scraping game listings from the Steam web store using Selenium webdriver.

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

In [5]:
PROJ_ROOT_DIR = os.getcwd()

In [6]:
url = "https://store.steampowered.com/search/?category1=998&page=1"
page_numbers_to_scrape = [50]

# Specify all possible random movements across page, a subset
# of which will be performed before and after scraping
all_possible_pre_scrape_movements = [
    "navigate",
    "filter_by_tag",
    "filter_by_feature",
]
all_possible_post_scrape_movements = [
    "click_store_homepage_logo",
    "hover_over_browse_steam_section",
    "scroll_until_store_homepage_logo_viewable",
]

In [7]:
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir = os.path.join(data_dir, "raw")
selenium_data_dir = os.path.join(raw_data_dir, "selenium")

webdriver_path = os.path.join(
    os.path.expanduser("~"), "chromedriver_linux64", "chromedriver"
)

page_to_start_scraping = page_numbers_to_scrape[0]

# Randomly specify pre-scraping actions to be performed
pre_scrape_movements = sample(all_possible_pre_scrape_movements, 2)

# Randomly specify post-scraping actions to be performed
post_scrape_movements = sample(all_possible_post_scrape_movements, 2)

<a id="launch-browser"></a>

## 2. [Launch Browser](#launch-browser)

In [8]:
driver = webdriver.Chrome(executable_path=webdriver_path, options=options)
driver.delete_all_cookies()

<a id="load-(blank)-search-results-page"></a>

## 3. [Load (Blank) Search Results Page](#load-(blank)-search-results-page)

In [9]:
driver.get(url)

<a id="specify-language"></a>

## 4. [Specify Language](#specify-language)

Smoothly scroll to bottom of page, with pauses along the way

In [10]:
%%time
scroll_up_down_page(
    driver,
    by_how_much=22,
    min_num_pauses=1,
    max_num_pauses=3,
    min_pause=0.1,
    max_pause=2.4,
    scroll_method="slow",
    scroll_direction="down",
)
time.sleep(uniform(4.2,8.8))

CPU times: user 77.2 ms, sys: 8.18 ms, total: 85.4 ms
Wall time: 12.1 s


Select language

In [11]:
language_selection = driver.find_element_by_xpath('.//div[@data-loc="English"]')
language_selection.click()
time.sleep(uniform(5.2, 7.8))

Smoothly scroll up until the *Install Steam* button is in view

In [12]:
%%time
install_steam_btn = driver.find_element_by_xpath(
    './/a[@class="header_installsteam_btn_content"]'
)
driver = smooth_scroll_until_element_in_view(driver, install_steam_btn)
time.sleep(uniform(4.8,6.6))

CPU times: user 1.24 ms, sys: 1.04 ms, total: 2.29 ms
Wall time: 5.81 s


<a id="navigation-around-the-(blank)-search-listings-page"></a>

## 5. [Navigation Around the (Blank) Search Listings Page](#navigation-around-the-(blank)-search-listings-page)

<a id="randomly-navigate-around-a-page"></a>

### 5.1. [Randomly Navigate around a Page](#randomly-navigate-around-a-page)

(OPTIONAL Pre-Scrape action) Option 1/3 - Smoothly scroll down to bring the *Categories* menu into view

In [13]:
if "navigate" in pre_scrape_movements:
    categories_flyout = driver.find_element_by_xpath(
        './/div[@data-flyout="genre_flyout"]'
    )
    driver = smooth_scroll_until_element_in_view(driver, categories_flyout)
    time.sleep(uniform(3.2, 6.5))

(OPTIONAL Pre-Scrape action) Option 1/3 - Randomly navigate across the search results page

In [14]:
%%time
if "navigate" in pre_scrape_movements:
    driver = perform_random_navigation_on_page(driver, randint(2, 5), randint(5, 10))
    time.sleep(uniform(3.0,6.5))

Moved the mouse cursor over the categories fly-out
Retrieved raw categories and sub-categories, including blanks
Retrieved categories and sub-categories with links
Performed 4 hovers on page
Hovered over the Install Steam button
CPU times: user 23.9 ms, sys: 3.15 ms, total: 27.1 ms
Wall time: 12.5 s


<a id="randomly-select-and-un-select-tags"></a>

### 5.2. [Randomly Select and Un-Select Tags](#randomly-select-and-un-select-tags)

(OPTIONAL Pre-Scrape action) Option 2/3 - Randomly select one or multiple tags as filters and then de-select them

In [15]:
%%time
if "filter_by_tag" in pre_scrape_movements:
    driver = randomly_interact_with_tag_based_filters(driver, randint(1, 3), randint(3, 5))
    time.sleep(uniform(2.2,4.5))

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 4.53 µs


<a id="randomly-select-and-un-select-features-filters"></a>

### 5.3. [Randomly Select and Un-Select Features Filters](#randomly-select-and-un-select-features-filters)

(OPTIONAL Pre-Scrape action) Option 3/3 - Smoothly scroll to bring *Filter by Feature* into view

In [16]:
if "filter_by_feature" in pre_scrape_movements:
    feat_header = driver.find_element_by_xpath(
        './/div[@data-collapse-name="category2"]/div'
    )
    driver = smooth_scroll_until_element_in_view(driver, feat_header)
    time.sleep(uniform(1.8, 5.1))

(OPTIONAL Pre-Scrape action) Option 3/3 - Randomly select one or multiple feature filters and then de-select them

In [17]:
%%time
if "filter_by_feature" in pre_scrape_movements:
    driver = randomly_interact_with_feature_based_filters(driver, randint(2, 4), randint(4, 5))
    time.sleep(uniform(3.0,4.2))

Selected and Un-selected 4 features
CPU times: user 35.8 ms, sys: 1.39 ms, total: 37.2 ms
Wall time: 20.3 s


<a id="sort-listings"></a>

## 6. [Sort Listings](#sort-listings)

Smoothly scroll until the *Install Steam* button is in view

In [18]:
%%time
install_steam_btn = driver.find_element_by_xpath(
    './/a[@class="header_installsteam_btn_content"]'
)
driver = smooth_scroll_until_element_in_view(driver, install_steam_btn)
time.sleep(uniform(3.2,6.5))

CPU times: user 927 µs, sys: 1.69 ms, total: 2.62 ms
Wall time: 4.98 s


Scroll through sort search results

In [19]:
%%time
driver = sort_search_results(driver)
time.sleep(uniform(2.8,4.7))

Scrolled to sort option: User Reviews
Scrolled to sort option: Name
Scrolled to sort option: Lowest Price
Scrolled to sort option: Relevance
Scrolled to sort option: Highest Price
Scrolled to sort option: Release date
CPU times: user 29.2 ms, sys: 639 µs, total: 29.8 ms
Wall time: 17.5 s


**Notes**
1. Two available options are to sort search results by Relevance or in descending order of release date. Sorting by *Release date* gives the newest listings but these rarely have user reviews. Sorting by *Relevance* gives more games with user reviews but are possibly older.

**Observations**
1. Selecting a sort option clears the page-view display (the suffix *&page=`<page-number>`* is removed) that was specified from the main url loaded browser earlier (step 3.). This results in an infinite scroll through up to tens of thousands of search results. By comparison, the page view display shows upto 25 listings per page with the option to navigate between pages. In order to preserve the page-based display of the search results, when scrolling through the search result sort options, we will not click on one of the sort options. Instead, we will just randomly scroll through each sort option. As a result, the default sort option (*Relevance*) will be used to order the search results that are scraped.

Smoothly scroll down to bring pagination into view

In [20]:
%%time
element = driver.find_element_by_class_name("search_pagination_left")
driver = smooth_scroll_until_element_in_view(driver, element)
time.sleep(uniform(1.8,5.1))

CPU times: user 2.54 ms, sys: 0 ns, total: 2.54 ms
Wall time: 5.04 s


Click on page 2, since an inconsistent number of search results is displayed on page 1

In [21]:
page_selections = driver.find_elements_by_xpath(
    './/div[@class="search_pagination_right"]/a'
)
page_selections[0].click()
time.sleep(uniform(3.2, 6.9))

<a id="navigate-to-starting-page"></a>

## 7. [Navigate to starting page](#navigate-to-starting-page)

Determine current page number and available pages

In [22]:
# Get available page numbers
pagination = driver.find_element_by_xpath('.//div[@class="search_pagination"]')
current_page_num, _, _ = get_pages(pagination, True)
time.sleep(uniform(5.4, 8.1))

Available page numbers=1,2,3,4, Current page=2, Max page=4


Smoothly scroll down to bring page selection into view

In [23]:
%%time
element = driver.find_element_by_class_name("search_pagination_left")
driver = smooth_scroll_until_element_in_view(driver, element)
time.sleep(uniform(2.4, 6.1))

CPU times: user 1.41 ms, sys: 970 µs, total: 2.38 ms
Wall time: 5.62 s


Extract last available page

In [24]:
right_pagination = driver.find_element_by_xpath(
    './/div[@class="search_pagination_right"]'
)
last_page = int(right_pagination.find_elements_by_tag_name("a")[-2].text)

Move forward or backward to get to desired starting page

In [25]:
%%time
if page_to_start_scraping <= last_page:
    # Indicate that we can scrape the specified page
    can_scrape = True

    # Navigate to specified page
    while current_page_num != page_to_start_scraping:
        # Get available page numbers
        pagination = driver.find_element_by_xpath('.//div[@class="search_pagination"]')

        # Scroll down
        scroll_start = time.time()
        scroll_up_down_page(
            driver,
            by_how_much=22,  # controlls scrolling speed
            min_num_pauses=1,
            max_num_pauses=3,  # for no pauses, set min_num_pauses = max_num_pauses
            min_pause=0.1,
            max_pause=2.4,
            scroll_method="slow",
            scroll_direction="down",
        )
        scroll_duration = time.time() - scroll_start
        print(f"Scrolled for {scroll_duration:.2f} sec")

        # Get all page navigation web elements
        page_num_li = [
            elem
            for page_nav_items in [
                pagination.find_element_by_xpath(
                    f'.//div[@class="search_pagination_{nav_dir}"]'
                ).find_elements_by_tag_name("a")
                for nav_dir in ["left", "right"]
            ]
            for elem in page_nav_items
        ]
        can_move_back, can_move_forward = check_movement(pagination)
        current_page_num, curr_page_list, all_page_nums = get_pages(pagination, True)
        # print(current_page_num)

        if page_to_start_scraping not in all_page_nums:
            # If at the max page AND can move forward, then move forward
            if current_page_num == max(all_page_nums) and can_move_forward:
                move_from_max_page = page_num_li[-1]
                move_from_max_page.click()
                print(
                    f"Moved forward from max page ({max(all_page_nums)}) to "
                    f"page {max(all_page_nums)+1} ", end=""
                )
            # If not at the max page then move to the max page
            if current_page_num < max(all_page_nums):
                move_to_max_page = page_num_li[-3]
                move_to_max_page.click()
                print(f"Moved forward to max page ({max(all_page_nums)}) ", end="")
        else:
            # Click on one of the pages in view
            available_page_to_click = None
            for tag in page_num_li:
                if (
                    tag.text not in ["...", ">", "<"]
                    and int(tag.text) == page_to_start_scraping
                ):
                    available_page_to_click = tag
            print(f"Will click on page {int(available_page_to_click.text)} ", end="")
            available_page_to_click.click()
        pause_bw_moving = uniform(2.5, 3.5)
        print(f"(first pausing for {pause_bw_moving:.2f} seconds...", end="")
        time.sleep(pause_bw_moving)
        print("done.)")

        pagination = driver.find_element_by_xpath('.//div[@class="search_pagination"]')
        current_page_num, _, _ = get_pages(pagination, False)
        # print(current_page_num)
    else:
        print(f"At desired page {page_to_start_scraping}")
        time.sleep(uniform(2.8,4.5))
        print(f"Paused for {pause_bw_moving:.2f} seconds at the bottom of the desired page.")
else:
    # Indicate that we cannot scrape the specified page
    can_scrape = False

Scrolled for 0.63 sec
Can move back, Can move forward
Available page numbers=1,2,3,4, Current page=2, Max page=4
Moved forward to max page (4) (first pausing for 2.66 seconds...done.)
Scrolled for 0.31 sec
Can move back, Can move forward
Available page numbers=1,2,3,4,5,6, Current page=4, Max page=6
Moved forward to max page (6) (first pausing for 3.22 seconds...done.)
Scrolled for 8.03 sec
Can move back, Can move forward
Available page numbers=1,4,5,6,7,8, Current page=6, Max page=8
Moved forward to max page (8) (first pausing for 2.73 seconds...done.)
Scrolled for 1.51 sec
Can move back, Can move forward
Available page numbers=1,6,7,8,9,10, Current page=8, Max page=10
Moved forward to max page (10) (first pausing for 3.10 seconds...done.)
Scrolled for 6.86 sec
Can move back, Can move forward
Available page numbers=1,8,9,10,11,12, Current page=10, Max page=12
Moved forward to max page (12) (first pausing for 2.77 seconds...done.)
Scrolled for 0.33 sec
Can move back, Can move forward
A

Smoothly scroll until sort search results dropdown menu is in view

In [26]:
%%time
if can_scrape:
    dropdown_sort = driver.find_element_by_xpath('.//a[@class="trigger"]')
    driver = smooth_scroll_until_element_in_view(driver, dropdown_sort)
    time.sleep(uniform(1.9,3.8))

CPU times: user 2 ms, sys: 497 µs, total: 2.5 ms
Wall time: 3 s


<a id="scrape"></a>

## 8. [Scrape](#scrape)

In [27]:
%%time
if can_scrape:
    for page_num in page_numbers_to_scrape:
        # Scrape search results
        scrape_single_page_search_results(driver, selenium_data_dir)

        # Get all listings on search results page
        search_results_div = driver.find_elements_by_xpath(
            './/div[@id="search_resultsRows"]/a'
        )
        for k, search_result in enumerate(search_results_div[0:]):
            # Update driver
            if k != 0:
                search_result = driver.find_elements_by_xpath(
                    './/div[@id="search_resultsRows"]/a'
                )[k]
            # Get game title
            listing_info = search_result.find_element_by_xpath(
                './/div[@class="responsive_search_name_combined"]'
            )
            # print(1)
            title_os = listing_info.find_element_by_tag_name("div")
            title = (
                title_os.find_element_by_class_name(
                    "title"
                ).text.lower().split("\ngenre: ")[0].split("title: ")[-1].title()
            )
            title = re.sub(r"\W+", "", title.replace(' ', '_'))
            fname = f"p{page_num}_l{k+1}_{title}.csv"

            # Get filepath to where file should be saved
            listing_filepath = os.path.join(selenium_data_dir, fname)

            # Scrape listing, if not prevoiusly done
            if not glob(os.path.join(selenium_data_dir, f"*{title.replace(' ', '_')}.csv")):
                # search_result = search_results_div[0]
                actions = ActionChains(driver)

                # Move mouse cursor to listing
                actions.move_to_element(search_result).perform()
                driver = smooth_scroll_until_element_in_view(driver, search_result)

                # Click on listing
                search_result.click()
                print(
                    f"Navigated to and clicked on link for listing {k+1} ({title})...",
                    end="",
                )
                post_click_pause_duration = uniform(3.5, 6.1)
                print(f"Pausing for {post_click_pause_duration:.3f} seconds...", end="")
                time.sleep(post_click_pause_duration)
                print("done.")

                # Get through age check, if necessary
                try:
                    age_container = driver.find_element_by_xpath(
                        './/div[@class="agegate_text_container"]'
                    )
                    _ = age_container.find_element_by_xpath(".//h2").text
                    driver, age_entry = enter_age(driver)
                    print(f"Passed through age check for listing {k+1} ({title})")
                except Exception:
                    age_entry = False
                    print(f"No age check for listing {k+1} ({title})")

                start_time = time.time()
                # Generate a random integer to determine scrolling behaviour
                scroll_int = randint(1, 100)

                # Scroll down to update user-review stats
                pre_scroll_down_pause_duration = uniform(1.1, 1.9)
                time.sleep(pre_scroll_down_pause_duration)
                # Scroll smoothly to bottom of page
                scroll_up_down_page(
                    driver,
                    by_how_much=22,
                    min_num_pauses=1,
                    max_num_pauses=3,
                    min_pause=2.5,
                    max_pause=3.9,
                    scroll_method="slow",
                    scroll_direction="down",
                )
                post_scroll_down_pause_duration = uniform(1.5, 2.9)
                time.sleep(post_scroll_down_pause_duration)
                print(
                    f"Paused for {pre_scroll_down_pause_duration:.2f} seconds. "
                    "Scrolled to the bottom of the page."
                )

                # Scraping code
                driver = scrape_listing(driver, k+1, page_num, selenium_data_dir)

                # Scroll up
                pre_scroll_up_pause_duration = uniform(2.5, 3.9)
                time.sleep(pre_scroll_up_pause_duration)
                print(f"Paused for {pre_scroll_up_pause_duration:.2f} seconds.")
                if scroll_int < randint(1, 45):
                    # Scroll up until Install button is in view
                    try:
                        comm_hub_div = driver.find_element_by_xpath(
                            './/div[@class="apphub_OtherSiteInfo"]'
                        )
                        driver = smooth_scroll_until_element_in_view(driver, comm_hub_div)
                        print("Scrolled up until Community Hub button is in view.")
                    except NoSuchElementException:
                        install_steam_btn = driver.find_element_by_xpath(
                            './/a[@class="header_installsteam_btn_content"]'
                        )
                        driver = smooth_scroll_until_element_in_view(driver, install_steam_btn)
                        print("Scrolled up until Install Steam button is in view.")
                else:
                    # Scroll up until store homepage logo is in view
                    home_logo = driver.find_element_by_xpath('.//span[@id="logo_holder"]')
                    driver = smooth_scroll_until_element_in_view(driver, home_logo)
                    print("Scrolled up until store homepage logo is in view.")

                time_on_page = time.time() - start_time
                print(
                    f"Scraped since file p*_l*_{title}.csv was not found.\n"
                    f"Time spent on page = {time_on_page:.2f} seconds.\n"
                )
                # Go back to search results page
                if age_entry:
                    driver.back()
                    driver.back()
                else:
                    driver.back()
                # Pause on the search results page
                list_page_pause = uniform(1.4, 3.1)
                time.sleep(list_page_pause)
                print(f"Returned to search results page and paused for {list_page_pause:.2f} seconds.")
            else:
                print(f"File for listing {title} was found. Did nothing.\n")
    time.sleep(uniform(4.5,5.9))

Retrieved listings from search results page 50.
Saving data to /home/edesz/Downloads/web-scraping/data/raw/selenium/search_results_page_50_20211022_141007.parquet.gzip...done.
Exported search results for page 50.

File for listing Nuclear_Blaze was found. Did nothing.

File for listing Troubleshooter_Abandoned_Children was found. Did nothing.

File for listing Nostale__Anime_Mmorpg was found. Did nothing.

File for listing Stationeers was found. Did nothing.

File for listing Halo_Wars_Definitive_Edition was found. Did nothing.

Navigated to and clicked on link for listing 6 (112_Operator)...Pausing for 4.822 seconds...done.
No age check for listing 6 (112_Operator)
Paused for 1.38 seconds. Scrolled to the bottom of the page.
Starting with listing 6
Scraped game title for listing 6 (112_Operator)
Scraped listing 6
Exported p50_l6_112_Operator to CSV file
Done with listing 6 in 1.843 sec.
Paused for 3.68 seconds.
Paused for 3.68 seconds. Scrolled up until store homepage logo is in view.

<a id="examine-collected-outputs"></a>

## 9. [Examine collected outputs](#examine-collected-outputs)

Get a list of the created CSV filepaths

In [28]:
csv_file_list = glob(os.path.join(selenium_data_dir, "*.csv"))
csv_file_list.sort(key=lambda f: int(re.sub("\D", "", f)))
print(f"Found {len(csv_file_list)} CSV files of listings.")

Found 874 CSV files of listings.


Concatenate all CSV files into a single DataFrame

In [29]:
%%time
df_listings = pd.concat(
    [pd.read_csv(f) for f in csv_file_list],
    ignore_index=True,
).drop_duplicates()
cols_to_hide = ["user_defined_tags", "languages"]
show_df(df_listings.drop(columns=cols_to_hide), 1)
show_df_dtypes_nans(df_listings)

Unnamed: 0,review_type_all,overall_review_rating,pct_overall,pct_overall_threshold,pct_overall_lang,pct_overall_threshold_lang,platforms,num_steam_achievements,drm,rating,rating_descriptors,review_type_positive,review_type_negative,review_language_mine,Title,Genre,Release Date,Early Access Release Date,Developer,Publisher,Franchise,num_languages,page_num,listing_num
0,1014.0,Very Positive,93.0,positive,92.0,positive,win,30.0,,,,938.0,76.0,412.0,The Riftbreaker,"Action, Adventure, Indie, RPG, Simulation, Strategy","14 Oct, 2021",,EXOR Studios,"EXOR Studios, Surefire.Games","EXOR Studios, surefiregames",10.0,2,1
873,13373.0,Very Positive,86.0,positive,89.0,positive,win,47.0,,m,"Blood, Strong Language, Suggestive Themes, Violence",11544.0,1829.0,7219.0,NieR Replicant™ ver.1.22474487139...,"Action, Adventure, RPG","23 Apr, 2021",,"Square Enix, Toylogic Inc.",Square Enix,,9.0,14,20


Unnamed: 0,num_missing,dtype
review_type_all,15,float64
overall_review_rating,15,object
pct_overall,15,float64
pct_overall_threshold,15,object
pct_overall_lang,16,float64
pct_overall_threshold_lang,16,object
platforms,33,object
user_defined_tags,2,object
num_steam_achievements,203,float64
drm,586,object


CPU times: user 1.98 s, sys: 19.3 ms, total: 2 s
Wall time: 1.99 s


<a id="(optional)-convert-single-page-csv-files-to-`parquet`-file"></a>

## 10. [(Optional) Convert single-page CSV files to `parquet` file](#(optional)-convert-single-page-csv-files-to-`parquet`-file)

Create dictionary of page numbers and `.parquet` filepaths

In [30]:
page_nums_available = list(
    set(
        [
            int(re.findall(r"\d+", os.path.basename(f).split("_", 2)[0])[0])
            for f in glob(os.path.join(selenium_data_dir, "p*_*.csv"))
        ]
    )
)
print(len(page_nums_available))

49


In [31]:
%%time
dict_dfs_fpaths = {}
for page_num in page_nums_available:
    timestr = time.strftime("%Y%m%d_%H%M%S")
    parquet_filepath = os.path.join(selenium_data_dir, f"listings_page_{page_num}_{timestr}.parquet")
    dict_dfs_fpaths[parquet_filepath] = pd.concat(
        [pd.read_csv(f) for f in glob(os.path.join(selenium_data_dir, f"p{page_num}_*.csv"))],
        ignore_index=True,
    ).drop_duplicates().astype({"Publisher": str})
show_df(pd.concat(list(dict_dfs_fpaths.values()), ignore_index=True).drop(columns=cols_to_hide), 2)

Unnamed: 0,review_type_all,overall_review_rating,pct_overall,pct_overall_threshold,pct_overall_lang,pct_overall_threshold_lang,platforms,num_steam_achievements,drm,rating,rating_descriptors,review_type_positive,review_type_negative,review_language_mine,Title,Genre,Release Date,Early Access Release Date,Developer,Publisher,Franchise,num_languages,page_num,listing_num
0,32144.0,Very Positive,90.0,positive,91.0,positive,win,191.0,"Requires agreement to a 3rd-party EULA, STAR WARS™: The Old Republic™ EULA",t,"Blood and Gore, Mild Language, Sexual Themes, Violence",29223.0,2921.0,22042.0,STAR WARS™: The Old Republic™,"Free to Play, Massively Multiplayer, RPG","20 Dec, 2011",,BioWare,Electronic Arts,Star Wars,3.0,2,23
1,1014.0,Very Positive,93.0,positive,92.0,positive,win,30.0,,,,938.0,76.0,412.0,The Riftbreaker,"Action, Adventure, Indie, RPG, Simulation, Strategy","14 Oct, 2021",,EXOR Studios,"EXOR Studios, Surefire.Games","EXOR Studios, surefiregames",10.0,2,1
872,8726.0,Overwhelmingly Positive,99.0,positive,99.0,positive,"win, mac, linux",8.0,,e,,8661.0,65.0,6190.0,A Short Hike,"Adventure, Indie","30 Jul, 2019",,adamgryu,adamgryu,,5.0,50,20
873,3352.0,Mixed,83.0,positive,80.0,positive,"win, vr_required",,,,,2713.0,639.0,2747.0,,,,,,,,1.0,50,13


CPU times: user 1.97 s, sys: 24.3 ms, total: 1.99 s
Wall time: 1.99 s


Combine all CSVs for a single page of listings into a single `.parquet` file per page

In [32]:
# save_to_parquet_file(list(dict_dfs_fpaths.values()), list(dict_dfs_fpaths.keys()))

<a id="close-browser"></a>

## 11. [Close Browser](#close-browser)

(OPTIONAL Post-Scrape action) Option 1/3 - Smoothly scroll up until store homepage logo is visible

In [33]:
%%time
if "click_store_homepage_logo" in post_scrape_movements:
    store_home_button = driver.find_element_by_xpath('.//span[@id="logo_holder"]')
    driver = smooth_scroll_until_element_in_view(driver, store_home_button)
    time.sleep(uniform(4.8,5.2))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs


(OPTIONAL Post-Scrape action) Option 1/3 - Move the mouse cursor over the store homepage logo

In [34]:
%%time
if "click_store_homepage_logo" in post_scrape_movements:
    time.sleep(uniform(2.4, 8.5))
    store_home_button = driver.find_element_by_xpath('.//span[@id="logo_holder"]')
    actions = ActionChains(driver)
    actions.move_to_element(store_home_button).perform()
    time.sleep(uniform(4.8,8.6))

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 3.34 µs


(OPTIONAL Post-Scrape action) Option 1/3 - Click the store homepage logo to go to the Steam store homepage

In [35]:
%%time
if "click_store_homepage_logo" in post_scrape_movements:
    store_home_button.click()
    time.sleep(uniform(5.8, 10.9))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 2.86 µs


(OPTIONAL Post-Scrape action) Option 2/3 - Randomly hover of each of the four sections in the *BROWSE STEAM* section

In [36]:
%%time
if "click_store_homepage_logo" in post_scrape_movements:
    if "hover_over_browse_steam_section" in post_scrape_movements:
        browse_steam_section = driver.find_element_by_xpath(
            './/div[@class="big_buttons home_page_content"]'
        )
        driver = smooth_scroll_until_element_in_view(driver, browse_steam_section)
        browse_headings = driver.find_element_by_xpath(
            './/div[@class="big_buttons home_page_content"]'
        ).find_elements_by_xpath('.//div[@class="button_container"]/a')
        shuffle(browse_headings)
        for browse_heading in browse_headings:
            actions = ActionChains(driver)
            actions.move_to_element(browse_heading).perform()
            hover_duration = uniform(5.2,9.8)
            print(f"Hovering over {browse_heading.text} for {hover_duration:.2f} sec...", end="")
            time.sleep(hover_duration)
            print("done.")
        time.sleep(uniform(3.2,9.4))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs


(OPTIONAL Post-Scrape action) Option 3/3 - Randomly scroll smoothly up until store homepage logo is visible

In [37]:
%%time
if randint(1, 5) == 4 and "scroll_until_store_homepage_logo_viewable" in post_scrape_movements:
    store_home_button = driver.find_element_by_xpath('.//span[@id="logo_holder"]')
    driver = smooth_scroll_until_element_in_view(driver, store_home_button)
time.sleep(uniform(7.5,10.9))

CPU times: user 1.54 ms, sys: 114 µs, total: 1.66 ms
Wall time: 9.61 s


Close the browser

In [38]:
driver.quit()

---

<span style="float:left">
    <a href="./1_eda_aggregated.ipynb"><< 1 - Exploratory Data Analysis of Aggregated data</a>
</span>

<span style="float:right">
    <a href="./3_requests_download.ipynb">3 - Downloading with requests library >></a>
</span>