# Scraping search results with the `requests` library

In [1]:
# %load_ext lab_black
%load_ext autoreload
%autoreload 2

Import Python modules

In [2]:
import os
import re
import time
from glob import glob
from random import choice, uniform
from zipfile import ZipFile

import requests
import pandas as pd
from bs4 import BeautifulSoup

Import any custom modules

In [3]:
# Manually assembled list of browser headers to submit in a GET request
%aimport src.webscraping_utils
from src.webscraping_utils import get_custom_headers_list

<a id="table-of-contents"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Scrape](#scrape)
3. [Examine collected outputs](#examine-collected-outputs)

<a id="about"></a>

## 0. [About](#about)

In this notebook, we will iterate over pages of search results for game listings on the Steam store using the `requests` library. We will get the HTML for each page of search results and then scrape the title, listing URL, supported platforms, release date, discount percent and price(s) from each listing on each page. The scraping process will be described later in this notebook.

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

Define variables that can be changed when running this notebook

In [4]:
# Root directory of the project, taken to be the current working directory
# NOTE(review): this assumes the notebook is launched from the project root - confirm
PROJ_ROOT_DIR = os.getcwd()

In [13]:
# Cookies sent along with each request to the Steam store, so that listings
# with an age requirement are accessible
cookies = {
    "birthtime": "283993201",
    "mature_content": "1",
    "lastagecheckage": "14-0-1973",
}

# First page of search results to scrape, and how many pages to scrape from there
page_to_start_scraping = 50
number_pages_to_scrape = 50

# Lower and upper bound (in seconds) of the random delay applied before
# moving on to the next page of search results
min_pause_between_pages = 2.8
max_pause_between_pages = 4.2

Define variables that depend on the variables defined above
- get paths to directories where the Parquet files produced by scraping (one file per page of search results) will be stored
- adjust the number of pages of search results to be scraped
  - this will depend on the maximum number of pages available and on the search results page numbers that are to be scraped (defined above in `page_to_start_scraping` and `number_pages_to_scrape`)
    - Step 1. extract the maximum number of pages available, from the soup for the first page of search results
    - Step 2. reduce `number_pages_to_scrape` if the sum of `page_to_start_scraping` and `number_pages_to_scrape` is larger than the maximum number of pages of search results available
- create a list of manually assembled dictionaries of browser headers, using the custom module imported earlier

In [14]:
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir = os.path.join(data_dir, "raw")
requests_data_dir = os.path.join(raw_data_dir, "requests")
# Create the output directory up front: writing a Parquet file into a missing
# directory would otherwise fail only after the first page had been requested
os.makedirs(requests_data_dir, exist_ok=True)

# Adjust number of pages to scrape, based on last available page number
# # Step 1. Get first page of search results and determine last available page number
# - the query (English-language listings only) matches the default one used when
#   scraping, so the page count refers to the same set of search results
# - a timeout is set so that an unresponsive server cannot hang the notebook
r = requests.get(
    "https://store.steampowered.com/search/?"
    "category1=998&supportedlang=english&page=1",
    cookies=cookies,
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
page_right_container = soup.find("div", {"class": "search_pagination_right"})
if page_right_container is None:
    raise RuntimeError(
        "Could not find the pagination links on the first page of search results."
    )
page_links = page_right_container.find_all("a")
if len(page_links) < 2:
    raise RuntimeError(
        "Could not determine the last available page of search results."
    )
# The last page number is read from the second-to-last pagination link
# NOTE(review): presumably the final link is the "next page" control - verify
# against the page markup. Non-digit characters (e.g. thousands separators) are dropped.
last_page = int(re.sub(r"\D", "", page_links[-2].text))

# # Step 2. Adjust number of pages to scrape, based on last available page number
# - the last page that would be scraped is
#   page_to_start_scraping + number_pages_to_scrape - 1
# - if this exceeds the last available page number, then scrape only up to (and
#   including) the last available page; never go below zero pages
if page_to_start_scraping + number_pages_to_scrape - 1 > last_page:
    number_pages_to_scrape = max(last_page - page_to_start_scraping + 1, 0)

# Define list of request headers to (randomly) choose from
headers_list = get_custom_headers_list()

Export list of `DataFrame`s to parquet files

In [7]:
def save_to_parquet_file(dfs: list, parquet_filepaths: list):
    """Write each DataFrame to its own gzip-compressed Parquet file.

    Parameters
    ----------
    dfs : list
        DataFrames to be saved.
    parquet_filepaths : list
        Output file path for each DataFrame, paired with `dfs` by position.
        The suffix ".gzip" is appended to every path before writing.

    Raises
    ------
    Exception
        Any error raised while saving is printed and then re-raised.
    """
    # Options shared by every file that is written
    write_options = {"engine": "auto", "index": False, "compression": "gzip"}
    for df, parquet_filepath in zip(dfs, parquet_filepaths):
        try:
            gzip_filepath = parquet_filepath + ".gzip"
            print(f"Saving data to {gzip_filepath}", end="...")
            df.to_parquet(gzip_filepath, **write_options)
            print("done.")
        except Exception as e:
            # Show the reason for the failure, then let the error propagate
            print(str(e))
            raise

Define function to perform scraping of single search results page, using the following steps
1. Try to get one or more rows of search results
   - if this is not possible, then no search results were returned (don't scrape this)
     - assign attributes to this page number with `None` in all the expected columns
   - if this is possible, then scrape search results page (see next step for details)
2. If one or more rows of search results is displayed, then iterate over each row of search results and use `BeautifulSoup` to scrape the displayed information on the web page
3. Export scraped attributes to a Parquet file

In [9]:
def _empty_listing_record(current_page_num, request_status_code, listing_counter):
    """Return a placeholder record in which every scraped attribute is None."""
    return {
        "page": current_page_num,
        "request_status_code": request_status_code,
        "listing_counter": listing_counter,
        "title": None,
        "url": None,
        "platform_names": None,
        "release_date": None,
        "discount_pct": None,
        "original_price": None,
        "discount_price": None,
    }


def _scrape_listing_prices(listing_info):
    """Return (discount_pct, original_price, discount_price) for one listing."""
    price_block = listing_info.find("div", class_="search_price_discount_combined")
    price_divs = price_block.find_all("div")

    # Discount percent is the text of the first div (None if that text is empty)
    discount_pct = (price_divs[0].text.strip() or None) if price_divs else None

    try:
        # Text containing exactly two "$" signs splits into three parts: an
        # (ignored) prefix, the original price and the discounted price
        price_text = price_block.find("div", class_="search_price").text.strip()
        _, original_price, discount_price = price_text.split("$")
        original_price = original_price.strip()
        discount_price = discount_price.strip()
    except (AttributeError, ValueError):
        # Any other price text is treated as a single (undiscounted) price,
        # read from the second div with the "$" sign removed
        fallback_text = price_divs[1].text.strip() if len(price_divs) > 1 else ""
        original_price = fallback_text.replace("$", "") or None
        discount_price = None
    return discount_pct, original_price, discount_price


def _scrape_listing(search_result):
    """Return (app_id, scraped attributes) for one row of search results."""
    listing_info = search_result.find(
        "div", class_="responsive_search_name_combined"
    )

    # Get title (spaces become underscores, other non-word characters are dropped)
    title_os = listing_info.find("div", class_="col search_name ellipsis")
    title = title_os.find("span", class_="title").text.strip()
    title = re.sub(r"\W+", "", title.replace(" ", "_"))

    # Get app_id and listing URL
    app_id = search_result["data-ds-appid"]
    url = f"https://store.steampowered.com/app/{app_id}/{title}/"

    # Get supported platforms, as a comma-separated string of the last CSS class
    # of each span; None (not a list) if unavailable, so that the column only
    # ever holds strings or missing values
    try:
        platform_spans = title_os.find("p").find_all("span")
        platform_names = ",".join(span["class"][-1] for span in platform_spans)
    except (AttributeError, KeyError, IndexError, TypeError):
        platform_names = None

    # Get release date
    released_div = listing_info.find(
        "div", class_="col search_released responsive_secondrow"
    )
    release_date = released_div.text.strip() if released_div is not None else None

    # Get discount percent, original price and discount price (if any)
    discount_pct, original_price, discount_price = _scrape_listing_prices(
        listing_info
    )

    attributes = {
        "title": title,
        "url": url,
        "platform_names": platform_names,
        "release_date": release_date,
        "discount_pct": discount_pct,
        "original_price": original_price,
        "discount_price": discount_price,
    }
    return app_id, attributes


def scrape_single_page_search_results(
    soup,
    raw_data_dir,
    current_page_num,
    request_status_code,
    verbose=False,
    listings_per_page=25,
):
    """Scrape a single page of search results and export it to a Parquet file.

    Parameters
    ----------
    soup
        Parsed HTML of the search results page; only its `find` method is used.
    raw_data_dir : str
        Directory in which the Parquet file for this page is written.
    current_page_num : int
        Page number of the search results, stored in every record and used in
        the output file name.
    request_status_code : int
        Status code of the response for this page, stored in every record.
    verbose : bool
        If True, print the attributes scraped from each listing.
    listings_per_page : int
        Number of placeholder records written when the page has no listings.

    Notes
    -----
    - If the page has no listings, `listings_per_page` placeholder records
      (all scraped attributes set to None) are written.
    - If a single listing cannot be scraped, a placeholder record is written for
      that listing only, the error is printed, and scraping continues with the
      next listing on the page.
    - The export is done with `save_to_parquet_file`, defined elsewhere in this
      notebook, which appends ".gzip" to the file path it is given.
    """
    # 1. Get all search results displayed on page (none if container is missing)
    results_container = soup.find("div", {"id": "search_resultsRows"})
    search_results = (
        results_container.find_all("a", class_="search_result_row")
        if results_container is not None
        else []
    )

    d_search_results = []
    if not search_results:
        # Handle case of no search results
        d_search_results = [
            _empty_listing_record(
                current_page_num, request_status_code, listing_counter
            )
            for listing_counter in range(1, listings_per_page + 1)
        ]
        print("No listings on search results page " f"{current_page_num}.\n")
    else:
        # 2. Iterate over all rows of search results found on page
        num_failed = 0
        for listing_counter, search_result in enumerate(search_results, start=1):
            record = _empty_listing_record(
                current_page_num, request_status_code, listing_counter
            )
            try:
                app_id, attributes = _scrape_listing(search_result)
            except Exception as exc:
                # Best effort: keep the placeholder for this listing, report the
                # reason and carry on with the remaining listings on the page
                num_failed += 1
                d_search_results.append(record)
                print(
                    f"Error retrieving listing {listing_counter} from search "
                    f"results page {current_page_num}: {exc!r}"
                )
                continue

            # print summary message to screen
            if verbose:
                print(
                    current_page_num,
                    request_status_code,
                    listing_counter,
                    attributes["title"],
                    app_id,
                    attributes["platform_names"],
                    attributes["release_date"],
                    attributes["discount_pct"],
                    attributes["original_price"],
                    attributes["discount_price"],
                )

            # Append dict of scraped information to list
            record.update(attributes)
            d_search_results.append(record)

        if num_failed:
            print(
                "Error retrieving listings from search results page "
                f"{current_page_num}."
            )
        else:
            print(
                "Retrieved listings from search results page "
                f"{current_page_num}."
            )

    # 3. Create DataFrame from list of dicts and export to Parquet
    df_single_page_search_results = pd.DataFrame.from_records(d_search_results)
    timestr = time.strftime("%Y%m%d_%H%M%S")
    parquet_filepath = os.path.join(
        raw_data_dir,
        f"search_results_page_{current_page_num}_{timestr}.parquet",
    )
    # The file on disk carries the ".gzip" suffix added by save_to_parquet_file,
    # so that is the path whose existence must be checked
    if not os.path.exists(parquet_filepath + ".gzip"):
        save_to_parquet_file([df_single_page_search_results], [parquet_filepath])
        print(f"Exported search results for page {current_page_num}.\n")
    else:
        print(
            "File was found with search results information for page "
            f"{current_page_num}. Did nothing.\n"
        )

We'll now scrape all search results that are displayed. We'll define a `while` loop that compares the number of search results found to the number to be scraped (calculated from `page_to_start_scraping` and `number_pages_to_scrape`). This comparison will be used to determine whether to proceed with the next iteration of the `while` loop (if the comparison returns `True`) or not (if it returns `False`). If okay to move ahead, then the loop will move to the next page of search results.

This process includes the following two steps
1. If number of search results found is less than the total number of search results to be scraped (see `page_to_start_scraping` and `number_pages_to_scrape`), then continue (and increment the page number by 1); else break out of the `while` loop.
2. If able to continue from step 1. then
   - (a) get a random header from the list of headers above
   - (b) send a GET request to get HTML from page
   - (c) Pause before scraping
   - (d) Scrape HTML to get listing attributes

In [11]:
def scrape_search_results_pages(
    raw_data_dir,
    cookies={
        'birthtime': '283993201',
        'mature_content': '1',
        "lastagecheckage":"1-0-1900",
    },
    base_url=(
        "https://store.steampowered.com/search/?"
        "category1=998&supportedlang=english&page="
    ),
    n=100,
    page=500,
    min_pause_between_pages=1.5,
    max_pause_between_pages=2.5,
    verbose=False,
    listings_per_page=25,
    request_timeout=30,
):
    """Scrape consecutive pages of search results.

    Pages are requested one after the other, starting from page `page + 1`,
    for as long as `page * listings_per_page < n`. With the default values of
    `n` and `page` this condition is false from the start, so callers are
    expected to pass both.

    Parameters
    ----------
    raw_data_dir : str
        Directory passed on to `scrape_single_page_search_results`, which
        writes the output for each page.
    cookies : dict
        Cookies sent with every request. The default dict is only read, never
        modified.
    base_url : str
        URL of the search results, to which the page number is appended.
    n : int
        Index of the last search result wanted.
    page : int
        Page number preceding the first page to be scraped.
    min_pause_between_pages, max_pause_between_pages : float
        Bounds (in seconds) of the random pause made after each request.
    verbose : bool
        Passed on to `scrape_single_page_search_results`.
    listings_per_page : int
        Number of listings on one page of search results.
    request_timeout : float
        Timeout (in seconds) of each request, so that an unresponsive server
        cannot block the loop indefinitely.

    Notes
    -----
    Request headers are chosen at random from the module-level `headers_list`,
    which must be defined before this function is called.
    """
    # A single session is used for all pages, so that the underlying connection
    # can be reused from one request to the next
    with requests.Session() as session:
        # 1. Check number of search results collected so far against the index
        #    of the last search result wanted
        while page * listings_per_page < n:
            page += 1
            # 2. (a) Get random request header
            headers = choice(headers_list)
            # 2. (b) Send GET request
            response = session.get(
                url=base_url + str(page),
                cookies=cookies,
                headers=headers,
                timeout=request_timeout,
            )
            # 2. (c) Pause before scraping
            print(f"Received response to request from page {page}...", end="")
            pause_duration = uniform(
                min_pause_between_pages, max_pause_between_pages
            )
            print(f"Pausing for {pause_duration:.2f} sec...", end="")
            time.sleep(pause_duration)
            print("done.")
            # 2. (d) Scrape search results
            soup = BeautifulSoup(response.content, "html.parser")
            scrape_single_page_search_results(
                soup,
                raw_data_dir,
                page,
                response.status_code,
                verbose,
            )
            # Search results on this page are numbered from the one after the
            # last result of the previous page, up to the last one of this page
            first_result = (page - 1) * listings_per_page + 1
            last_result = page * listings_per_page
            print(
                f"Completed page {page} [collected {first_result}-{last_result} search results "
                f"(max. search result index wanted = {n})].\n"
            )

<a id="scrape"></a>

## 2. [Scrape](#scrape)

Calculate the number of search results that will be retrieved
- there are 25 listings displayed on a single page of search results

In [15]:
# Index of the last search result wanted: the last page to be scraped is
# page_to_start_scraping + number_pages_to_scrape - 1, with 25 listings per page
ending_search_result = 25 * (number_pages_to_scrape + page_to_start_scraping - 1)
print(ending_search_result)

2475


Scrape the number of listings required (see `number_pages_to_scrape` defined earlier), starting from the required starting page number (see `page_to_start_scraping`)

In [16]:
%%time
# The cookies defined in the user inputs are passed explicitly; without this,
# the function falls back to its own default cookies and the user-defined
# ones are never sent
scrape_search_results_pages(
    requests_data_dir,
    cookies=cookies,
    n=ending_search_result,
    page=page_to_start_scraping - 1,
    min_pause_between_pages=min_pause_between_pages,
    max_pause_between_pages=max_pause_between_pages,
    verbose=True,
)

Received response to request from page 50...Pausing for 4.01 sec...done.
50 200 1 Blazing_Sails 1158940 win Nov 5, 2020 -30% 14.99 10.49
50 200 2 LEGO_Harry_Potter_Years_14 21130 win Jun 25, 2010 None 19.99 None
50 200 3 Source_of_Madness 1315610 win Sep 22, 2021 None 16.99 None
50 200 4 Roll 1585910 win,mac May 13, 2021 None 4.99 None
50 200 5 Cloud_Gardens 1372320 win,mac Sep 1, 2021 None 17.99 None
50 200 6 Soul_Nomad__the_World_Eaters 1535610 win Aug 31, 2021 None 19.99 None
50 200 7 Yakuza_5_Remastered 1105510 win Jan 28, 2021 None 19.99 None
50 200 8 Maid_of_Sker 826940 win Jul 28, 2020 -55% 24.99 11.24
50 200 9 Cultist_Simulator 718670 win,mac,linux May 31, 2018 None 19.99 None
50 200 10 Sakuna_Of_Rice_and_Ruin 1356670 win Nov 10, 2020 None 39.99 None
50 200 11 Shovel_Knight_Treasure_Trove 250760 win,mac,linux Jun 26, 2014 None 39.99 None
50 200 12 Max_Payne_3 204100 win May 31, 2012 None 19.99 None
50 200 13 The_Sinking_City 750130 win Feb 26, 2021 None 39.99 None
50 200 14 Blo

<a id="examine-collected-outputs"></a>

## 3. [Examine collected outputs](#examine-collected-outputs)

Get a list of the created Parquet filepaths

In [17]:
parquet_file_list = glob(os.path.join(requests_data_dir, "*.parquet.gzip"))
# Sort by the digits in the file name (page number followed by timestamp)
# - a raw string is used for the pattern, since "\D" is not a valid escape
#   sequence in a normal string literal
# - only the file name is used, so digits in the directory path play no role
# - a file name without any digits sorts first instead of raising an error
parquet_file_list.sort(
    key=lambda f: int(re.sub(r"\D", "", os.path.basename(f)) or 0)
)
print(f"Found {len(parquet_file_list)} Parquet files of search results.")

Found 50 Parquet files of search results.


Vertically concatenate all Parquet files into a single `DataFrame` and show this output

In [18]:
%%time
# Combine the per-page files into one DataFrame, dropping fully duplicated rows
df_search_results = pd.concat(
    [pd.read_parquet(f, engine="auto") for f in parquet_file_list],
    ignore_index=True,
).drop_duplicates()
# Preview the first and last five rows
# - pd.concat is used since DataFrame.append is no longer available in pandas 2.0+
display(pd.concat([df_search_results.head(5), df_search_results.tail(5)]))
# Number of missing values and datatype of each column, side by side
display(
    df_search_results.isna().sum().rename("num_missing").to_frame().merge(
        df_search_results.dtypes.rename("dtype").to_frame(),
        left_index=True,
        right_index=True,
    )
)

Unnamed: 0,page,request_status_code,listing_counter,title,url,platform_names,release_date,discount_pct,original_price,discount_price
0,50,200,1,Blazing_Sails,https://store.steampowered.com/app/1158940/Bla...,win,"Nov 5, 2020",-30%,14.99,10.49
1,50,200,2,LEGO_Harry_Potter_Years_14,https://store.steampowered.com/app/21130/LEGO_...,win,"Jun 25, 2010",,19.99,
2,50,200,3,Source_of_Madness,https://store.steampowered.com/app/1315610/Sou...,win,"Sep 22, 2021",,16.99,
3,50,200,4,Roll,https://store.steampowered.com/app/1585910/Roll/,"win,mac","May 13, 2021",,4.99,
4,50,200,5,Cloud_Gardens,https://store.steampowered.com/app/1372320/Clo...,"win,mac","Sep 1, 2021",,17.99,
1245,99,200,21,Sid_Meiers_Civilization_III_Complete,https://store.steampowered.com/app/3910/Sid_Me...,win,"Oct 25, 2006",,4.99,
1246,99,200,22,FATE_Undiscovered_Realms,https://store.steampowered.com/app/276890/FATE...,win,"Apr 17, 2014",,7.99,
1247,99,200,23,Haven_Park,https://store.steampowered.com/app/1549550/Hav...,"win,mac,linux","Aug 5, 2021",,8.99,
1248,99,200,24,Peggle_Nights,https://store.steampowered.com/app/3540/Peggle...,"win,mac","Oct 15, 2008",,4.99,
1249,99,200,25,Internet_Cafe_Simulator,https://store.steampowered.com/app/1136160/Int...,win,"Oct 25, 2019",-50%,9.99,4.99


Unnamed: 0,num_missing,dtype
page,0,int64
request_status_code,0,int64
listing_counter,0,int64
title,0,object
url,0,object
platform_names,0,object
release_date,0,object
discount_pct,1180,object
original_price,7,object
discount_price,1180,object


CPU times: user 152 ms, sys: 21.3 ms, total: 173 ms
Wall time: 424 ms


---

<a href="./2_selenium.ipynb"><< 2 - Navigating webstore with Selenium webdriver</a>

<span style="float:right">
    <a href="./4_filter_requests_listings.ipynb">4 - Filter search results scraped with the requests library >></a>
</span>