# Scraping listings with the `requests` library

In [1]:
# %load_ext lab_black
%load_ext autoreload
%autoreload 2

Import Python modules

In [2]:
import datetime
import os
import re
import time
from collections import OrderedDict
from glob import glob
from random import choice, uniform, random, randint, randrange

import pandas as pd
import requests
from bs4 import BeautifulSoup

Import any custom modules

In [3]:
# Beautifulsoup modules to extract listing attributes
%aimport src.bs4_helpers
import src.bs4_helpers as bsh

# Dictionary with key-value pairs to use if a scrape fails
%aimport src.failure_records
from src.failure_records import dict_failed_extraction_from_listing_page

# Manually assembled list of browser headers to submit in a GET request
%aimport src.webscraping_utils
from src.webscraping_utils import get_custom_headers_list

<a href="table-of-contents"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Load relevant columns from search results](#load-relevant-columns-from-search-results)
3. [Scrape](#scrape)
4. [Examine collected outputs](#examine-collected-outputs)

<a id="about"></a>

## 0. [About](#about)

In this notebook, we will iterate over the rows of the CSV file created by `5_filter_requests_listings.ipynb` and scrape the listing data in the URL column of that row. The scraping process will be described later in this notebook.

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

Define variables that can be changed when running this notebook

In [4]:
PROJ_ROOT_DIR = os.getcwd()

In [5]:
# Cookies to be sent to Steam store to get access to listings that have an age requirement
cookies = {"mature_content": "1", "lastagecheckage": "14-0-1973"}

# Page numbers from CSV file to be scraped
pages_to_scrape = list(range(506, 550+1))

# Delay between sending request to successive listing URLs
min_pause_between_listings = 3.0
max_pause_between_listings = 5.4

# Delay between scraping listings from different pages of the page_num column in CSV file
min_pause_between_pages = 0.4
max_pause_between_pages = 0.8

Define variables that depend on the variables defined above
- get paths to directories where single-row CSV files (produced after scraping) will be stored
- create a list of manually assembled dictionaries of browser header, using the custom module imported earlier

In [6]:
# Path to data/raw/requests
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir = os.path.join(data_dir, "raw")
requests_data_dir = os.path.join(raw_data_dir, "requests")

# Create filepath for CSV containing non-duplicated listing URLs (created in
# 5_filter_requests_listings.ipynb) to be scraped
requests_listings_to_scrape_filepath = os.path.join(
    requests_data_dir,
    "requests_listings_to_scrape.csv",
)

# Get list of dicts of requests headers (when making a single
# request, a random header dict from this list will be chosen)
headers_list = get_custom_headers_list()

Define custom function to create a random birth date to append to request cookies ([link](https://stackoverflow.com/a/64072787/4057186))

In [7]:
def random_human_readable_timestamp_to_unix(
    start_date=datetime.datetime(1980, 4, 6),
    end_date=datetime.datetime(2000, 12, 7),
):
    """Create a random UNIX timestamp to use as birthdate in a request cookie."""
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = randrange(days_between_dates)
    random_date = start_date + datetime.timedelta(days=random_number_of_days)
    # print(random_date)
    return int(random_date.timestamp())

Define function to perform scraping of single game listing, using the following steps
1. Scrape listing
   - Try to get the listing title
     - if this is not possible, then the listing is either for a collection of games (won't scrape this) or is displaying content that we have not seen before (consider this an error and skip)
       - if it is an errror, then assign attributes to this listing with `None` in all expected columns
     - if this is possible, then call the `BeautifulSoup` custom helper module imported above and use it to scrape attributes from the listing
2. Export scraped attributes to CSV
3. Print time spent scraping

In [8]:
def scrape_listing_requests(soup, listing_num, page_num, raw_data_dir, url):
    """Scrape a single listing with the requests library."""
    print(f"Starting with listing {listing_num}")
    start_time = time.time()
    # 1. Scrape
    try:
        details_block = soup.find("div", {"id": "genresAndManufacturer"})
        game_title = (
            details_block.text.lower()
            .split("\ngenre: ")[0]
            .split("title: ")[-1]
            .title()
        )
        game_title = re.sub(r"\W+", "", game_title.replace(" ", "_"))
        print(f"Scraped game title for listing {listing_num} ({game_title})")
        try:
            listing_details = bsh.scrape_game_listing(soup)
            print(f"Scraped listing {listing_num}")
        except Exception:
            listing_details = dict_failed_extraction_from_listing_page()
            print(f"Error with listing {listing_num}. Used failure record.")
    except Exception:
        listing_details = dict_failed_extraction_from_listing_page()
        # Check if the listing is a collection
        try:
            collection_text = soup.find("h2", {"class": "no_margin"}).text.lower()
            game_title = re.sub(r"\W+", "", game_title.replace(" ", "_"))
            exp_collection_text = "items included in this package"
            if collection_text == exp_collection_text:
                game_title = soup.find("h2", {"class": "pageheader"}).text
                print(
                    "Listing is collection of games. Used failure record. "
                    "Skipped page scrape."
                )
            else:
                game_title = "Unknown"
                print(
                    f"For included items, got {collection_text}. "
                    "Used failure record."
                )
        except Exception:
            print("Unknown reason for error.")

    # 2. Export to CSV
    fname = f"p{page_num}_l{listing_num}_{game_title}.csv"
    df_listing_details = (
        pd.DataFrame.from_records([listing_details])
        .assign(page_num=page_num)
        .assign(listing_num=listing_num)
        .assign(url=url)
    )
    # display(df_listing_details)
    listing_filepath = os.path.join(requests_data_dir, fname)
    df_listing_details.to_csv(listing_filepath, index=False)
    print(f"Exported listing attributes for {game_title} to CSV file")
    
    # 3. Print duration
    duration = time.time() - start_time
    print(f"Done with page {page_num} listing {listing_num} in {duration:.3f} sec.")

<a id="load-relevant-columns-from-search-results"></a>

## 2. [Load relevant columns from search results](#load-relevant-columns-from-search-results)

We'll start by loading the CSV file of listings to be scraped. Recall that this was created by `5_filter_requests_listings.ipynb`

In [9]:
%%time
df = pd.read_csv(requests_listings_to_scrape_filepath)
display(df.head(5).append(df.tail(5)))

Unnamed: 0,page,listing_counter,title,url
0,50,1,Blazing_Sails,https://store.steampowered.com/app/1158940/Bla...
1,50,2,LEGO_Harry_Potter_Years_14,https://store.steampowered.com/app/21130/LEGO_...
2,50,3,Source_of_Madness,https://store.steampowered.com/app/1315610/Sou...
3,50,4,Roll,https://store.steampowered.com/app/1585910/Roll/
4,50,5,Cloud_Gardens,https://store.steampowered.com/app/1372320/Clo...
45994,2232,7,CasterLords,https://store.steampowered.com/app/812780/Cast...
45995,2232,8,Animal_Shelter,https://store.steampowered.com/app/1239320/Ani...
45996,2232,9,Ekko__A_Thiefs_Melody,https://store.steampowered.com/app/1344990/Ekk...
45997,2232,11,Animal_Shelter_Prologue,https://store.steampowered.com/app/1661260/Ani...
45998,2232,12,Dynasty_of_the_Sands,https://store.steampowered.com/app/1143070/Dyn...


CPU times: user 77.8 ms, sys: 24.7 ms, total: 102 ms
Wall time: 102 ms


**Notes**
1. These are the vertically concatenated search results. Each page of search results has the same page number (see the `page` column) and a different `listing_counter`. Duplicated listings are excluded from this `DataFrame` so each URL is unique.

<a id="scrape"></a>

## 3. [Scrape](#scrape)

We'll now iterate over each row of the `DataFrame` for the CSV file, retrieve HTML from the URL column and scrape the listing attributes. We will scrape each listing on a single page of search results (from the `page` column in the above `DataFrame`) and then pause before scraping listings from the next page. This willl be repeated for every page in the list of pages (`pages_to_scrape`) to be scraped (defined in the [User Inputs](#user-inputs) section earlier).

The steps in this process are as follows
1. Get listing page number, URL and title
2. Send a GET request to get HTML for each URL
   - as part of this, get a random
     - header from the list of headers above
     - birth date from the helper function defined above to generate this date
3. Scrape HTML to get listing attributes
4. Pause before sending next GET request
5. Pause if at end of a single page from the CSV file

In [10]:
%%time
# Iterate over all listings belonging to a single page of search
# results (in the above DataFrame)
for page_to_scrape in pages_to_scrape:
    for idx, row in df.query(
        f"page == {page_to_scrape}"
    ).reset_index(drop=True).iterrows():
        # Get listing URL, page number and title
        page_num = row["page"]
        url = row["url"]
        listing_num = row["listing_counter"]

        # 2. Send GET request
        # # Retrieve random header
        headers = choice(headers_list)
        # # Create random birth date
        cookies["birthtime"] = str(random_human_readable_timestamp_to_unix())
        start_time = time.time()
        # # Send GET request
        with requests.Session() as r:
            response = r.get(
                url=url,
                cookies=cookies,
                headers=headers,
            )
        print(
            f"Received response to request from URL for "
            f"page {page_num} listing {listing_num} ({os.path.basename(url.rstrip('/'))})"
        )

        # 3. Scraping code
        soup = BeautifulSoup(response.content, "html.parser")
        # print(soup.prettify())
        scrape_listing_requests(soup, listing_num, page_num, requests_data_dir, url)

        time_spent = time.time() - start_time
        print(f"Time spent processing = {time_spent:.2f} seconds.")

        # 4. Pause between listings
        inter_listing_pause = uniform(
            min_pause_between_listings, max_pause_between_listings
        )
        print(f"Pausing for {inter_listing_pause:.2f} seconds...", end="")
        time.sleep(inter_listing_pause)
        print("done.\n")

    # 5. Pause between pages
    inter_page_pause = uniform(min_pause_between_pages, max_pause_between_pages)
    print(
        f"Reached end of page {page_to_scrape}. "
        f"Pausing for {inter_page_pause:.2f} seconds between pages...",
        end="",
    )
    if page_to_scrape < max(pages_to_scrape):
        time.sleep(inter_page_pause)
    print("done.\n")

Received response to request from URL for page 471 listing 1 (Crazy_Stone_Deep_Learning_The_First_Edition)
Starting with listing 1
Scraped game title for listing 1 (Crazy_Stone_Deep_Learning_The_First_Edition)
Scraped listing 1
Exported listing attributes for Crazy_Stone_Deep_Learning_The_First_Edition to CSV file
Done with page 471 listing 1 in 0.073 sec.
Time spent processing = 0.58 seconds.
Pausing for 3.11 seconds...done.

Received response to request from URL for page 471 listing 2 (Dark_Zone_Defense)
Starting with listing 2
Scraped game title for listing 2 (Dark_Zone_Defense)
Scraped listing 2
Exported listing attributes for Dark_Zone_Defense to CSV file
Done with page 471 listing 2 in 0.074 sec.
Time spent processing = 0.59 seconds.
Pausing for 4.71 seconds...done.

Received response to request from URL for page 471 listing 3 (A_Trip_to_Yugoslavia_Directors_Cut)
Starting with listing 3
Scraped game title for listing 3 (A_Trip_To_Yugoslavia_DirectorS_Cut)
Scraped listing 3
Export

<a id="examine-collected-outputs"></a>

## 4. [Examine collected outputs](#examine-collected-outputs)

Get a list of the created CSV filepaths

In [11]:
csv_file_list = glob(os.path.join(requests_data_dir, "p*_l*_*.csv"))
csv_file_list.sort(key=lambda f: int(re.sub("\D", "", f)))
print(f"Found {len(csv_file_list)} CSV files of listings.")

Found 4256 CSV files of listings.


Vertically concatenate all CSV files into a single DataFrame

In [12]:
%%time
df_listings = pd.concat(
    [pd.read_csv(f) for f in csv_file_list],
    ignore_index=True,
).drop_duplicates()
cols_to_hide = ["user_defined_tags", "languages"]
display(df_listings.head(2).append(df_listings.tail(2)).drop(columns=cols_to_hide))
display(
    df_listings.isna().sum().rename("num_missing").to_frame().merge(
        df_listings.dtypes.rename("dtype").to_frame(),
        left_index=True,
        right_index=True,
    )
)

Unnamed: 0,review_type_all,overall_review_rating,pct_overall,pct_overall_threshold,pct_overall_lang,pct_overall_threshold_lang,platforms,num_steam_achievements,drm,rating,...,Genre,Release Date,Early Access Release Date,Developer,Publisher,Franchise,num_languages,page_num,listing_num,url
0,,Very Positive,90.0,positive,,,win,,,,...,"Action, Casual, Indie, Simulation, Strategy, E...","Mar 23, 2020","Mar 23, 2020",Nezon Production,Nezon Production,,2,53,1,https://store.steampowered.com/app/1216320/Shi...
1,,Very Positive,93.0,positive,,,win,,"Requires agreement to a 3rd-party EULA, Hot La...",,...,"Action, Adventure, Indie, Racing","Sep 19, 2019",,Klei Entertainment,Klei Entertainment,Klei Entertainment,14,53,2,https://store.steampowered.com/app/382560/Hot_...
4254,,Mostly Negative,48.0,positive,,,win,,,e,...,,"Jan 19, 2012",,SEGA,SEGA,Sonic the Hedgehog,6,185,15,https://store.steampowered.com/app/202530/Soni...
4255,,Mixed,69.0,positive,,,"win, mac, linux",,,,...,,"Apr 5, 2018",,"Nova B12, OtakuNovel",Nova B12,,2,407,24,https://store.steampowered.com/app/825550/Blue...


Unnamed: 0,num_missing,dtype
review_type_all,4256,float64
overall_review_rating,5,object
pct_overall,7,float64
pct_overall_threshold,7,object
pct_overall_lang,4256,float64
pct_overall_threshold_lang,4256,float64
platforms,25,object
user_defined_tags,0,object
num_steam_achievements,4256,float64
drm,3705,object


CPU times: user 13.4 s, sys: 192 ms, total: 13.6 s
Wall time: 13.6 s


Verify that the number of unique URLs matches the number of rows in this dataset (this means the URL column should be unique in the `DataFrame`, since we are expecting this)
- consider page numbers 100 onwards since pages 50-100 included some duplicate results that appeared after page 100 (more detailed discussion is given in `6_merge_searches_listings.ipynb`)

In [13]:
try:
    assert df_listings[df_listings["page_num"] >= 100]["url"].nunique() == len(df_listings[df_listings["page_num"] >= 100])
except AssertionError:
    print("Found duplicated URL.")
    print(
        f"Number of unique URLs = {df_listings[df_listings['page_num'] >= 100]['url'].nunique()}\n"
        f"Number of rows = {len(df_listings[df_listings['page_num'] >= 100])}"
    )

**Observations**
1. As expected, the URL column is unique in this dataset and there are as many URLs as rows in the `DataFrame`.

---

<span style="float:left">
    <a href="./4_filter_requests_listings.ipynb"><< 4 - Filter Search Results scraped with requests library</a>
</span>

<span style="float:right">
    <a href="./6_merge_searches_listings.ipynb">6 - Merge scraped Search Results Dataset with Listings Dataset >></a>
</span>