In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bs4
import re
import time
import logging
import json

from undetected_chromedriver import Chrome, ChromeOptions
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Preconfig

In [3]:
# Dedicated logger for the scraping pipeline. Because it does not propagate to
# the root logger it needs a handler of its own: with none attached, records
# below WARNING are dropped and the DEBUG messages of this notebook never show.
logger = logging.getLogger("etl_logger")
logger.setLevel(logging.DEBUG)
logger.propagate = False
if not logger.handlers:  # guard: re-running this cell must not stack duplicate handlers
    _etl_handler = logging.StreamHandler()
    _etl_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
    logger.addHandler(_etl_handler)

# Installing Packages

In [4]:
!pip install selenium undetected-chromedriver

Defaulting to user installation because normal site-packages is not writeable


# Extraction

In [6]:
def extract_list_videogames(browser, list_number=1):

    """Fetch the SteamDB charts page and return it as parsed HTML.

    Parameters
    ----------
    browser : WebDriver-like object
        Must provide ``get``, ``page_source`` and ``quit``. It is always closed
        before this function returns, so it cannot be reused afterwards.
    list_number : int, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed page, or ``None`` when loading or parsing failed (the error
        is logged at DEBUG level instead of being raised).
    """

    try:
        browser.get('https://steamdb.info/charts/')
        time.sleep(10)  # fixed pause before reading the source; presumably lets the page finish loading
        html = browser.page_source
        return bs4.BeautifulSoup(html, "html.parser")
    except Exception as e:
        # The exception has to go through a %s placeholder: passing it as a bare
        # extra argument makes logging fail while formatting the record.
        logger.debug("Hubo un problema: %s", e)
        return None
    finally:
        # Single cleanup point for both the success and the failure path.
        browser.quit()


In [23]:
def get_game_prices(game_url,browser,keep_browser):

    """Open a game's SteamDB page, select the "us" region and return the parsed page.

    Parameters
    ----------
    game_url : str
        Path appended to ``https://steamdb.info``.
    browser : WebDriver-like object
        Browser used to load the page.
    keep_browser : bool
        When true the page is opened in a new tab, which then becomes the
        active window; otherwise it is loaded in the current tab.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed page source after the currency selector has been used.

    Notes
    -----
    Blocks for 30 to 90 seconds after opening the page, plus 5 seconds after
    each of the two clicks. Errors from ``find_element`` are not handled here.
    """

    if not keep_browser:
        browser.get('https://steamdb.info'+game_url)
    else:
        browser.execute_script(f"window.open('https://steamdb.info{game_url}', '_blank');")
        newest_tab = browser.window_handles[-1]
        browser.switch_to.window(newest_tab)

    # Randomised pause so consecutive requests are not evenly spaced.
    time.sleep(30+np.random.random()*60)

    # Set the "__Host-cc" cookie to "us" and reload the page.
    browser.add_cookie({"name":"__Host-cc", "value":"us"})
    browser.refresh()

    # First open the currency selector, then pick the "us" entry; pause after each click.
    click_sequence = (
        (By.ID,'js-currency-selector'),
        (By.CSS_SELECTOR,'button[data-cc="us"]'),
    )
    for locator in click_sequence:
        browser.find_element(*locator).click()
        time.sleep(5)

    return bs4.BeautifulSoup(browser.page_source, "html.parser")

# Transform

In [8]:
def process_country_prices(soup):

    """Turn the price table of a game page into ``(country, price)`` pairs.

    Countries are read from the ``td`` cells carrying a ``data-cc`` attribute and
    prices from the ``td`` cells with class ``table-prices-converted``, both
    inside the first ``div.table-responsive`` of ``soup``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed game page.

    Returns
    -------
    list of tuple
        ``(country, price)`` pairs. ``price`` is the numeric part of the cell
        kept as a string (decimals included, e.g. ``"59.99"``), or ``np.nan``
        when the cell holds no number (for example ``"N/A"``). Pairing is
        positional and stops at the shorter of the two cell lists.

    Raises
    ------
    AttributeError
        If the page has no ``div.table-responsive``.
    """

    # Look the table up once instead of once per column.
    price_table = soup.find("div",class_="table-responsive")
    country_tags_raw = price_table.find_all("td",attrs={"data-cc":True})
    price_tags_raw = price_table.find_all("td",class_="table-prices-converted")

    prices_processed = []
    for tag in price_tags_raw:
        cell_text = tag.text.strip()
        if "%" in cell_text:
            # Cells containing a percent sign are skipped (presumably the
            # price-difference column, which shares the same class).
            continue
        # Accept "." as well as "," as a separator so that "$59.99" is not cut
        # down to "59"; a cell without digits becomes NaN instead of raising
        # IndexError.
        number = re.search(r"[0-9]+(?:[.,][0-9]+)*", cell_text)
        prices_processed.append(number.group(0) if number else np.nan)

    countries_processed = [tag.text.strip() for tag in country_tags_raw]
    countries_prices_processed = list(zip(countries_processed, prices_processed))
    return countries_prices_processed

In [9]:
# Load the charts page in a throwaway browser (extract_list_videogames closes it).
options = ChromeOptions()
options.add_argument("--disable-popup-blocking") # disables popup blocking for being able to open a new tab
browser = Chrome(options=options)
videogame_list_soup = extract_list_videogames(browser)
if videogame_list_soup is None:
    # extract_list_videogames returns None when the page could not be fetched;
    # fail here with a clear message rather than with an AttributeError below.
    raise RuntimeError("Could not load the SteamDB charts page; no game URLs to process")
videogame_urls_tags = videogame_list_soup.find_all("a", href=lambda href: href and href.startswith("/app/"))
# Keep only the "/app/<id>" prefix of every link. A regex is used because
# str.rstrip("/charts/") strips a *set of characters*, not the suffix.
# dict.fromkeys removes duplicates while keeping page order, so the list (and
# everything indexed by it later) is the same on every run.
app_url_matches = (re.match(r"/app/\d+", tag["href"].strip()) for tag in videogame_urls_tags)
videogame_urls_processed = list(dict.fromkeys(found.group(0) for found in app_url_matches if found))

  browser = Firefox(firefox_binary="C:\Program Files\Mozilla Firefox\\firefox.exe")


In [26]:
def _new_browser():
    """Start a Chrome instance with popup blocking disabled so new tabs can be opened."""
    # A fresh ChromeOptions object is built for every browser instance.
    options = ChromeOptions()
    options.add_argument("--disable-popup-blocking")
    # use_subprocess is necessary for bypassing anti-scraping measures
    return Chrome(use_subprocess=True,options=options)


# Maps the position of a game in videogame_urls_processed (as a string) to its
# list of (country, price) pairs. Games whose page could not be read are absent.
games_dict = {}
browser = _new_browser()
try:
    for i,url in enumerate(videogame_urls_processed):
        try:
            # The first game loads in the current tab; later games open in a new
            # tab so the Cloudflare human validation is not requested again.
            videogame_website_soup = get_game_prices(url,browser,keep_browser=(i != 0))
        except NoSuchElementException as e:
            # Raised for free games (no currency selector on the page): restart
            # the browser and move on. Skipping explicitly matters, otherwise the
            # soup of the previous game would be stored under this game's index.
            logger.debug(e)
            logger.debug("restarting webbrowser...")
            browser.quit()
            browser = _new_browser()
            continue
        if videogame_website_soup is not None: # only store games for which a page was returned
            games_dict[f"{i}"] = process_country_prices(videogame_website_soup)
finally:
    # Always release the browser, including when the loop stops on an error.
    browser.quit()
        

# Load

In [101]:
# Rearrange the scraped data so that countries become columns and games rows.
# games_dict maps the position of a game in videogame_urls_processed (stored as
# a string) to its list of (country, price) pairs.
prices_by_game = {int(position): dict(pairs) for position, pairs in games_dict.items()}
scraped_positions = sorted(prices_by_game)

# Every country seen in any game, in first-seen order.
countries = list(dict.fromkeys(
    country for position in scraped_positions for country in prices_by_game[position]
))

# One list per country with exactly one entry per scraped game. A country that
# is missing for a game gets NaN, so values stay aligned with their game
# instead of shifting up or producing lists of different lengths.
country_games_value = {
    country: [prices_by_game[position].get(country, np.nan) for position in scraped_positions]
    for country in countries
}

# Label rows only with the games that were actually scraped; skipped games
# would otherwise make the index longer than the data.
games_id = ["https://store.steampowered.com"+videogame_urls_processed[position] for position in scraped_positions]
steam_prices_df = pd.DataFrame(country_games_value, index=games_id)

In [103]:
# Persist both results: the price table as a spreadsheet and, as a backup,
# the per-game (country, price) lists as JSON.
excel_output_path = "../Data/list_1_processed.xlsx"
json_output_path = "../Data/list_1_raw.json"

steam_prices_df.to_excel(excel_output_path)
with open(json_output_path, mode="w") as raw_output:
    json.dump(games_dict, fp=raw_output)