Simple page scraper. It repeatedly requests a single target address and stores the text of the page. The code is modular and sends its requests through a `FuturesSession` (parallel requests), so it can be scaled up to faster scraping across multiple pages. Easily modified.

As-is, the code grabs data from an Apex Legends status page to track player selection statistics. Collected over time, this should make an interesting time-series dataset for testing how gameplay changes and cosmetic items affect character choice.

In [None]:
import datetime as dt
import time
from requests_futures.sessions import FuturesSession
from bs4 import BeautifulSoup
from pathlib import Path

In [None]:
#TARGET PAGE: change this address to scrape a different page.
target_address = 'https://apexlegendsstatus.com/gamestats'

In [None]:
#Folder that receives the saved page text. It is created if missing
#and left untouched if it is already there.
p = Path('./stored_pages/')
p.mkdir(parents=False, exist_ok=True)

In [None]:
def get_page(address, timeout=30):
    '''Request a page and build a timestamped save name for it.

    The request is sent through the module-level "session", which is
    expected to be a FuturesSession instance and must exist before this
    function is called, i.e. "session = FuturesSession()". The returned
    future is resolved immediately, so each call blocks until its own
    response arrives (or the timeout expires).
    NOTE(review): the original comment states that a default
    FuturesSession allows up to 8 parallel calls -- confirm against the
    installed requests-futures version.

    Args:
        address: URL of the page to request.
        timeout: Seconds to wait on the server before giving up. It is
            forwarded to session.get() so that a stalled connection
            cannot block the caller forever. Pass None to wait
            indefinitely (the previous behaviour).

    Returns:
        A tuple (pagecont, pagetext, savename, responsetype):
        the response's .content, its .text, a file-name stem derived
        from the address plus the request time, and the status code.

    Raises:
        Any exception raised while resolving the request (for example a
        timeout or connection error) propagates out of .result().
    '''

    #Resolve the future once and reuse the response object.
    response = session.get(address, timeout=timeout).result()

    #Pull out just the content
    pagecont = response.content
    pagetext = response.text
    responsetype = response.status_code

    #Establish time scrape completed (minute resolution, local time)
    timestr = dt.datetime.now().strftime('%Y%m%d_%H%M')

    #Name sanitation for the save file: drop the scheme, then replace
    #path separators, dots and the characters that are not allowed in
    #Windows file names (as found in ports and query strings) with "_".
    unsafe_chars = str.maketrans({char: '_' for char in '/.\\:*?"<>|'})
    savename = address.replace('http://', '').replace('https://', '')
    savename = savename.translate(unsafe_chars)
    savename = savename + '_txt_' + timestr

    return pagecont, pagetext, savename, responsetype

In [None]:
def save_page(address):
    '''Fetch a page and store its text in the ./stored_pages/ folder.

    The page is requested with get_page(). When the response code is
    200, the text extracted from the HTML by BeautifulSoup is written,
    UTF-8 encoded, to "./stored_pages/<savename>.txt".

    Args:
        address: URL of the page to fetch and store.

    Returns:
        'Saved' when the file was written, otherwise
        'Response code <code>' and nothing is written.

    Raises:
        Exceptions from get_page() or from writing the file propagate
        to the caller.
    '''
    _, pagetext, savename, responsecode = get_page(address)

    #Save it if it loaded properly, otherwise skip the save (and skip
    #parsing a page that would be thrown away anyway).
    if responsecode != 200:
        return 'Response code {}'.format(responsecode)

    rawtext = BeautifulSoup(pagetext, 'html.parser').get_text()

    #Make sure the folder is there, so the save does not depend on the
    #setup cell having been run or on the folder surviving a long run.
    outdir = Path('./stored_pages/')
    outdir.mkdir(parents=True, exist_ok=True)

    outfile = outdir / '{}.txt'.format(savename)
    outfile.write_bytes(rawtext.encode('utf-8'))
    return 'Saved'

In [None]:
#Repeated call: Super simple. Just call, wait 5 minutes, call, repeat.
#10000 rounds with a 300 second pause is a bit more than 34 days.

#get_page() reads this module-level session, so create it before looping.
session = FuturesSession()

for n in range(10000):
    #Best effort on purpose: some data loss is fine and the page only
    #rarely fails to load, so a failed round is reported and skipped.
    #Only Exception is caught, so Ctrl-C (KeyboardInterrupt) still stops
    #the loop instead of being reported as a failed save.
    try:
        stat = save_page(target_address)
        print(stat)
    except Exception as err:
        #Include the reason so that repeated failures can be diagnosed.
        print('Did not save: {!r}'.format(err))
    time.sleep(300)

#Works fine running continuously, as long as your computer doesn't sleep.