In [516]:
import os
import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

## 1. WalkHighlands

In [517]:
def get_robots(url, headers=None, timeout=30):
    """Fetch a robots.txt file and print its contents.

    Args:
        url: Full URL of the robots.txt file to fetch.
        headers: Optional dict of HTTP headers to send with the request
            (for example an identifying user-agent). When omitted, the
            defaults of the ``requests`` library are used.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot hang the notebook indefinitely.

    Returns:
        None. The response body is printed, not returned.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    print(r.text)

In [518]:
# HTTP headers passed to requests.get when fetching pages; the user-agent
# identifies who is running the scraper and how to contact them.
MY_HEADER = {'user-agent': 'Filip Balucha (s1913040@ed.ac.uk)'}
# Base URL of the site; page paths are appended to it by string concatenation.
URL_WALKHIGHLANDS = 'https://www.walkhighlands.co.uk/'

In [519]:
# Check robots.txt: print the site's crawler rules before scraping anything
get_robots(URL_WALKHIGHLANDS + 'robots.txt')

User-agent: *
Disallow: /admanage/
Disallow: /sending/

User-agent: magpie-crawler
Disallow: /


### Scrape the webpage with a list of most climbed munros

In [520]:
# Get HTML of the "most climbed" munros listing page
url = URL_WALKHIGHLANDS + 'munros/most-climbed'
r = requests.get(url, headers=MY_HEADER)
r.status_code  # shown as the cell output so the request result can be checked

200

In [521]:
# Parse response: build a searchable tree from the raw response bytes
soup = BeautifulSoup(r.content, 'html.parser')

In [522]:
# Parse the list of munros
# Every <tr> on the page is a candidate row. The rows at the positions listed
# in header_indices are skipped as table headers; these positions are
# hard-coded and presumably match the page layout at scrape time -- re-check
# them if the page structure changes.
header_indices = [0, 142]
munro_elems = soup.find_all('tr')

munros = []
for i, munro_elem in enumerate(munro_elems):
    if i in header_indices:  # ignore table headers
        continue
    link = munro_elem.a  # first link in the row
    name = str(link.contents[0])
    href = link['href']
    # The ascent count is read from the last cell of the row
    ascents = int(munro_elem.find_all('td')[-1].contents[0])
    munros.append({
        'name': name,
        'ascent_count': ascents,
        'href': href
    })

### Scrape individual munro subpages

Download each munro's subpage HTML into a local `cache/` directory so that the same page is not requested repeatedly.

In [529]:
def download(munros, delay=10, timeout=30):
    """Download each munro's subpage and store the raw HTML under ``cache/``.

    A request that fails (non-200 status or a request error) is reported and
    skipped; the remaining munros are still processed. The ``cache/``
    directory must already exist, since files are opened for writing
    inside it.

    Args:
        munros: Iterable of dicts with a 'name' key (used as the cache file
            name) and an 'href' key (appended to the munros base URL).
        delay: Seconds to pause between consecutive requests.
        timeout: Seconds to wait for a response before giving up.

    Returns:
        None.
    """
    for i, munro in enumerate(munros):
        if i > 0:
            # Pause before every request after the first, so the delay is
            # honoured even when the previous request failed.
            sleep(delay)
        name = munro["name"]
        print(f'Downloading: {name}')
        url = URL_WALKHIGHLANDS + 'munros/' + munro['href']
        try:
            r = requests.get(url, headers=MY_HEADER, timeout=timeout)
        except requests.RequestException as err:
            # Connection errors and timeouts should not abort the whole batch
            print(f'Error: request to {url} failed: {err}')
            continue
        if r.status_code != 200:
            print(f'Error: request to {url} returned status code {r.status_code}')
            continue
        # Store HTML in cache/
        with open(f'cache/{name}.html', 'wb') as out:
            out.write(r.content)
        print('Success!\n')

In [532]:
# Make sure the cache directory exists, so that listing it here (and writing
# into it later) also works on a fresh checkout.
os.makedirs('cache', exist_ok=True)
cached = os.listdir('cache')
cached_names = set(cached)  # set for constant-time membership tests
# Only munros without a cached HTML file still need to be downloaded
to_download = [munro for munro in munros if f'{munro["name"]}.html' not in cached_names]

In [534]:
# Fetch only the subpages that are not in the cache yet
download(to_download)

Helper functions for parsing munro subpages:

In [525]:
def parse_munro_subpage(url, timeout=30):
    """Fetch a munro subpage and extract its details.

    Args:
        url: Full URL of the munro subpage.
        timeout: Seconds to wait for a response before giving up, so an
            unresponsive host cannot hang the notebook indefinitely.

    Returns:
        The dict produced by ``_extract_munro_details`` for the response,
        or None if the server did not answer with status code 200.
    """
    r = requests.get(url, headers=MY_HEADER, timeout=timeout)
    if r.status_code != 200:
        print(f'Error: request to {url} returned status code {r.status_code}')
        return None
    # Parse response
    return _extract_munro_details(r)


In [526]:
# Extract details from a response object
def _extract_munro_details(r):
    # Parse response
    soup = BeautifulSoup(r.content, 'html.parser')
    details = {}
    # Extract rating
    rating_str = soup.find('strong', itemprop='ratingValue').contents[0]
    rating = float(rating_str)
    details['rating'] = rating
    # Extract altitude
    altitude_str = soup.findAll('p')[5].contents[0]
    altitude = int(re.sub(r'\D', '', altitude_str))  # extract integer from string
    details['altitude'] = altitude
    # Extract accommodation count
    accom_count = _get_accom_count(soup)
    details['accom_count'] = accom_count
    
    return details

In [527]:
# Get accommodation count
def _get_accom_count(soup):
    # Accommodation list is bounded by two headers:
    current = soup.findAll('h3')[1]
    end = soup.findAll('h3')[2]
    accom_count = 0
    # Walk the list between the two headers
    while current is not end:
        if current.name == 'p':  # hotels are stored in paragraph tags
            accom_count += 1
        current = current.next_sibling  # move to next element
    return accom_count

In [528]:
# for munro in munros:
#     print(f'Request for: {munro["name"]}')
#     url = URL_WALKHIGHLANDS + 'munros/' + munro['href']
#     details = parse_munro_subpage(url)
#     munro.update(details)  # add new details to munro
#     print('Success!')
#     sleep(10)

In [None]:
# TODO
# URGENT
# Go through each HTML
    # Parse HTML to get altitude and rating
    # -> df -> CSV

# LESS URGENT
# Create a cache: accom_hyperlink -> accom_count
# Go through each HTML
    # Parse HTML to get accommodation hyperlinks
        # For each hyperlink, try to fetch data from cache, else compute and update cache
        # Note: Sgorr nam Fiannaidh (Aonach Eagach) caused trouble, so make sure that works properly (weird header span)
        # If there is no hyperlink for some accommodation, store 0

# FOR LATER
# Join with 2nd DB