In [1]:
import os
import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

## 1. WalkHighlands

In [2]:
def get_robots(url):
    """Fetch `url` with a plain GET request and print the response body."""
    response = requests.get(url)
    body = response.text
    print(body)

In [3]:
# Request headers passed to the page requests below; the user-agent carries a contact address
MY_HEADER = {'user-agent': 'Filip Balucha (s1913040@ed.ac.uk)'}
# Base URL of the site; page paths are appended to it
URL_WALKHIGHLANDS = 'https://www.walkhighlands.co.uk/'

In [4]:
# Check robots.txt
# Prints the site's robots.txt so the disallowed paths can be reviewed before scraping
get_robots(URL_WALKHIGHLANDS + 'robots.txt')

User-agent: *
Disallow: /admanage/
Disallow: /sending/

User-agent: magpie-crawler
Disallow: /


### Scrape the webpage with a list of most climbed munros

In [5]:
# Get HTML
# Request the most-climbed munros page, sending MY_HEADER with the request
url = URL_WALKHIGHLANDS + 'munros/most-climbed'
r = requests.get(url, headers=MY_HEADER)
# Display the HTTP status code as the cell output
r.status_code

200

In [6]:
# Parse response
# Raw response bytes are handed to BeautifulSoup with Python's built-in 'html.parser'
soup = BeautifulSoup(r.content, 'html.parser')

In [7]:
# Parse the list of munros
# Each table row becomes one dict with the munro's name, its ascent count and
# the href of its subpage.
munros = []
munro_elems = soup.find_all('tr')
# Row positions that hold table headers rather than munros. These are
# hard-coded for the page as it was scraped and must be re-checked if the
# page layout changes.
header_indices = [0, 142]
for i, munro_elem in enumerate(munro_elems):
    if i in header_indices:  # ignore table headers
        continue
    # The first link in the row carries both the display name and the href
    name = str(munro_elem.a.contents[0])
    href = munro_elem.a['href']
    # The ascent count is the text of the last cell in the row
    ascents = int(munro_elem.find_all('td')[-1].contents[0])
    munros.append({
        'name': name,
        'ascent_count': ascents,
        'href': href
    })

### Scrape individual munro subpages

Download subpage HTMLs to prevent repeated requests

In [8]:
def download(munros):
    """Download the subpage of every munro in `munros` into the cache/ directory.

    Each item must be a dict with a 'name' key (used for the log message and
    the cache file name) and an 'href' key (appended to the munros/ URL).
    A response with a status code other than 200 is reported and not stored.
    Nothing is returned.
    """
    # Make sure the target directory exists before the first write
    os.makedirs('cache', exist_ok=True)
    for munro in munros:
        name = munro["name"]
        print(f'Downloading: {name}')
        url = URL_WALKHIGHLANDS + 'munros/' + munro['href']
        r = requests.get(url, headers=MY_HEADER)
        if r.status_code == 200:
            # Store HTML in cache/
            # NOTE(review): the munro name is used verbatim as the file name;
            # a name containing a path separator would break this path.
            with open(f'cache/{name}.html', 'wb') as out:
                out.write(r.content)
            print('Success!\n')
        else:
            print(f'Error: request to {url} returned status code {r.status_code}')
        # Pause after every request, failed ones included, to keep the
        # request rate low
        sleep(10)

In [9]:
# Work out which munro subpages still need downloading.
# On a first run the cache directory does not exist yet, so create it
# instead of letting os.listdir fail.
os.makedirs('cache', exist_ok=True)
cached = os.listdir('cache')
# A munro is already cached when a file named '<name>.html' exists in cache/
to_download = [munro for munro in munros if f'{munro["name"]}.html' not in cached]

In [10]:
# Fetch only the subpages that are not in the cache yet
download(to_download)

Determine accommodation types

In [11]:
def get_accom_hrefs(soup):
    """Return the hrefs of the accommodation links on a munro subpage.

    The function locates the element whose first child is the text
    'Walker-friendly accommodation in the area', skips the node directly
    after it, and then walks the consecutive <p> siblings, collecting the
    href of the first link in each. Paragraphs without a link (or whose link
    has no href) are ignored.

    Raises:
        ValueError: if the heading element is not present in `soup`.
    """
    ACCOM_TEXT = 'Walker-friendly accommodation in the area'
    accom_elem = soup.find(lambda tag: tag.contents and tag.contents[0] == ACCOM_TEXT)
    if accom_elem is None:
        raise ValueError(f'Accommodation heading not found: {ACCOM_TEXT!r}')

    # Skip the node directly after the heading (a newline element)
    current = accom_elem.next_sibling
    if current is not None:
        current = current.next_sibling

    hrefs = []
    # Stop at the first sibling that is not a <p>, or when the siblings run out
    while current is not None and getattr(current, 'name', None) == 'p':
        link = current.a
        if link is not None and link.get('href'):
            hrefs.append(link['href'])
        current = current.next_sibling
    return hrefs

In [12]:
def _get_accom_type(href):
    # The hrefs look like:
    # /lochlomond/cottages_drymen.shtml -> cottages
    # /lochlomond/hostels.shtml -> hostels
    accom_type = href.split('.')[0]  # remove file suffix
    accom_type = accom_type.split('/')  # remove subpage
    accom_type = accom_type[2]
    accom_type = accom_type.split('_')[0]
    return accom_type

In [13]:
# Collect every accommodation type that appears on the cached munro subpages
accom_types = set()
skipped = []  # munro subpages that could not be parsed
for munro in munros:
    name = munro['name']
    html_file = f'./cache/{name}.html'
    with open(html_file) as f:
        contents = f.read()
    # NOTE(review): 'html' is a feature name rather than a specific parser, so
    # the parser actually used depends on what is installed; the most-climbed
    # page is parsed with 'html.parser'.
    soup = BeautifulSoup(contents, 'html')
    try:
        hrefs = get_accom_hrefs(soup)
        accom_types.update(map(_get_accom_type, hrefs))
    except Exception:
        # Best effort: remember the page and carry on, but let
        # KeyboardInterrupt / SystemExit propagate
        skipped.append(name)

print(skipped)
print(accom_types)

['Sgorr nam Fiannaidh (Aonach Eagach)']
{'bedandbreakfast', 'cottages', 'hotels', 'hostels'}


Parse munro subpages

In [14]:
# Extract details from a response object
def _extract_munro_details(soup):
    details = {}
    # Extract rating
    rating_str = soup.find('strong', itemprop='ratingValue').contents[0]
    rating = float(rating_str)
    details['rating'] = rating
    # Extract altitude
    altitude_str = soup.findAll('p')[5].contents[0]
    altitude = int(re.sub(r'\D', '', altitude_str))  # extract integer from string
    details['altitude'] = altitude
    # Extract accommodation count # TODO
    # accom_count = _get_accom_count(soup)
    # details['accom_count'] = accom_count
    
    return details

In [15]:
def parse_munro_subpage(url):
    """Fetch a munro subpage and return its parsed details.

    Returns the dict produced by `_extract_munro_details`, or None (after
    printing an error) when the response status code is not 200.
    """
    r = requests.get(url, headers=MY_HEADER)
    if r.status_code != 200:
        print(f'Error: request to {url} returned status code {r.status_code}')
        return
    # Parse response: _extract_munro_details needs a parsed document, not the
    # raw Response object. 'html' matches how the cached subpages are parsed.
    soup = BeautifulSoup(r.content, 'html')
    details = _extract_munro_details(soup)
    return details


In [16]:
# Get accommodation hrefs
# NOTE: `soup` here is whatever page was parsed last by an earlier cell.
# Reuse the helper instead of repeating its sibling-walking logic inline.
hrefs = get_accom_hrefs(soup)

In [17]:
# Set up cache and download accommodation HTMLs

In [20]:
# Add the rating and altitude parsed from each cached subpage to its munro record
for munro in munros:
    name = munro['name']
    html_file = f'./cache/{name}.html'
    with open(html_file) as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'html')
    munro.update(_extract_munro_details(soup))

In [23]:
# One row per munro record; columns come from the dict keys
df = pd.DataFrame(munros)
df.head()

Unnamed: 0,name,ascent_count,href,rating,altitude
0,Ben Lomond,20531,ben-lomond,3.8,974
1,Ben Nevis,17892,ben-nevis,3.9,1345
2,Ben Lawers,16063,ben-lawers,3.9,1214
3,Schiehallion,15924,schiehallion,3.6,1083
4,Beinn Ghlas,15646,beinn-ghlas,3.4,1103


In [25]:
# The href was only needed for scraping. errors='ignore' keeps the cell
# safe to re-run after the column is already gone.
df = df.drop(columns=['href'], errors='ignore')

In [27]:
# Write the cleaned table to disk; index=False leaves the row index out of the file
df.to_csv('datasets/clean_v1.csv', index=False)

In [None]:
# TODO
# URGENT
# Go through each HTML
    # Parse HTML to get altitude and rating
    # -> df -> CSV

# LESS URGENT
# Create a cache: accom_hyperlink -> accom_count
# Go through each HTML
    # Parse HTML to get accommodation hyperlinks
        # For each hyperlink, try to fetch data from cache, else compute and update cache
        # Note: Sgorr nam Fiannaidh (Aonach Eagach) caused trouble, so make sure that works properly (weird header span)
        # If there is no hyperlink for some accommodation, store 0

# FOR LATER
# Join with 2nd DB