In [1]:
import re
import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from time import sleep

# 1. WalkHighlands

In [2]:
# Base URL that every WalkHighlands subpage path is appended to
URL_WALKHIGHLANDS = 'https://www.walkhighlands.co.uk/'
# Headers sent with each request; the user-agent carries the scraper's contact details
MY_HEADER = {
    'user-agent': 'Filip Balucha (s1913040@ed.ac.uk)',
}

## Scrape the webpage with a list of most climbed munros

In [3]:
# Get the HTML of the "most climbed munros" listing page
url = URL_WALKHIGHLANDS + 'munros/most-climbed'
# An explicit timeout keeps a stalled connection from hanging the notebook forever
r = requests.get(url, headers=MY_HEADER, timeout=30)
r.status_code

200

In [4]:
# Build a parse tree from the response body using Python's built-in HTML parser
soup = BeautifulSoup(markup=r.content, features='html.parser')

In [5]:
# Build one record per table row of the listing page
header_indices = [0, 142]  # positions of rows that are table headers, not munros
munro_elems = soup.find_all('tr')
munros = []
for i, munro_elem in enumerate(munro_elems):
    if i not in header_indices:
        # the row's first link holds the munro name and the slug of its subpage
        name = str(munro_elem.a.contents[0])
        href = ''.join(('munros/', munro_elem.a['href'], '.html'))
        # the last cell of the row holds the number of recorded ascents
        ascents = int(munro_elem.find_all('td')[-1].contents[0])
        munros.append(dict(name=name, ascent_count=ascents, href=href))

## Scrape munro subpages

### 1. Cache subpage HTMLs to prevent repeated requests

In [6]:
def cache(subpage, delay=10, timeout=30):
    """Download a WalkHighlands subpage into the local ``cache`` directory.

    Nothing is requested when ``cache/<subpage>`` already exists. Otherwise the
    page is fetched from ``URL_WALKHIGHLANDS + subpage`` and, on a 200 response,
    its raw bytes are written to ``cache/<subpage>``. Non-200 responses are
    reported and nothing is stored.

    Args:
        subpage: Path of the page relative to the site root; also used as the
            relative path of the cached file.
        delay: Seconds to sleep after a request has been sent, whether it
            succeeded or not, to avoid hammering the server.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        None.
    """
    cached_path = os.path.join('cache', subpage)
    if os.path.isfile(cached_path):  # already cached, no request needed
        return
    print(f'Downloading: {subpage}')
    url = URL_WALKHIGHLANDS + subpage
    # Without a timeout a stalled connection would block the whole run
    r = requests.get(url, headers=MY_HEADER, timeout=timeout)
    if r.status_code != 200:
        print(f'Error: {r.status_code}')
    else:
        # Store HTML in cache, creating intermediate directories if necessary
        os.makedirs(os.path.dirname(cached_path), exist_ok=True)
        with open(cached_path, 'wb') as out:
            out.write(r.content)
        print('Success!\n')
    # Pause after every request that was actually sent, including failed ones
    sleep(delay)

In [7]:
# Fetch each munro subpage unless it is already cached
for munro in munros:
    subpage_href = munro['href']
    cache(subpage_href)

### 2. Extract details from munro subpages

In [8]:
def _parse_accom_type(accom_href):
    # Parse accommodation type and href from accommodation href, e.g.:
        # "/lochlomond/cottages_drymen.shtml" -> "cottages", "lochlomond/cottages_drymen.shtml"
        # "/lochlomond/hostels.shtml" -> "hostels", "lochlomond/hostels.shtml"
    accom_type = accom_href.split('.')[0]  # ignore suffix
    accom_type = accom_type.split('/')[2]  # ignore subpage
    accom_type = accom_type.split('_')[0]  # ignore geographical tag
    key = f'href_{accom_type}'
    val = accom_href.lstrip('/')
    return key, val 

def _get_accom_hrefs(soup):
    """Collect the accommodation links listed on a munro subpage.

    Finds the element whose first child is the text
    'Walker-friendly accommodation in the area', then walks the run of
    consecutive ``<p>`` siblings that follows it and reads the first link of
    each.

    Args:
        soup: Parsed munro subpage.

    Returns:
        Dict mapping ``"href_<type>"`` keys to relative accommodation hrefs,
        as produced by ``_parse_accom_type``.

    Raises:
        AttributeError: If the heading element cannot be found.
        TypeError: If one of the ``<p>`` elements contains no link.
    """
    ACCOM_TEXT = 'Walker-friendly accommodation in the area'
    accom_elem = soup.find(lambda tag: tag.contents and tag.contents[0] == ACCOM_TEXT)
    current = accom_elem.next_sibling.next_sibling  # skip newline element
    accom_hrefs = {}
    # accom hrefs are stored in p tags; `current` is None once the last sibling
    # has been consumed, which must end the walk instead of raising and
    # discarding the hrefs collected so far
    while current is not None and current.name == 'p':
        href = current.a['href']
        key, val = _parse_accom_type(href)
        accom_hrefs[key] = val
        current = current.next_sibling
    return accom_hrefs

In [9]:
def _extract_munro_details(soup):
    details = {}
    # Extract rating
    rating_str = soup.find('strong', itemprop='ratingValue').contents[0]
    rating = float(rating_str)
    details['rating'] = rating
    # Extract rating count
    rating_count_str = soup.find(itemprop='ratingCount').contents[0]
    rating_count = int(rating_count_str)
    details['rating_count'] = rating_count
    # Extract altitude
    altitude_str = soup.find_all('p')[5].contents[0]
    altitude = int(re.sub(r'\D', '', altitude_str))  # extract integer from string
    details['altitude'] = altitude
    # Extract accommodation hrefs
    try:
        accom_hrefs = _get_accom_hrefs(soup)
        details.update(accom_hrefs)
    except:  # subpage could not be parsed
        print(f'Skipping {munro["name"]}')
    
    return details

In [10]:
# Enrich every munro record with the details parsed from its cached subpage
for munro in munros:
    subpage = munro['href']
    html_file = os.path.join('cache', subpage)
    # The cache is written in binary mode, so read the raw bytes back and let
    # BeautifulSoup detect the encoding instead of relying on the platform default
    with open(html_file, 'rb') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'html')
    details = _extract_munro_details(soup)
    munro.update(details)

Skipping Sgorr nam Fiannaidh (Aonach Eagach)


### 3. Handle munros with faulty HTML

In [11]:
faulty_munro_name = 'Sgorr nam Fiannaidh (Aonach Eagach)'
# Accommodation hrefs entered by hand for the munro that was skipped during parsing
manual_accom_hrefs = {
    'href_hotels': 'fortwilliam/hotels_glencoe.shtml',
    'href_bedandbreakfast': 'fortwilliam/bedandbreakfast_glencoe.shtml',
    'href_cottages': 'fortwilliam/cottages_glencoe.shtml',
    'href_hostels': 'fortwilliam/hostels_glencoe.shtml',
}
for munro in munros:
    if munro['name'] != faulty_munro_name:
        continue
    munro.update(manual_accom_hrefs)
    break  # only the first matching record is patched

In [28]:
# Build a DataFrame with one row per munro record
df = pd.DataFrame(munros)
df.head()

Unnamed: 0,name,ascent_count,href,rating,rating_count,altitude,href_hotels,href_bedandbreakfast,href_cottages,href_hostels
0,Ben Lomond,20532,munros/ben-lomond.html,3.8,317,974,lochlomond/hotels_drymen.shtml,lochlomond/bedandbreakfast_drymen.shtml,lochlomond/cottages_drymen.shtml,lochlomond/hostels.shtml
1,Ben Nevis,17892,munros/ben-nevis.html,3.9,253,1345,fortwilliam/hotels.shtml,fortwilliam/bedandbreakfast.shtml,fortwilliam/cottages.shtml,fortwilliam/hostels.shtml
2,Ben Lawers,16063,munros/ben-lawers.html,3.9,231,1214,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml
3,Schiehallion,15926,munros/schiehallion.html,3.6,267,1083,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml
4,Beinn Ghlas,15646,munros/beinn-ghlas.html,3.4,193,1103,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml


## Compute accommodation counts

### 1. Fix region specification for faulty hrefs

In [29]:
# "arisaig" should be "mallaig" as is clear from here: https://www.walkhighlands.co.uk/fortwilliam/bedandbreakfast.shtml
# "affric" should be "glenaffric" as is clear from here: https://www.walkhighlands.co.uk/lochness/hotels_glenaffric.shtml

# Map each href with a wrong region tag to the href that exists on the site
href_fixes = {
    'fortwilliam/bedandbreakfast_arisaig.shtml': 'fortwilliam/bedandbreakfast_mallaig.shtml',
    'fortwilliam/cottages_arisaig.shtml': 'fortwilliam/cottages_mallaig.shtml',
    'fortwilliam/hotels_arisaig.shtml': 'fortwilliam/hotels_mallaig.shtml',
    'lochness/bedandbreakfast_affric.shtml': 'lochness/bedandbreakfast_glenaffric.shtml',
    'lochness/cottages_affric.shtml': 'lochness/cottages_glenaffric.shtml',
    'lochness/hotels_affric.shtml': 'lochness/hotels_glenaffric.shtml',
}
df.replace(to_replace=href_fixes, inplace=True)

### 2. Cache accommodation subpages

In [30]:
# Collect the unique accommodation hrefs across all four href columns
accom_cols = ['href_hotels', 'href_bedandbreakfast', 'href_cottages', 'href_hostels']
accom_hrefs = np.unique(df[accom_cols].to_numpy().ravel())

# Fetch each accommodation page unless it is already cached
for accom_href in accom_hrefs:
    cache(accom_href)

### 3. Get accommodation counts

In [31]:
# TODO: handle infinite scroll
# TODO: handle regions whose listing is not specific enough
# Memo table: accommodation href -> number of accommodations listed under that href
accom_counts = dict()

def _extract_accom_count(accom_href):
    """Count the accommodation entries on a cached accommodation page.

    Args:
        accom_href: Relative href of the page; the file is read from
            ``cache/<accom_href>``.

    Returns:
        Number of ``<div class="blockadv">`` elements found in the page.

    Raises:
        OSError: If the cached file cannot be opened.
    """
    html_file = os.path.join('cache', accom_href)
    # Read raw bytes so BeautifulSoup detects the page encoding itself instead of
    # the file being decoded with the platform default encoding
    with open(html_file, 'rb') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'html')
    accom_ads = soup.find_all("div", {"class": "blockadv"})
    return len(accom_ads)

def get_accom_count(accom_href):
    """Return the accommodation count for ``accom_href``, computing it at most once.

    Results are memoised in the module-level ``accom_counts`` dict, so each
    cached page is parsed only the first time its href is requested.
    """
    if accom_href in accom_counts:
        return accom_counts[accom_href]
    count = _extract_accom_count(accom_href)
    accom_counts[accom_href] = count
    return count

In [32]:
accom_count_cols = ['hotel_count', 'bb_count', 'cottage_count', 'hostel_count']
# Element-wise lookup of the count for every href. Done column by column with
# Series.map because DataFrame.applymap is deprecated in recent pandas releases,
# while this form works on old and new versions alike.
df[accom_count_cols] = df[accom_cols].apply(lambda col: col.map(get_accom_count))
df.head()

Unnamed: 0,name,ascent_count,href,rating,rating_count,altitude,href_hotels,href_bedandbreakfast,href_cottages,href_hostels,hotel_count,bb_count,cottage_count,hostel_count
0,Ben Lomond,20532,munros/ben-lomond.html,3.8,317,974,lochlomond/hotels_drymen.shtml,lochlomond/bedandbreakfast_drymen.shtml,lochlomond/cottages_drymen.shtml,lochlomond/hostels.shtml,12,8,14,7
1,Ben Nevis,17892,munros/ben-nevis.html,3.9,253,1345,fortwilliam/hotels.shtml,fortwilliam/bedandbreakfast.shtml,fortwilliam/cottages.shtml,fortwilliam/hostels.shtml,30,30,30,18
2,Ben Lawers,16063,munros/ben-lawers.html,3.9,231,1214,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml,30,30,30,5
3,Schiehallion,15926,munros/schiehallion.html,3.6,267,1083,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml,30,30,30,5
4,Beinn Ghlas,15646,munros/beinn-ghlas.html,3.4,193,1103,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml,30,30,30,5


In [55]:
# Show counts of exactly 30 as '30+'
# NOTE(review): 30 looks like the number of entries loaded before infinite scroll kicks in — confirm
df[accom_count_cols] = df[accom_count_cols].replace({30: '30+'})

## Join with the DoBIH database

In [23]:
# TODO

## Export dataset

In [57]:
# Keep only the columns needed in the exported dataset
out_cols = ['name', 'altitude', 'ascent_count', 'rating'] + accom_count_cols
out_path = 'datasets/clean_v1.csv'
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df[out_cols].to_csv(out_path, index=False)