In [1]:
import re
import os
import difflib
import requests
import numpy as np
import pandas as pd
from time import sleep
from bs4 import BeautifulSoup

# 1. WalkHighlands

In [2]:
# Request headers passed to requests.get(); the user-agent identifies who is scraping.
MY_HEADER = {'user-agent': 'Filip Balucha (s1913040@ed.ac.uk)'}
# Base URL. Page paths are appended with plain string concatenation, so keep the trailing '/'.
URL_WALKHIGHLANDS = 'https://www.walkhighlands.co.uk/'

## Scrape the webpage with a list of most climbed munros

In [3]:
# Fetch the "most climbed" overview page and show the HTTP status as the cell output
url = f'{URL_WALKHIGHLANDS}munros/most-climbed'
r = requests.get(url, headers=MY_HEADER)
r.status_code

200

In [4]:
# Parse response
# The raw response bytes are handed to BeautifulSoup together with the 'html.parser' parser name.
soup = BeautifulSoup(r.content, 'html.parser')

In [5]:
# Parse the list of munros: one dict per table row with the munro's
# name, its ascent count and the relative href of its subpage.
munro_elems = soup.find_all('tr')
header_indices = [0, 142]  # positions of the table-header rows among the <tr> elements
munros = []
for i, munro_elem in enumerate(munro_elems):
    if i not in header_indices:  # header rows carry no munro data
        link = munro_elem.find('a')
        last_cell = munro_elem.find_all('td')[-1]  # ascent count is read from the last cell
        munros.append({
            'name': str(link.contents[0]),
            'ascent_count': int(last_cell.contents[0]),
            'href': 'munros/' + link['href'] + '.html',
        })

## Scrape munro subpages

### 1. Cache subpage HTMLs to prevent repeated requests

In [6]:
def cache(subpage, delay=10, timeout=30):
    """Download a WalkHighlands subpage into the local ``cache`` directory, once.

    Parameters
    ----------
    subpage : str
        Path relative to ``URL_WALKHIGHLANDS`` (e.g. ``'munros/ben-lomond.html'``).
        The same relative path is used below the ``cache`` directory.
    delay : float, optional
        Seconds to pause after every request that was sent, whether it
        succeeded or not, so repeated calls do not hammer the server.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error instead of blocking indefinitely.

    Returns
    -------
    None
        A non-200 response is only reported via ``print`` and leaves the
        cache untouched. Nothing is requested if the file is already cached.
    """
    cached_path = os.path.join('cache', subpage)
    if os.path.isfile(cached_path):  # if already cached
        return
    print(f'Downloading: {subpage}')
    url = URL_WALKHIGHLANDS + subpage
    r = requests.get(url, headers=MY_HEADER, timeout=timeout)
    if r.status_code == 200:
        # Store HTML in cache
        # create a new directory if necessary
        os.makedirs(os.path.dirname(cached_path), exist_ok=True)
        with open(cached_path, 'wb') as out:
            out.write(r.content)
        print('Success!\n')
    else:
        print(f'Error: {r.status_code}')
    # Pause after failed downloads as well: an error response is exactly when
    # the next request should not follow immediately.
    sleep(delay)

In [7]:
# Make sure every munro subpage is in the local cache.
# cache() returns immediately when the file already exists, so re-running this cell is cheap.
for munro in munros:
    cache(munro['href'])

### 2. Extract details from munro subpages

In [8]:
def _parse_accom_type(accom_href):
    # Parse accommodation type and href from accommodation href, e.g.:
        # "/lochlomond/cottages_drymen.shtml" -> "cottages", "lochlomond/cottages_drymen.shtml"
        # "/lochlomond/hostels.shtml" -> "hostels", "lochlomond/hostels.shtml"
    accom_type = accom_href.split('.')[0]  # ignore suffix
    accom_type = accom_type.split('/')[2]  # ignore subpage
    accom_type = accom_type.split('_')[0]  # ignore geographical tag
    key = f'href_{accom_type}'
    val = accom_href.lstrip('/')
    return key, val 

def _get_accom_hrefs(soup):
    """Collect the accommodation links listed on a munro subpage.

    Finds the element whose first child is the accommodation heading text and
    walks the run of <p> siblings that follows it, reading the first link in
    each one.

    Parameters
    ----------
    soup : parsed munro subpage (BeautifulSoup object).

    Returns
    -------
    dict
        Maps ``'href_<type>'`` keys (see ``_parse_accom_type``) to relative hrefs.

    Notes
    -----
    If the heading is missing, or a <p> in the run has no link, the attribute
    or item access on the missing element raises; callers are expected to handle that.
    """
    ACCOM_TEXT = 'Walker-friendly accommodation in the area'
    accom_elem = soup.find(lambda tag: tag.contents and tag.contents[0] == ACCOM_TEXT)
    current = accom_elem.next_sibling.next_sibling  # skip newline element
    accom_hrefs = {}
    # accom hrefs are stored in consecutive p tags. ``next_sibling`` becomes None
    # once the siblings run out, so look up ``name`` defensively instead of
    # raising on the last paragraph and losing the hrefs already collected.
    while getattr(current, 'name', None) == 'p':
        href = current.a['href']
        key, val = _parse_accom_type(href)
        accom_hrefs[key] = val
        current = current.next_sibling
    return accom_hrefs

In [9]:
def _extract_munro_details(soup, munro_name=None):
    """Extract rating, rating count, altitude and accommodation hrefs from a munro subpage.

    Parameters
    ----------
    soup : parsed munro subpage (BeautifulSoup object).
    munro_name : str, optional
        Name printed in the "Skipping ..." message when the accommodation
        section cannot be parsed. When omitted, the name is taken from the
        notebook-level loop variable ``munro`` if one exists, which is what
        existing callers rely on.

    Returns
    -------
    dict
        Always contains ``'rating'`` (float), ``'rating_count'`` (int) and
        ``'altitude'`` (int); ``'href_<type>'`` entries are added only when
        the accommodation section could be parsed.
    """
    details = {}
    # Extract rating
    rating_str = soup.find('strong', itemprop='ratingValue').contents[0]
    details['rating'] = float(rating_str)
    # Extract rating count
    rating_count_str = soup.find(itemprop='ratingCount').contents[0]
    details['rating_count'] = int(rating_count_str)
    # Extract altitude: read from the sixth <p> on the page
    altitude_str = soup.find_all('p')[5].contents[0]
    details['altitude'] = int(re.sub(r'\D', '', altitude_str))  # extract integer from string
    # Extract accommodation hrefs
    try:
        details.update(_get_accom_hrefs(soup))
    except Exception:  # subpage could not be parsed; let KeyboardInterrupt/SystemExit propagate
        if munro_name is None:
            # Fall back to the notebook's loop variable without raising a
            # NameError from inside the handler when it does not exist.
            munro_name = (globals().get('munro') or {}).get('name', '<unknown munro>')
        print(f'Skipping {munro_name}')

    return details

In [10]:
# Parse every cached munro subpage and merge the extracted details into its dict.
for munro in munros:
    subpage = munro['href']
    html_file = os.path.join('cache', subpage)
    # cache() stores the raw response bytes, so read them back as bytes and let
    # BeautifulSoup work out the encoding rather than decoding with whatever
    # the platform's default text encoding happens to be.
    with open(html_file, 'rb') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'html')
    details = _extract_munro_details(soup)
    munro.update(details)

Skipping Sgorr nam Fiannaidh (Aonach Eagach)


### 3. Handle munros with a faulty HTML

In [11]:
# Fill in by hand the accommodation hrefs of the one munro whose subpage could not be parsed.
faulty_munro_name = 'Sgorr nam Fiannaidh (Aonach Eagach)'
for munro in munros:
    if munro['name'] != faulty_munro_name:
        continue
    munro.update({
        'href_hotels': 'fortwilliam/hotels_glencoe.shtml',
        'href_bedandbreakfast': 'fortwilliam/bedandbreakfast_glencoe.shtml',
        'href_cottages': 'fortwilliam/cottages_glencoe.shtml',
        'href_hostels': 'fortwilliam/hostels_glencoe.shtml',
    })
    break  # only the first munro with this name is patched

In [12]:
# Build a dataframe with one row per munro dict (dict keys become the columns)
df = pd.DataFrame(munros)
df.head()

Unnamed: 0,name,ascent_count,href,rating,rating_count,altitude,href_hotels,href_bedandbreakfast,href_cottages,href_hostels
0,Ben Lomond,20536,munros/ben-lomond.html,3.8,317,974,lochlomond/hotels_drymen.shtml,lochlomond/bedandbreakfast_drymen.shtml,lochlomond/cottages_drymen.shtml,lochlomond/hostels.shtml
1,Ben Nevis,17894,munros/ben-nevis.html,3.9,253,1345,fortwilliam/hotels.shtml,fortwilliam/bedandbreakfast.shtml,fortwilliam/cottages.shtml,fortwilliam/hostels.shtml
2,Ben Lawers,16065,munros/ben-lawers.html,3.9,231,1214,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml
3,Schiehallion,15928,munros/schiehallion.html,3.6,267,1083,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml
4,Beinn Ghlas,15648,munros/beinn-ghlas.html,3.4,193,1103,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml


## Compute accommodation counts

### 1. Fix region specification for faulty hrefs

In [13]:
# "arisaig" should be "mallaig" as is clear from here: https://www.walkhighlands.co.uk/fortwilliam/bedandbreakfast.shtml
# "affric" should be "glenaffric" as is clear from here: https://www.walkhighlands.co.uk/lochness/hotels_glenaffric.shtml

# faulty href -> corrected href; cells equal to a key are replaced wherever they occur in the frame
href_fixes = {
    'fortwilliam/bedandbreakfast_arisaig.shtml': 'fortwilliam/bedandbreakfast_mallaig.shtml',
    'fortwilliam/cottages_arisaig.shtml': 'fortwilliam/cottages_mallaig.shtml',
    'fortwilliam/hotels_arisaig.shtml': 'fortwilliam/hotels_mallaig.shtml',
    'lochness/bedandbreakfast_affric.shtml': 'lochness/bedandbreakfast_glenaffric.shtml',
    'lochness/cottages_affric.shtml': 'lochness/cottages_glenaffric.shtml',
    'lochness/hotels_affric.shtml': 'lochness/hotels_glenaffric.shtml',
}
df = df.replace(href_fixes)

### 2. Cache accommodation subpages

In [14]:
# Get the unique accommodation hrefs across all four href columns
accom_cols = ['href_hotels', 'href_bedandbreakfast', 'href_cottages', 'href_hostels']
accom_hrefs = np.unique(df[accom_cols].to_numpy().ravel())

# Download each listing page once (cache() skips files already on disk)
for accom_href in accom_hrefs:
    cache(accom_href)

### 3. Get accommodation counts

In [15]:
# TODO handle infinite scroll
#   NOTE(review): presumably a cached listing page only contains the first batch of adverts,
#   which would be why a count of 30 is later displayed as '30+' — confirm against the live pages.
# TODO handle some regions not specific enough
# Memo filled lazily by get_accom_count(), so each cached listing page is parsed only once.
accom_counts = {}  # mapping from accommodation href to the number of accommodations listed under it

def _extract_accom_count(accom_href):
    """Count the accommodation adverts on a cached listing page.

    Parameters
    ----------
    accom_href : str
        Href of the listing page, relative to the ``cache`` directory.

    Returns
    -------
    int
        Number of ``<div class="blockadv">`` elements found in the cached HTML.
    """
    html_file = os.path.join('cache', accom_href)
    # The cache holds raw response bytes: read them as bytes and let
    # BeautifulSoup detect the encoding instead of decoding with the
    # platform's default text encoding.
    with open(html_file, 'rb') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'html')
    accom_ads = soup.find_all("div", {"class": "blockadv"})
    return len(accom_ads)

def get_accom_count(accom_href):
    """Return the advert count for a listing page, parsing it only on first use.

    Results are memoised in the module-level ``accom_counts`` dict.
    """
    if accom_href in accom_counts:
        return accom_counts[accom_href]
    count = _extract_accom_count(accom_href)
    accom_counts[accom_href] = count
    return count

In [16]:
# One count column per href column, in the same order as accom_cols
# (hotels, bed & breakfast, cottages, hostels).
accom_count_cols = ['hotel_count', 'bb_count', 'cottage_count', 'hostel_count']
# Element-wise lookup: every href cell is replaced by the number of adverts on its listing page.
# NOTE(review): DataFrame.applymap is deprecated in newer pandas in favour of DataFrame.map —
# check the installed version before switching.
df[accom_count_cols] = df[accom_cols].applymap(get_accom_count)
df.head()

Unnamed: 0,name,ascent_count,href,rating,rating_count,altitude,href_hotels,href_bedandbreakfast,href_cottages,href_hostels,hotel_count,bb_count,cottage_count,hostel_count
0,Ben Lomond,20536,munros/ben-lomond.html,3.8,317,974,lochlomond/hotels_drymen.shtml,lochlomond/bedandbreakfast_drymen.shtml,lochlomond/cottages_drymen.shtml,lochlomond/hostels.shtml,12,8,14,7
1,Ben Nevis,17894,munros/ben-nevis.html,3.9,253,1345,fortwilliam/hotels.shtml,fortwilliam/bedandbreakfast.shtml,fortwilliam/cottages.shtml,fortwilliam/hostels.shtml,30,30,30,18
2,Ben Lawers,16065,munros/ben-lawers.html,3.9,231,1214,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml,30,30,30,5
3,Schiehallion,15928,munros/schiehallion.html,3.6,267,1083,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml,30,30,30,5
4,Beinn Ghlas,15648,munros/beinn-ghlas.html,3.4,193,1103,perthshire/hotels.shtml,perthshire/bedandbreakfast.shtml,perthshire/cottages.shtml,perthshire/hostels.shtml,30,30,30,5


In [17]:
# Show a count of exactly 30 as the string '30+'.
# NOTE(review): presumably 30 is the most adverts a listing page contains before
# infinite scroll loads more — confirm. Columns containing a 30 end up mixing ints and strings.
df[accom_count_cols] = df[accom_count_cols].replace(30, '30+')

## Join with the DoBIH database

### 1. Load and filter data and fix formatting

In [88]:
# Load DB, keep relevant columns
# 'Number' is read only to serve as the index; 'M' is the flag used below to keep only Munros,
# and 'Metres' later becomes the 'altitude' column.
relevant_cols = ['Number','Name','Island','Topo Section','County','County Top',
                 'Hill-bagging','Latitude','Longitude', 'T100', 'M','Metres']
df_dobih = pd.read_csv('datasets/DoBIH_v17_1.csv', index_col='Number', usecols=relevant_cols)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [89]:
# Keep only the rows flagged as Munros (M == 1) and renumber them from 0
munro_mask = df_dobih['M'].eq(1)
df_dobih = df_dobih.loc[munro_mask].reset_index(drop=True)

In [90]:
# Fix format of column names: 'Metres' becomes 'altitude', then every name is
# lower-cased with '-' and ' ' turned into '_'
df_dobih = df_dobih.rename(columns={'Metres': 'altitude'})
df_dobih.columns = [
    col.lower().replace('-','_').replace(' ','_')
    for col in df_dobih.columns
]

In [91]:
# Remove text in [] at the end of names as this is just the pronunciation
re_square_brackets = r'\s\[.*\]'
# The pattern is a regular expression, so say so explicitly: without regex=True
# newer pandas versions treat it as a literal string and strip nothing.
df_dobih['name'] = df_dobih['name'].str.replace(re_square_brackets, '', regex=True)

  df_dobih['name'] = df_dobih['name'].str.replace(re_square_brackets,'')


In [92]:
# Round heights and convert to int, since that's what's used in Walk Highlands data
# Rounding happens before the cast so the result is the nearest metre rather than a truncation.
# NOTE(review): astype(int) presumably assumes no missing heights among the Munro rows — confirm.
df_dobih['altitude'] = df_dobih['altitude'].round(0).astype(int)

In [93]:
def fix_format(name):
    """Rewrite '<group> - <summit>' names as '<summit> (<group>)'.

    Names that contain no ' - ' separator are returned unchanged. Only the
    first two ' - '-separated pieces are used. For example:
        'Buachaille Etive Mor - Stob Dearg' -> 'Stob Dearg (Buachaille Etive Mor)'
        'Buachaille Etive Beag - Stob Dubh' -> 'Stob Dubh (Buachaille Etive Beag)'
    """
    parts = name.split(' - ')
    if len(parts) <= 1:
        return name
    return f'{parts[1]} ({parts[0]})'

# Reformat every DoBIH name with fix_format
df_dobih['name'] = df_dobih['name'].apply(fix_format)

### 2. Align DoBIH Munro names with Walk Highlands naming

We will match based on a key - a stringified tuple consisting of Munro name and altitude. The data do not match exactly, so we will perform fuzzy matching. We will use `difflib` to help us find approximate matches for keys.

Even so, some of the differences in Munro naming are too significant to handle using an edit-distance-based method. We treat those now.

Note: The data from Walk Highlands will take priority.

In [94]:
# Treat names that are too hard to match
# Maps a (reformatted) DoBIH name to the replacement name to match on; only names
# that equal a key exactly are replaced.
hard_to_match = {
    'Glas Leathad Mor (Ben Wyvis)': 'Ben Wyvis',
    'Leabaidh an Daimh Bhuidhe (Ben Avon)': 'Ben Avon',
    'Meall nan Con (Ben Klibreck)': 'Ben Klibreck',
    "Carn nan Gabhar (Beinn a' Ghlo)": 'Carn nan Gabhar'
}
df_dobih['name'] = df_dobih['name'].replace(hard_to_match)

In [95]:
# Fix problems with Munro altitude mismatches
# The matching key contains the altitude as text, and for an edit-distance-based
# method '1000' and '999' are far apart. Some munros are listed at 1000 in one
# dataset and 999 in the other, and difflib failed to match those, so their
# DoBIH altitude is set to 999 here.
for munro_name in ('Stob Ban', 'Sgurr Breac'):
    m = (df_dobih['name'] == munro_name) & (df_dobih['altitude'] == 1000)
    df_dobih.loc[m, 'altitude'] = 999

In [96]:
# Treat duplicates
# The DoBIH database stores a shortened version of Munro names (e.g. Carn Dearg)
# unlike Walk Highlands. This results in mismatches, so the shortened names are
# expanded below.

# (DoBIH name, DoBIH altitude, Walk Highlands name), applied in this order
renames_by_altitude = [
    ("A' Chailleach", 929, "A' Chailleach (Monadhliath)"),
    ("Carn Dearg", 1034, 'Carn Dearg (Loch Pattack)'),
    ("Carn Dearg", 946, 'Carn Dearg (Monadhliath)'),
    ("Carn Dearg", 941, 'Carn Dearg (Corrour)'),
    ('Geal-charn', 917, 'Geal-charn (Drumochter)'),
    ('Geal-charn', 1132, 'Geal-charn (Alder)'),
    ('Geal Charn', 926, 'Geal Charn (Monadhliath)'),
    ("Beinn a' Chaorainn", 1083, "Beinn a' Chaorainn (Cairngorms)"),
    ("Beinn a' Chaorainn", 1049, "Beinn a' Chaorainn (Glen Spean)"),
    ('Beinn Dearg', 1009, 'Beinn Dearg (Blair Atholl)'),
    ('Beinn Dearg', 1084, 'Beinn Dearg (Ullapool)'),
    ('Sgurr nan Coireachan', 954, 'Sgurr nan Coireachan (Glen Dessary)'),
    ('Sgurr nan Coireachan', 956, 'Sgurr nan Coireachan (Glenfinnan)'),
]
for short_name, alt, full_name in renames_by_altitude:
    m = (df_dobih['name'] == short_name) & (df_dobih['altitude'] == alt)
    df_dobih.loc[m, 'name'] = full_name

# Carn nan Gobhar is told apart by longitude (rounded to 2 decimals) instead of altitude
    # Note: using data from Walk Highlands
    # Carn nan Gobhar (Strathfarrar): longitude = -4.8800484; latitude = 57.4525364
    # Carn nan Gobhar (Loch Mullardoch): longitude = -5.0244477; latitude = 57.3633406
renames_by_longitude = [
    (-4.88, 'Carn nan Gobhar (Strathfarrar)'),
    (-5.02, 'Carn nan Gobhar (Loch Mullardoch)'),
]
for lon, full_name in renames_by_longitude:
    m = (df_dobih['name'] == 'Carn nan Gobhar') & (df_dobih['longitude'].round(2) == lon)
    df_dobih.loc[m, 'name'] = full_name

### 3. Generate keys

Munro name and altitude are the only fields that are common to both datasets. Hence, a key will be the tuple `(<munro_name>, <munro_altitude>)`, stringified.

In [97]:
# Walk Highlands keys: str((name without spaces, altitude))
wh_names_compact = df['name'].str.replace(' ','')
df['key'] = [str(pair) for pair in zip(wh_names_compact, df['altitude'])]
df['key'].head()

0        ('BenLomond', 974)
1        ('BenNevis', 1345)
2       ('BenLawers', 1214)
3    ('Schiehallion', 1083)
4      ('BeinnGhlas', 1103)
Name: key, dtype: object

In [98]:
# DoBIH keys: str((name without spaces, altitude))
dobih_names_compact = df_dobih['name'].str.replace(' ','')
df_dobih['key'] = [str(pair) for pair in zip(dobih_names_compact, df_dobih['altitude'])]
df_dobih['key'].head()

0      ('BenChonzie', 931)
1      ('BenVorlich', 985)
2    ("Stuca'Chroin", 973)
3        ('BenMore', 1174)
4    ('StobBinnein', 1165)
Name: key, dtype: object

In [99]:
# Verify that no key occurs more than once in either dataset
no_dupes_in_wh = df['key'].is_unique
no_dupes_in_dobih = df_dobih['key'].is_unique
no_dupes_in_wh and no_dupes_in_dobih

True

### 4. Verify approximate key pairs

In [100]:
# Verify suggested approximate matches
from ast import literal_eval as make_tuple
def _possible_mismatch(x):  # method that looks for possible key mismatches
    n1,a1 = make_tuple(x['wh_key'])
    n2,a2 = make_tuple(x['dobih_key'])
    n1 = n1.lower()
    n2 = n2.lower()
    return (n1 not in n2 and n2 not in n1) or abs(a2-a1) > 10

# Create a dummy dataframe to hold the matched key pairs
df_temp = pd.DataFrame()
df_temp['wh_key'] = df['key']
# Fuzzy match Walk Highlands keys against DoBIH keys
# get_close_matches returns its candidates best-first, so [0] is the closest DoBIH key.
# NOTE(review): [0] raises IndexError if no DoBIH key is similar enough to be returned.
df_temp['dobih_key'] = df['key'].apply(lambda x: difflib.get_close_matches(x, df_dobih['key'])[0])

# Show only the pairs flagged by _possible_mismatch, with pandas' row/column display limits lifted
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    m = df_temp.apply(_possible_mismatch, axis=1)
    display(df_temp.loc[m])

Unnamed: 0,wh_key,dobih_key
73,"('StuchdanLochain', 960)","('StucanLochain', 960)"
82,"('BenChallum', 1025)","('BeinnChalluim', 1025)"
96,"('SgurrMor(BeinnAlligin)', 986)","('SgurrMhor(BeinnAlligin)', 986)"
165,"('SgurrnaBanachdich', 965)","('SgurrnaBanachdaich', 965)"
199,"('Sgornah-Ulaidh', 994)","('Sgurrnah-Ulaidh', 994)"
202,"('StobCoiranAlbannaich', 1044)","(""StobCoir'anAlbannaich"", 1044)"
210,"(""A'Chralaig"", 1120)","(""A'Chraileag"", 1120)"
217,"('BeinnHeasgarnich', 1078)","('BeinnSheasgarnaich', 1077)"
253,"('CarnEige', 1183)","('CarnEighe', 1183)"
276,"('SgurrnanCeathreamhnan', 1151)","('SgurrnanCeathramhnan', 1151)"


These all look sensible, so we may perform matching on them.

### 5. Perform fuzzy matching

In [101]:
# Fuzzy match Walk Highlands keys against DoBIH keys
# Same computation as the one used for the verification table: the closest DoBIH key
# for every Walk Highlands key, stored as the join column.
df['key_fuzzy'] = df['key'].apply(lambda x: difflib.get_close_matches(x, df_dobih['key'])[0])

In [102]:
# Merge Walk Highlands and DoBIH datasets
# Join each Walk Highlands row to the DoBIH row whose key equals its fuzzy-matched key;
# columns present in both frames get the '_wh' / '_dobih' suffixes.
# NOTE(review): nothing here checks that each DoBIH key is matched by at most one
# Walk Highlands row — consider asserting that df['key_fuzzy'] is unique.
df_merged = df.merge(df_dobih, left_on='key_fuzzy', right_on='key', suffixes=['_wh','_dobih'])

In [103]:
# Pick only relevant columns
# Name and altitude are taken from the Walk Highlands side ('_wh' suffix);
# the href, key and '_dobih'-suffixed columns are left out.
relevant_cols = ['name_wh', 'altitude_wh', 'ascent_count', 'rating', 'rating_count', 'bb_count', 'cottage_count', 
                 'island', 'latitude', 'longitude', 'county', 'county_top', 'hill_bagging', 'hostel_count',
                 'hotel_count', 't100', 'topo_section']
df_merged = df_merged[relevant_cols]

In [104]:
# Drop the trailing "_wh" from column names that carry it; other names stay as they are
df_merged = df_merged.rename(columns=lambda col: col[:-3] if col.endswith('_wh') else col)

## Export dataset

In [105]:
# Write the merged dataset to CSV without the dataframe index column
df_merged.to_csv('datasets/clean_v1.csv', index=False)