In [1]:
import os
import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

# 1. WalkHighlands

In [2]:
def get_robots(url):
    """Fetch `url` and print the body of the response.

    Used to inspect a site's robots.txt; nothing is returned.
    """
    print(requests.get(url).text)

In [3]:
# Custom User-Agent header passed to the scraping requests; it names the scraper's author and contact address
MY_HEADER = {'user-agent': 'Filip Balucha (s1913040@ed.ac.uk)'}
# Base URL of the site; page paths are appended to it
URL_WALKHIGHLANDS = 'https://www.walkhighlands.co.uk/'

In [4]:
# Check robots.txt: print the site's crawl rules (shown as this cell's output)
get_robots(URL_WALKHIGHLANDS + 'robots.txt')

User-agent: *
Disallow: /admanage/
Disallow: /sending/

User-agent: magpie-crawler
Disallow: /


## Scrape the webpage with a list of most climbed munros

In [5]:
# Get HTML of the most-climbed listing, sending the custom User-Agent header
url = URL_WALKHIGHLANDS + 'munros/most-climbed'
r = requests.get(url, headers=MY_HEADER)
# Last expression of the cell: displays the HTTP status code of the response
r.status_code

200

In [6]:
# Parse response: build a BeautifulSoup document from the raw response bytes
# using the 'html.parser' parser
soup = BeautifulSoup(r.content, 'html.parser')

In [7]:
# Parse the list of munros
# Every data row of the listing holds a link to the munro's subpage and, in its
# last cell, the integer ascent count. Rows that do not have this shape (the
# table headers) are skipped structurally instead of by hard-coded row index,
# so the parse keeps working if rows are added to or removed from the listing.
munros = []
munro_elems = soup.find_all('tr')
for munro_elem in munro_elems:
    link = munro_elem.a
    cells = munro_elem.find_all('td')
    if link is None or not cells or not cells[-1].contents:
        continue  # no subpage link or no data cells: not a munro row
    try:
        ascents = int(cells[-1].contents[0])
    except (TypeError, ValueError):
        continue  # last cell is not an integer count: table header
    name = str(link.contents[0])
    href = link['href']
    munros.append({
        'name': name,
        'ascent_count': ascents,
        'href': href
    })

### Scrape munro subpages

### 1. Download subpage HTML to avoid repeated requests

In [8]:
def download(munros, delay=10):
    """Download the subpage of every munro in `munros` into the local HTML cache.

    Each page is stored as ``cache/munros/<name>.html``. A response with a
    status code other than 200 is reported and skipped.

    Args:
        munros: iterable of dicts with the keys 'name' and 'href'.
        delay: seconds to pause after every request (default 10).
    """
    cache_dir = 'cache/munros'
    for munro in munros:
        name = munro["name"]
        print(f'Downloading: {name}')
        url = URL_WALKHIGHLANDS + 'munros/' + munro['href']
        r = requests.get(url, headers=MY_HEADER)
        if r.status_code == 200:
            # Store HTML in cache (create the directory on first use)
            os.makedirs(cache_dir, exist_ok=True)
            with open(f'{cache_dir}/{name}.html', 'wb') as out:
                out.write(r.content)
            print('Success!\n')
        else:
            print(f'Error: request to {url} returned status code {r.status_code}')
        # Pause after every request, failed ones included, so that a run of
        # error responses does not turn into back-to-back requests.
        sleep(delay)

In [9]:
# Work out which subpages still need downloading.
# On a first run the cache directory does not exist yet; create it so that
# listing it yields an empty cache instead of raising FileNotFoundError.
os.makedirs('cache/munros', exist_ok=True)
cached = os.listdir('cache/munros')
to_download = [munro for munro in munros if f'{munro["name"]}.html' not in cached]

In [10]:
# Fetch only the subpages that are missing from the cache (nothing is requested when the list is empty)
download(to_download)

### 2. Extract details from munro subpages

In [11]:
def _get_accom_type(accom_href):
    # Extract accommodation type from accommodation href, e.g.:
        # /lochlomond/cottages_drymen.shtml -> cottages
        # /lochlomond/hostels.shtml -> hostels
    accom_type = accom_href.split('.')[0]  # ignore suffix
    accom_type = accom_type.split('/')[2]  # ignore subpage
    accom_type = accom_type.split('_')[0]  # ignore geographical tag
    return accom_type

def _get_accom_hrefs(soup):
    ACCOM_TEXT = 'Walker-friendly accommodation in the area'
    accom_elem = soup.find(lambda tag: tag.contents and tag.contents[0] == ACCOM_TEXT)
    current = accom_elem.next_sibling.next_sibling  # skip newline element
    accom_hrefs = {}
    while (current.name == 'p'):  # accom hrefs are stored in p tags
        href = current.a['href']
        accom_type = _get_accom_type(href)
        accom_hrefs['href_'+accom_type] = href
        current = current.next_sibling
    return accom_hrefs

In [12]:
def _extract_munro_details(soup):
    details = {}
    # Extract rating
    rating_str = soup.find('strong', itemprop='ratingValue').contents[0]
    rating = float(rating_str)
    details['rating'] = rating
    # Extract altitude
    altitude_str = soup.findAll('p')[5].contents[0]
    altitude = int(re.sub(r'\D', '', altitude_str))  # extract integer from string
    details['altitude'] = altitude
    # Extract accommodation hrefs
    try:
        accom_hrefs = _get_accom_hrefs(soup)
        munro.update(accom_hrefs)
    except:  # subpage could not be parsed
        print(f'Skipping {name}')
    # TODO: extract accommodation count
    
    return details

In [13]:
def parse_munro_subpage(url):
    """Fetch a munro subpage and extract its details.

    Args:
        url: full URL of the subpage.

    Returns:
        The dict produced by `_extract_munro_details`, or None when the
        request does not return status code 200 (an error line is printed).
    """
    r = requests.get(url, headers=MY_HEADER)
    if r.status_code != 200:
        print(f'Error: request to {url} returned status code {r.status_code}')
        return None
    # Parse response: the extractor navigates a parsed document, so the raw
    # response must be turned into a BeautifulSoup tree first.
    soup = BeautifulSoup(r.content, 'html.parser')
    return _extract_munro_details(soup)


In [14]:
# Enrich every munro dict with the details parsed from its cached subpage.
# Do not rename the loop variables `munro` and `name`: `_extract_munro_details`
# may read these notebook-level names while it runs.
for munro in munros:
    name = munro['name']
    html_file = f'./cache/munros/{name}.html'  # same location download() writes to
    with open(html_file) as f:
        contents = f.read()
        # NOTE(review): 'html' does not name a specific parser, whereas the listing
        # page was parsed with 'html.parser' — confirm which parser is intended.
        soup = BeautifulSoup(contents, 'html')
        details = _extract_munro_details(soup)
        munro.update(details)

Skipping Sgorr nam Fiannaidh (Aonach Eagach)


### 3. Handle munros with faulty HTML

In [15]:
# Manually supply the accommodation hrefs for the munro whose subpage could
# not be parsed automatically.
faulty_munro_name = 'Sgorr nam Fiannaidh'
faulty_munro_hrefs = {
    'href_hotels': '/fortwilliam/hotels_glencoe.shtml',
    'href_bedandbreakfast': '/fortwilliam/bedandbreakfast_glencoe.shtml',
    'href_cottages': '/fortwilliam/cottages_glencoe.shtml',
    'href_hostels': '/fortwilliam/hostels_glencoe.shtml',
}
for munro in munros:
    # The scraped name carries a suffix ("Sgorr nam Fiannaidh (Aonach Eagach)"),
    # so match on the prefix rather than on equality.
    if munro['name'].startswith(faulty_munro_name):
        # Real assignments: `munro[key]: value` is only an annotation and
        # stores nothing.
        munro.update(faulty_munro_hrefs)
        break

### 4. Export dataset

In [16]:
# Build a pandas DataFrame with one row per munro dict, then preview the first rows
df = pd.DataFrame(munros)
df.head()

Unnamed: 0,name,ascent_count,href,href_hotels,href_bedandbreakfast,href_cottages,href_hostels,rating,altitude
0,Ben Lomond,20532,ben-lomond,/lochlomond/hotels_drymen.shtml,/lochlomond/bedandbreakfast_drymen.shtml,/lochlomond/cottages_drymen.shtml,/lochlomond/hostels.shtml,3.8,974
1,Ben Nevis,17892,ben-nevis,/fortwilliam/hotels.shtml,/fortwilliam/bedandbreakfast.shtml,/fortwilliam/cottages.shtml,/fortwilliam/hostels.shtml,3.9,1345
2,Ben Lawers,16063,ben-lawers,/perthshire/hotels.shtml,/perthshire/bedandbreakfast.shtml,/perthshire/cottages.shtml,/perthshire/hostels.shtml,3.9,1214
3,Schiehallion,15926,schiehallion,/perthshire/hotels.shtml,/perthshire/bedandbreakfast.shtml,/perthshire/cottages.shtml,/perthshire/hostels.shtml,3.6,1083
4,Beinn Ghlas,15646,beinn-ghlas,/perthshire/hotels.shtml,/perthshire/bedandbreakfast.shtml,/perthshire/cottages.shtml,/perthshire/hostels.shtml,3.4,1103


In [17]:
# Export dataset: keep only the cleaned columns and write them to CSV without the index
export_columns = ['name', 'altitude', 'ascent_count', 'rating']
df_out = df[export_columns]
df_out.to_csv('datasets/clean_v1.csv', index=False)

In [18]:
# TODO
# 1. Get the accommodation count for each accommodation type
    # Download the accommodation subpages
# 2. Join with the 2nd DB