This page is the scraper for MP. Because there's no API. The only thing I have access to is what's normally viewable online and a json file that contained every single webpage on the MP site. This was scraped in another notebook to only include pages that represented routes. (Other pages included forum pages, area pages, member pages) The processed list was pickled into routes.txt. Next:

1. Reference routes.txt
2. For each route, download the route page and parse it with BeautifulSoup and a lot of regex
3. Put the parsed information in a tuple
4. Store the tuple in a new database offline (sqlite)

After this is complete, in another notebook we can explore joining this database with a tick list to generate a table containing info for only routes in that ticklist. From there, analysis and visualization can be done, then served up to a website. 

Notes

There were a bunch of 404s. I probably should have kept track of those as they were happening. moving on...

Also should have stored the grade as a string and not a float. 5.10 turned into 5.1. 

In [1]:
import sqlite3 as sl
import requests
from bs4 import BeautifulSoup
import pickle
import time
import pandas as pd
import re
import json

con = sl.connect('my-test.db')

def getPage(url): #take in a URL, return a BS page
    page = requests.get(url)
    bsPage = BeautifulSoup(page.content, 'html.parser')
    return bsPage

#Get 1st level area for a given climb url
def findArea(routeSoup): #take in a route, return an area url
    location = routeSoup.find('a',text = 'All Locations').find_next_siblings()[-1]['href'] #FOUND THE LOCATION!!!! that was so easy!!!
    return location

def scrapeMeta(routeSoup):
    metaEl = routeSoup.find_all('script', type= 'application/ld+json') 
    meta = [i for i in metaEl]
    routeDict = json.loads(meta[0].contents[0])
    areaDict = json.loads(meta[1].contents[0])['itemListElement']
    
    name = routeDict['name']
    grade = routeDict['description'].split(' ')[0]
    climbtype = routeDict['description'].split(' ')[1:]
    climbtype = ' '.join(climbtype)
    latitude = routeDict['geo']['latitude']
    longitude = routeDict['geo']['longitude']
    
    return name, grade, climbtype, float(latitude), float(longitude), str(areaDict)

def getDescription(routeSoup): #take in route, return description
    routeDescription = routeSoup.find_all('div',class_ = 'fr-view')
    description = ' '.join([i.text for i in routeDescription]).lower()

    return description

#  id -url -name -grade -type -area areaDict latitude longitude -description

bug!!! when inserting the 5.10 grade in it cuts out the 0 so it becomes 5.1. 

Temporary fix: just assume all 5.1s 5.10 for now (pretty safe assumption)
The ultimate solution is to rescrape the entire MP site again. :(

In [16]:
# testing the above functions... and the 5.1 to 5.10 fix

url = 'https://www.mountainproject.com/route/105717766/3am-crack'
routeSoup = getPage(url)

name, grade, climbtype, latitude, longitude, areaDict = scrapeMeta(routeSoup)

grade

'5.10'

In [69]:
url = 'https://www.mountainproject.com/route/107033082/cold-terrorists'
routeSoup = getPage(url)

# scrape the site for meta data (the hierarchy of areas like NA > Utah > SE Utah > Indian Creek, and a description string)
metaEl = routeSoup.find_all('script', type= 'application/ld+json') 
meta = [i for i in metaEl]
desc = meta[0].contents
areaTree = meta[1].contents
routeDict = json.loads(desc[0])
areaDict = json.loads(areaTree[0])['itemListElement']

In [73]:
# an example of what is contained in the area tree
str(areaDict)

"[{'@type': 'ListItem', 'position': 1, 'item': 'https://www.mountainproject.com/route-guide', 'name': 'All Locations'}, {'@type': 'ListItem', 'position': 2, 'item': 'https://www.mountainproject.com/area/105708963/south-dakota', 'name': 'South Dakota'}, {'@type': 'ListItem', 'position': 3, 'item': 'https://www.mountainproject.com/area/112521712/east-side-sd', 'name': 'East Side SD'}, {'@type': 'ListItem', 'position': 4, 'item': 'https://www.mountainproject.com/area/105874281/palisades-state-park', 'name': 'Palisades State Park'}, {'@type': 'ListItem', 'position': 5, 'item': 'https://www.mountainproject.com/area/107033076/adelaide-flow', 'name': 'Adelaide Flow'}]"

In [6]:
# basic db functions

with open('routes.txt', 'rb') as f:
    routes = pickle.load(f)
    f.close()
    
routes = [route[1] for route in routes]

In [None]:
# create the table

with con: 
    con.execute("DROP TABLE ROUTES")

with con:
    con.execute("""
        CREATE TABLE ROUTES (
            id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
            url TEXT,
            name TEXT,
            grade FLOAT,
            type TEXT,
            area TEXT,
            areaDict TEXT,
            latitude FLOAT,
            longitude FLOAT,
            description TEXT 
            
        );
    """)

In [195]:
routes[30655]

'https://www.mountainproject.com/route/117002412/duffys-escape'

In [200]:
# SCRAPE TIME!! This is the main route scraping section. I hate that this is necessary. 

n = 1
i = 30615 # how many routes to scrape? 
start = time.time()

# "bookmark" function. adjust i to start where you last stopped. because this takes several days. 
with con: 
    data = con.execute("SELECT id FROM ROUTES ORDER BY id DESC LIMIT 1;")
    for row in data:
        if row[0]:
            i = row[0] + 1

for route in routes[i:i+n]:
    try: # handle 404s
        index = i
        routeSoup = getPage(route)
        if (routeSoup.find_all('h3',text='The page you\'re looking for does not exist.')): 
            print('ERROR 404', i, route)
            i+=1
            continue
    except:
        print('ERROR', i, route)
        continue
    
    area = findArea(routeSoup)
    description = getDescription(routeSoup)
    name, grade, climbtype, latitude, longitude, areaDict = scrapeMeta(routeSoup)
    
    
    routetup = (i, route, name, grade, climbtype, area, areaDict, latitude, longitude, str(description))
    
    if i % 100 == 0:
        print(routetup[0], str(round(time.time()-start)) + 's', routetup[2])
    with con:
        con.execute('insert into routes (id, url, name, grade, type, area, areaDict, latitude, longitude, description) values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',routetup)
        
    i+=1

In [10]:
# sanity check table
with con: 
    data = con.execute('pragma table_info("routes")')
    for row in data:
        print(row)

(0, 'id', 'INTEGER', 1, None, 1)
(1, 'url', 'TEXT', 0, None, 0)
(2, 'name', 'TEXT', 0, None, 0)
(3, 'grade', 'FLOAT', 0, None, 0)
(4, 'type', 'TEXT', 0, None, 0)
(5, 'area', 'TEXT', 0, None, 0)
(6, 'areaDict', 'TEXT', 0, None, 0)
(7, 'latitude', 'FLOAT', 0, None, 0)
(8, 'longitude', 'FLOAT', 0, None, 0)
(9, 'description', 'TEXT', 0, None, 0)


In [14]:
# another sanity check, last 50
with con: 
    data = con.execute("SELECT rowid, name, grade FROM ROUTES ORDER BY id DESC LIMIT 50;")
    for row in data:
        print(row)

(240045, 'North Gully', 'WI3')
(240044, 'Mugwort Tea', 'V1')
(240043, 'Cumberland SLAPS', 'V4')
(240042, 'Su çorbasi', 5.5)
(240041, 'Schlitz', '5.10d')
(240040, 'Headstrong', '5.11b/c')
(240039, 'Mighty Mouse', 'V1')
(240038, "Doctor's Orders", 'V9+')
(240037, 'Back in the Game', 'V0')
(240036, 'Clem Fandango', '5.11a')
(240035, 'Touchy Feely', '5.10b')
(240034, 'A Knight in Climbing Armor', 'V1')
(240033, 'Midnight Plowboy', '5.10b')
(240032, 'Four Dollar Arete', 'V11')
(240031, 'Local Maximum', 'V1')
(240030, 'Slash and Burn', 'V4')
(240029, 'Icky Yucka', 'V7')
(240028, 'West’s Dihedral', 5.7)
(240027, 'Goat Crack', '5.10c/d')
(240026, 'Minute Man Rocket (Project)', 'V8+')
(240025, 'Poison Ivy Arete', 'V0')
(240024, 'Moor or Less', 5.3)
(240023, 'Oscar the Grouch', 'V11')
(240022, 'Swordfish', 'V5-6')
(240021, 'Атлантида', '5.10c')
(240020, 'Miller Time', 5.6)
(240019, 'The Faerie', 'V8')
(240018, "Toog's Trailside Direct", '5.10b')
(240017, 'BB8', 'V3-4')
(240016, 'Damascus Road', 

In [31]:
# initial attempt at cross ref routes db with tick list

ticklist = []

con = sl.connect('my-test.db')
with con:
    data = con.execute('SELECT ticks.rowid, ticks.url, date, grade, status, latitude, longitude FROM ticks INNER JOIN routes ON routes.url = ticks.url where ticks.rowid < 20;')
    for row in data:
        if row[3] == 5.1:
            row2 = list(row)
            row2[3] = str('5.10')
            row2 = tuple(row2)
            ticklist.append(row2)
        if row[3] != 5.1: 
            ticklist.append(row)
    

In [32]:
print(ticklist)

[(1, 'https://www.mountainproject.com/route/105740981/rye-crisp', 'Mar 9, 2021', 5.8, 'Lead / Fell/Hung', 42.06738673, -113.70893731), (2, 'https://www.mountainproject.com/route/105803570/pure-palm', 'Mar 9, 2021', '5.11a', 'Solo', 44.36798221, -121.1308092), (3, 'https://www.mountainproject.com/route/109399939/blade-runner', 'Dec 5, 2020', '5.12c', 'Lead / Fell/Hung', 47.43113857, -121.61999578), (4, 'https://www.mountainproject.com/route/111183004/wretched-love-affair', 'Nov 28, 2020', '5.11c', 'Lead / Onsight', 45.53735198, -122.3720833), (5, 'https://www.mountainproject.com/route/111945454/chicago-love-affair', 'Oct 19, 2020', '5.10b', 'Lead / Onsight', 44.36581163, -121.12983902), (6, 'https://www.mountainproject.com/route/105889245/quasar', 'Oct 17, 2020', '5.10a/b', 'Lead / Onsight', 44.36732719, -121.13064751), (7, 'https://www.mountainproject.com/route/105791785/cruel-sister', 'Oct 17, 2020', '5.10a', 'Lead / Onsight', 44.36732719, -121.13064751), (8, 'https://www.mountainproj