This notebook takes a Mountain Project tick-list URL, downloads every page of the list, and scrapes each tick for the important data:
- date,
- route name (captured as the route's URL),
- status of the send (onsight, flash, send, attempt, etc.)

In [5]:
import sqlite3 as sl
import requests
from bs4 import BeautifulSoup
import json
import re
import numpy as np

# Open the SQLite database file (path is relative to the working directory).
# `con` is a notebook-wide connection: later cells and insertTicks() use it directly.
con = sl.connect('my-test2.db')

# Found the element that contains the tick's route name...

<a class="text-black route-row" href="https://www.mountainproject.com/route/111945454/chicago-love-affair">
    is the anchor element for each route row in the tick list.

# We have to find the number of pages in the tick list. There are a few ways to do this, but this way seems the most robust: there's a small pagination div at the bottom of each tick-list page...

<a class="no-click">
        1 of 3
    </a>
<a href="https://www.mountainproject.com/user/200166625/patrick-cheng/ticks?page=2">
<img alt="Next" class="skinny" src="/img/arrows/next.svg"/>
</a>
<a href="https://www.mountainproject.com/user/200166625/patrick-cheng/ticks?page=3">
<img alt="Last" src="/img/arrows/last.svg"/>
    
    These are the page numbers.
    A couple of ways to find the number of tick pages:
    - find the no-click anchor whose text matches the regex pattern 'digit of digit'
      (actually, that's probably the easiest way);
    - find the link element for the last page, but that will take more code;
    - or just iterate until you get a 404, which might be the easiest and also the most robust...

In [3]:
def getStatus(string):
    """Extract the send style from the start of a tick's note text.

    The text in front of the first '.' in ``string`` is compared against
    the recognised styles. Returns that text verbatim when it is one of
    them, and '' when ``string`` is empty or the leading text is not a
    recognised style.
    """
    known_styles = (
        'Solo', 'TR', 'Follow', 'Lead',
        'Lead / Onsight', 'Lead / Flash', 'Lead / Redpoint',
        'Lead / Pinkpoint', 'Lead / Fell/Hung',
    )
    if string == '':
        return ''

    # Only the part ahead of the first period can carry the style.
    head = re.split(r'\.', string)[0]
    return head if head in known_styles else ''

def getLastPage(url):
    """Return the number of pages in the tick list at ``url``.

    Downloads ``url`` and looks through its ``<a class="no-click">``
    elements for the pagination label (text such as "1 of 3"); the second
    number in that label is the last page.

    Parameters:
        url: address of the first page of a tick list.

    Returns:
        The last page number as an int, or 1 when no pagination label is
        found (presumably a list short enough to fit on one page).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly on a bad URL; otherwise an error page (which has no
    # pagination label) would be mistaken for a one-page tick list.
    page.raise_for_status()
    tickSoup = BeautifulSoup(page.content, 'html.parser')

    number = re.compile(r'\d+')
    for link in tickSoup.find_all('a', class_='no-click'):
        numbers = number.findall(link.text)
        # The label reads "<current> of <last>", so it needs two numbers;
        # skip any other no-click anchor instead of raising IndexError.
        if len(numbers) >= 2:
            return int(numbers[1])

    # No pagination label at all: treat the list as a single page rather
    # than hitting an unbound local variable.
    return 1

def getPageURL(i, url):
    """Build the address of page ``i`` of the tick list rooted at ``url``.

    Page 1 is the base ``url`` itself; every other page is the base URL
    with a ``?page=<i>`` query string appended.
    """
    if i == 1:
        return url
    return url + '?page=%d' % i

def getTickInfo(tick):
    """Turn one tick element into a ``(url, date, status)`` tuple.

    ``tick`` must support ``tick['href']`` and ``tick.find('i').text``
    (presumably a BeautifulSoup ``<a class="route-row">`` tag). The text of
    the ``<i>`` element is split on the '·' separator: the first field is
    the date, and the second field, when present, is handed to getStatus()
    to obtain the send style. A tick without a second field gets status ''.
    """
    route_url = tick['href']

    fields = re.split(r'[·]', tick.find('i').text)
    tick_date = fields[0].strip()
    send_status = getStatus(fields[1].strip()) if len(fields) > 1 else ''

    return (route_url, tick_date, send_status)

def getPageTicks(url):
    """Download one tick-list page and return its ticks.

    Fetches ``url``, parses it with the 'html.parser' backend, and runs
    getTickInfo() on every ``<a>`` element carrying the ``route-row`` class.

    Returns a list with one getTickInfo() result per route row, in page
    order (empty when the page has no such rows).
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return [getTickInfo(row) for row in soup.find_all('a', class_='route-row')]

def getUserTicks(url):
    """Collect the ticks from every page of the tick list at ``url``.

    Asks getLastPage() how many pages there are, then visits pages
    1..last in order, building each address with getPageURL() and scraping
    it with getPageTicks().

    Returns one flat list containing every page's ticks, in page order.
    """
    page_count = getLastPage(url)
    # Lazy generator: each page URL is built right before that page is scraped.
    page_urls = (getPageURL(page_no, url) for page_no in range(1, page_count + 1))
    return [tick for page_url in page_urls for tick in getPageTicks(page_url)]

def insertTicks(userTicks):
    """Insert scraped ticks into the ``ticks`` table.

    Parameters:
        userTicks: iterable of ``(url, date, status)`` tuples, in the shape
            produced by getUserTicks().

    Writes through the notebook-level connection ``con``. All rows go into
    a single transaction, so either every tick is stored or, if any insert
    fails, none of them are.
    """
    # Iterate the argument itself. The previous loop read the notebook
    # global `userticks`, which ignored the caller's list and raised
    # NameError on a fresh kernel.
    with con:
        con.executemany(
            'insert into ticks (url, date, status) values(?, ?, ?)',
            userTicks,
        )


In [8]:
# Load my own ticks: scrape every page of this tick list into `userticks`
# (a list of (url, date, status) tuples returned by getUserTicks).
url = ('https://www.mountainproject.com/user/200166625/patrick-cheng/ticks')
userticks = getUserTicks(url)
# Sanity check: number of ticks scraped, then one sample tuple.
# The fixed index 80 raises IndexError if fewer than 81 ticks come back.
print(len(userticks))
print(userticks[80])

106
('https://www.mountainproject.com/route/106314933/blowing-smoke-at-the-monkey', 'Jun 30, 2019', '')


In [6]:
# See what the table looks like: print the rows of `ticks` whose rowid is below 10.
# `with con:` only scopes a transaction on the shared connection; it does not close it.
with con:
    data = con.execute('select * from ticks where rowid < 10')
    for row in data:
        print(row)

('https://www.mountainproject.com/route/105740981/rye-crisp', 'Mar 9, 2021', 'Lead / Fell/Hung')
('https://www.mountainproject.com/route/105803570/pure-palm', 'Mar 9, 2021', 'Solo')
('https://www.mountainproject.com/route/109399939/blade-runner', 'Dec 5, 2020', 'Lead / Fell/Hung')
('https://www.mountainproject.com/route/111183004/wretched-love-affair', 'Nov 28, 2020', 'Lead / Onsight')
('https://www.mountainproject.com/route/111945454/chicago-love-affair', 'Oct 19, 2020', 'Lead / Onsight')
('https://www.mountainproject.com/route/105889245/quasar', 'Oct 17, 2020', 'Lead / Onsight')
('https://www.mountainproject.com/route/105791785/cruel-sister', 'Oct 17, 2020', 'Lead / Onsight')
('https://www.mountainproject.com/route/105901816/azog', 'Oct 16, 2020', 'Lead / Onsight')
('https://www.mountainproject.com/route/109565046/puck', 'Oct 16, 2020', 'Lead / Onsight')


In [7]:
# Sanity check: print the tick list joined with the route database for more info.
# The inner join matches on url, so ticks whose url has no row in `routes` are left out.
# NOTE(review): assumes a `routes` table with url, grade, latitude and longitude
# columns already exists in the database; it is not created in the visible cells.
with con:
    data = con.execute('SELECT date, ticks.url, grade, status, latitude, longitude FROM ticks INNER JOIN routes ON routes.url = ticks.url;')
    for row in data:
        print(row)

('Mar 9, 2021', 'https://www.mountainproject.com/route/105740981/rye-crisp', 5.8, 'Lead / Fell/Hung', 42.06738673, -113.70893731)
('Mar 9, 2021', 'https://www.mountainproject.com/route/105803570/pure-palm', '5.11a', 'Solo', 44.36798221, -121.1308092)
('Dec 5, 2020', 'https://www.mountainproject.com/route/109399939/blade-runner', '5.12c', 'Lead / Fell/Hung', 47.43113857, -121.61999578)
('Nov 28, 2020', 'https://www.mountainproject.com/route/111183004/wretched-love-affair', '5.11c', 'Lead / Onsight', 45.53735198, -122.3720833)
('Oct 19, 2020', 'https://www.mountainproject.com/route/111945454/chicago-love-affair', '5.10b', 'Lead / Onsight', 44.36581163, -121.12983902)
('Oct 17, 2020', 'https://www.mountainproject.com/route/105889245/quasar', '5.10a/b', 'Lead / Onsight', 44.36732719, -121.13064751)
('Oct 17, 2020', 'https://www.mountainproject.com/route/105791785/cruel-sister', '5.10a', 'Lead / Onsight', 44.36732719, -121.13064751)
('Oct 16, 2020', 'https://www.mountainproject.com/route/10