In [None]:
import requests
from bs4 import BeautifulSoup
import time
from urllib import urlencode
import selenium.webdriver
import pandas as pd
import random
from pymongo import MongoClient
import json

In [19]:
def search_mnt_project(url,  browser, delay=3):
    '''
    Load a page in the browser, pause for a few seconds, and return its HTML.

    INPUT
        - url - address of the page to load
        - browser - object exposing get(url) and page_source
        - delay - base wait in seconds; the actual pause is a random whole
                  number of seconds between delay - 1 and delay + 3
                  (2 to 6 seconds with the default of 3)
    OUTPUT
        - the page source reported by the browser after the pause
    '''
    browser.get(url)
    # Randomise the pause around the requested delay so requests are not
    # evenly spaced. Clamp the bounds so tiny or negative delays stay valid.
    shortest = max(int(delay) - 1, 0)
    longest = max(int(delay) + 3, shortest)
    pause = random.randint(shortest, longest)
    time.sleep(pause)  # Wait a few seconds before getting the HTML source
    return browser.page_source

In [20]:
def soup_maker(url):
    '''
    Open a Firefox webdriver, load the page, and return it as parsed soup.

    INPUT
        - url - address of the page to load
    OUTPUT
        - BeautifulSoup object built from the page source with 'html.parser'
    '''
    browser = selenium.webdriver.Firefox()
    try:
        html = search_mnt_project(url, browser)
    finally:
        # Always shut the browser down, even when the page load fails or the
        # run is interrupted, so browser processes do not pile up.
        browser.quit()
    return BeautifulSoup(html, 'html.parser')

In [23]:
def find_table_urls(table_tag, href_list):
    '''
    Pulls route urls from the rows of the given tables.

    INPUT
        - table_tag - iterable of table elements
        - href_list - list that found hrefs are appended to (modified in place)
    OUTPUT
        - href_list - the same list, extended with the hrefs found

    A row is skipped when the span in its second cell reads ' (0)'.
    Rows without any td cells (for example header rows) are ignored.
    '''
    for table in table_tag:
        for row in table.findAll('tr'):
            cells = row.findAll('td')
            if not cells:
                continue
            if len(cells) > 1:
                votes = cells[1].find('span', {'class': 'small textLight'})
                # stop if not reviewed
                if votes is not None and votes.text == ' (0)':
                    continue
            link = cells[0].find('a', href=True)
            if link is not None:
                href_list.append(link.get('href'))
    return href_list

In [33]:
def find_route_urls(query, route_href_list):
    '''
    Fetch one page and add the route links found in its tables to the list.

    INPUT
        - query - path and query string appended to https://www.mountainproject.com
        - route_href_list - list of href's for routes collected so far
    OUTPUT
        - route_href_list - list of href's for routes, including this page's
        - soup - the html for the given page
    '''
    url = "https://www.mountainproject.com%s" % query
    soup = soup_maker(url)
    # Route links are read from the tables with class "objectList".
    result_tables = soup.select('table.objectList')
    route_href_list = find_table_urls(result_tables, route_href_list)
    return route_href_list, soup

In [37]:
def _next_page_query(soup):
    '''Return the href of the "Next" link in the right-aligned cell, or None.'''
    pager = soup.find('td', {'align': 'right'})
    if pager is None:
        return None
    for link in pager.findAll('a', href=True):
        if 'Next' in link.text:
            return link.get('href')
    return None


def all_route_urls(start_url):
    '''
    find all route urls
    INPUT
        - start_url - first url to go to
    OUTPUT
        - route_href_list - list off all the route's urls

    Pages are followed through their "Next" link until a page has none.
    '''
    # make empty list to fill with route page urls
    route_href_list = []
    route_href_list, soup = find_route_urls(start_url, route_href_list)
    # Look the next page up afresh for every page, so a page without a pager
    # or without links ends the crawl instead of refetching a stale url.
    next_query = _next_page_query(soup)
    while next_query is not None:
        route_href_list, soup = find_route_urls(next_query, route_href_list)
        next_query = _next_page_query(soup)
    return route_href_list

In [41]:
def scrape_route_page(query):
    '''
    Scrape the summary details of one route page.

    INPUT
        - query - path of the route page, appended to https://www.mountainproject.com
    OUTPUT
        - star_url - href of the link inside the star summary
        - route_dict - dict with 'name', 'grade' and 'average_rating', plus
          'type', 'original_grade', 'FA', 'season', 'page_views' and
          'submitted_by' for each of those labels found in the details table
    '''
    url = "https://www.mountainproject.com%s" % query
    soup = soup_maker(url)
    page_tag = soup.find('div', {'id':'rspCol800'})
    star_summary = soup.find('span', {'id':'starSummaryText'})

    # make route dict
    route_dict = {}
    route_dict['name'] = page_tag.find('span', {'itemprop':'itemreviewed'}).text
    route_dict['grade'] = page_tag.find('span', {'class':'rateYDS'}).text
    # keep the three characters that follow "Average: " in the summary text
    route_dict['average_rating'] = star_summary.text.split('Average: ')[1][:3]
    # convert to string from unicode
    star_url = str(star_summary.find('a', href=True).get('href'))

    # text before the first ':' in a label cell -> key in route_dict
    label_to_key = {
        'Type': 'type',
        'Original': 'original_grade',
        'FA': 'FA',
        'Season': 'season',
        'Page Views': 'page_views',
        'Submitted By': 'submitted_by',
    }
    # Read the cells once. Each value sits in the cell right after its label,
    # so pairing neighbours also ignores a label in the very last cell.
    cells = page_tag.find('table').findAll('td')
    for label_cell, value_cell in zip(cells, cells[1:]):
        key = label_to_key.get(label_cell.text.split(':')[0])
        if key is not None:
            # Values stay as the parser returned them: str() on free text
            # fails for non-ASCII characters under Python 2.
            route_dict[key] = value_cell.text

    return star_url, route_dict

In [42]:
def scrape_ratings_by_user(query, route_name):
    '''
    Scrape the per-user star votes for one route.

    INPUT
        - query - path of the votes page, appended to https://www.mountainproject.com
        - route_name - value stored under 'route_name' in the result
    OUTPUT
        - user_url - href of the last voter that has a profile link, or None
          when no such link was found
        - rating_dict - {'route_name': ..., 'username': [...], 'rating': [...]}
          where the two lists are index-aligned
    RAISES
        - IndexError when the page has fewer than four tables
    '''
    rating_dict = {'route_name': route_name, 'username': [], 'rating':[]}
    url = "https://www.mountainproject.com%s" % query
    soup = soup_maker(url)
    tables = soup.findAll('table')
    if len(tables) < 4:
        raise IndexError('expected at least 4 tables on %s, found %d'
                         % (url, len(tables)))
    user_url = None
    # the 4th table is the one with star votes
    for row in tables[3].findAll('tr'):
        cells = row.findAll('td')
        # cells alternate: voter name, then that voter's stars
        for name_cell, stars_cell in zip(cells[0::2], cells[1::2]):
            pieces = stars_cell.text.split('Html(')
            if len(pieces) < 2 or not pieces[1][:1].isdigit():
                # not a vote cell; dropping the whole pair keeps the
                # username and rating lists aligned
                continue
            rating_dict['username'].append(name_cell.text) # username
            # the digit after 'Html(' minus one is stored as the rating
            rating_dict['rating'].append(int(pieces[1][0]) - 1)
            profile = name_cell.find('a', href=True)
            if profile is not None:
                user_url = str(profile.get('href')) #query for user url
    return user_url, rating_dict

In [58]:
def scrape_user(query):
    '''
    Scrape one user page and return the user's info.

    INPUT
        - query - path of the user page, appended to https://www.mountainproject.com
    OUTPUT
        - user_dict - dict with 'name', 'member_since', 'last_vist',
          'point_rank', 'total_points' and 'compliments', plus 'personal',
          'favorite_climbs', 'other_interests', 'likes_to_climb', 'trad',
          'sport', 'aid' and 'ice' for each of those labels found in the
          personalData block
    '''
    url = "https://www.mountainproject.com%s" % query
    soup = soup_maker(url)
    # make user dict
    user_dict = {}
    # kept as returned by the parser: str() fails on non-ASCII names in Python 2
    user_dict['name'] = soup.find('h1').text
    # The side bar is read by line position, so this depends on its layout.
    side_bar = soup.select('div.roundedBottom')[0].text.split('\n') # side bar
    user_dict['member_since'] = side_bar[3].split('Since: ')[1]
    user_dict['last_vist'] = side_bar[4].split('Visit: ')[1]  # key spelling (sic) left unchanged
    user_dict['point_rank'] = side_bar[9].split('Rank: # ')[1]
    user_dict['total_points'] = side_bar[10].split('Points: ')[1].replace(',', '')
    user_dict['compliments'] = side_bar[11].split(' Compliments')[0]

    # (marker, key) pairs, tried in this order; the first marker found in a
    # line wins. The same marker is used to detect and to split the line, so
    # a line that lacks the exact marker is skipped rather than raising.
    personal_fields = (
        ('Personal: ', 'personal'),
        ('Favorite Climbs: ', 'favorite_climbs'),
        ('Other Interests: ', 'other_interests'),
        ('Likes to climb: ', 'likes_to_climb'),
        ('Trad:', 'trad'),
        ('Sport:', 'sport'),
        ('Aid:', 'aid'),
        ('Ice:', 'ice'),
    )
    personal = soup.find('div',{'class': 'personalData'})
    personal_lines = personal.text.split('\n') if personal is not None else []
    for item in personal_lines:
        for marker, key in personal_fields:
            if marker in item:
                user_dict[key] = item.split(marker)[1]
                break
    return user_dict

In [59]:
def add_to_route_database(route_dict, mongo_uri='mongodb://localhost:27017/'):
    '''
    Insert one route document into the `routes` collection of the `routes` database.

    INPUT
        - route_dict - the document to store
        - mongo_uri - MongoDB connection string (defaults to the local server)
    '''
    client = MongoClient(mongo_uri)
    try:
        client.routes.routes.insert_one(route_dict)
    finally:
        # a new client is opened on every call, so release it once done
        client.close()

In [53]:
def add_to_rating_database(rating_dict, mongo_uri='mongodb://localhost:27017/'):
    '''
    Insert one rating document into the `ratings` collection of the `ratings` database.

    INPUT
        - rating_dict - the document to store
        - mongo_uri - MongoDB connection string (defaults to the local server)
    '''
    client = MongoClient(mongo_uri)
    try:
        client.ratings.ratings.insert_one(rating_dict)
    finally:
        # a new client is opened on every call, so release it once done
        client.close()

In [54]:
def add_to_user_database(user_dict, mongo_uri='mongodb://localhost:27017/'):
    '''
    Insert one user document into the `users` collection of the `users` database.

    INPUT
        - user_dict - the document to store
        - mongo_uri - MongoDB connection string (defaults to the local server)
    '''
    client = MongoClient(mongo_uri)
    try:
        client.users.users.insert_one(user_dict)
    finally:
        # a new client is opened on every call, so release it once done
        client.close()

In [None]:
# Route-finder query for the Washington search. It is built from adjacent
# string literals so the query string contains no line breaks or indentation.
washington_route_url = (
    '/scripts/Search.php?searchType=routeFinder&minVotes=0'
    '&selectedIds=105708966&type=rock'
    '&diffMinrock=800&diffMinboulder=20000&diffMinaid=70000'
    '&diffMinice=30000&diffMinmixed=50000'
    '&diffMaxrock=12400&diffMaxboulder=21400&diffMaxaid=75260'
    '&diffMaxice=38500&diffMaxmixed=60000'
    '&is_trad_climb=1&is_sport_climb=1&is_top_rope=1'
    '&stars=0&pitches=0&sort1=area&sort2=rating'
)
# returns all route urls
# Crawls the search results starting at washington_route_url and keeps the
# route hrefs that all_route_urls returns.
route_urls = all_route_urls(washington_route_url)

In [60]:
star_urls = []
for route_query in route_urls:
    # scrape the route page: returns the star-votes link and the route details
    star_url, route_dict = scrape_route_page(route_query)
    add_to_route_database(route_dict)
    user_url, rating_dict = scrape_ratings_by_user(star_url, route_dict['name'])
    add_to_rating_database(rating_dict)
    # only one user link comes back per route; skip the user page if there is none
    if user_url is not None:
        user_dict = scrape_user(user_url)
        add_to_user_database(user_dict)

IndexError: list index out of range

In [50]:
# Inspect what the last scrape_ratings_by_user call left in rating_dict.
type(rating_dict)

dict

In [89]:
# Read every document of the `route_info` collection (database `route_info`)
# into a DataFrame and show up to the first 100 rows.
# NOTE(review): add_to_route_database above writes to `routes`, not
# `route_info` — confirm which collection this is meant to read.
client = MongoClient('mongodb://localhost:27017/')
db = client.route_info
route_info = db.route_info
raw_data = route_info.find()
df = pd.DataFrame(list(raw_data))
df.head(100)

Unnamed: 0,FA,_id,grade,name,original_grade,page_views,season,stars,submitted_by,type
0,,58c7249740b44128791b7e3a,YDS: 5.7,5.7,YDS: 5.7 French: 5a Ewbanks: 15 UIAA: V+ ...,317,[],2 s,"Keenan Waeschle on Mar 31, 2010","Sport, 40'"
1,,58c724a140b44128791b7e3b,YDS: 5.8,5.8,YDS: 5.8 French: 5b Ewbanks: 16 UIAA: VI- ...,253,[],1.5,"Keenan Waeschle on Mar 31, 2010","Sport, 40'"
2,,58c724ad40b44128791b7e3c,YDS: Easy 5th,Brothers Traverse,YDS: Easy 5th French: 1+ Ewbanks: 3 UIAA: ...,4643,spring,3 s,"Eric Fjellanger on Jul 20, 2009","Trad, Alpine"
3,,58c724b840b44128791b7e3d,YDS: Easy 5th,South Corner,YDS: Easy 5th French: 1+ Ewbanks: 3 UIAA: ...,200,[],2.5,"Matt. B. on Sep 13, 2015","Trad, Alpine, 2 pitches, 150', Grade II"
4,1965 Arnie & Diane Bloomer,58c724c240b44128791b7e3e,YDS: 3rd,Honeymoon Route,YDS: 3rd French: 1- Ewbanks: 1 UIAA: I ZA...,11,[],2 s,"JeremyJ on Mar 3, 2017","Trad, Alpine"
5,?,58c724cd40b44128791b7e3f,YDS: 5.10a,The Dinner Plate,YDS: 5.10a French: 6a Ewbanks: 18 UIAA: VI...,839,[],3 s,"Joshua Dreher on Sep 28, 2010","Sport, 190'"
6,,58c726b740b44128791b7e43,YDS: 5.7,5.7,YDS: 5.7 French: 5a Ewbanks: 15 UIAA: V+ ...,317,[],2 s,"Keenan Waeschle on Mar 31, 2010","Sport, 40'"
7,,58c726c140b44128791b7e44,YDS: 5.8,5.8,YDS: 5.8 French: 5b Ewbanks: 16 UIAA: VI- ...,253,[],1.5,"Keenan Waeschle on Mar 31, 2010","Sport, 40'"
8,,58c726cc40b44128791b7e45,YDS: Easy 5th,Brothers Traverse,YDS: Easy 5th French: 1+ Ewbanks: 3 UIAA: ...,4643,spring,3 s,"Eric Fjellanger on Jul 20, 2009","Trad, Alpine"
9,,58c726d740b44128791b7e46,YDS: Easy 5th,South Corner,YDS: Easy 5th French: 1+ Ewbanks: 3 UIAA: ...,200,[],2.5,"Matt. B. on Sep 13, 2015","Trad, Alpine, 2 pitches, 150', Grade II"


In [87]:
import requests
from bs4 import BeautifulSoup
import time
from urllib import urlencode
import selenium.webdriver
import random
from pymongo import MongoClient
import json
import boto3
from route_url_scraper import (search_route_page, 
                               all_route_urls, 
                               scrape_route_page,
                               scrape_ratings_by_user,
                               scrape_user)
from store_in_database import (add_to_route_html_database,
                               add_to_rating_database,
                               add_to_user_html_database,
                               add_route_link_database)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
# Open the `route_url` collection (database `route_url`) and start a query
# over all of its documents. Despite its name, route_url_dict holds the
# result of find(), not a dict.
client = MongoClient('mongodb://localhost:27017/')
db = client.route_url
route_url = db.route_url
route_url_dict = route_url.find()

In [49]:
# Take the 'route_urls' field of the first stored document. list() consumes
# the whole query result to get at that first document.
route_href_list = list(route_url_dict)[0]['route_urls']

In [47]:
# Write a small throwaway object to the 'rockclimbingrecommender' bucket.
# NOTE(review): looks like a one-off check that S3 writes work — confirm, and
# remove the 'ThrowAwayZach' object if so.
aws = boto3.resource('s3')
ww_all = aws.Bucket('rockclimbingrecommender')
ww_all.put_object(Body='Zach',Key="ThrowAwayZach")

s3.Object(bucket_name='rockclimbingrecommender', key='ThrowAwayZach')

In [88]:
# The S3 handles are created but currently unused: the put_object uploads
# (keys under routes/, route_rating_by_user/ and users/) are disabled and the
# scraped data is handed to the add_to_*_database helpers instead.
aws = boto3.resource('s3')
ww_all = aws.Bucket('rockclimbingrecommender')
browser = selenium.webdriver.PhantomJS()#Firefox()
try:
    for route_href in route_href_list:
        star_url, route_html, route_name = scrape_route_page(route_href, browser)
        route_html_dict = {'route': route_name, 'html': route_html}
        add_to_route_html_database(route_html_dict)

        user_url, rating_dict = scrape_ratings_by_user(star_url, browser)
        rating_dict['route'] = route_name
        add_to_rating_database(rating_dict)

        user_html, user_name = scrape_user(user_url, browser)
        user_html_dict = {'username': user_name, 'html': user_html}
        add_to_user_html_database(user_html_dict)
finally:
    # Runs on errors and on KeyboardInterrupt too, so the headless browser
    # is shut down even when the loop stops early.
    browser.quit()

KeyboardInterrupt: 

In [89]:
# Route the loop was working on when it stopped.
route_href

u'/v/damnation-crack/105809780'

In [92]:
# Position of that route in the list, used as the offset for resuming.
route_href_list.index(route_href)

834

In [97]:
# The S3 handles are created but currently unused: the put_object uploads
# (keys under routes/, route_rating_by_user/ and users/) are disabled and the
# scraped data is handed to the add_to_*_database helpers instead.
aws = boto3.resource('s3')
ww_all = aws.Bucket('rockclimbingrecommender')
browser = selenium.webdriver.PhantomJS()#Firefox()
try:
    # Resume at index 834. The route at that index is scraped again here.
    # NOTE(review): if it was already stored before the earlier run stopped,
    # it ends up in the database twice — confirm and deduplicate.
    for route_href in route_href_list[834:]:
        star_url, route_html, route_name = scrape_route_page(route_href, browser)
        route_html_dict = {'route': route_name, 'html': route_html}
        add_to_route_html_database(route_html_dict)

        user_url, rating_dict = scrape_ratings_by_user(star_url, browser)
        rating_dict['route'] = route_name
        add_to_rating_database(rating_dict)

        user_html, user_name = scrape_user(user_url, browser)
        user_html_dict = {'username': user_name, 'html': user_html}
        add_to_user_html_database(user_html_dict)
finally:
    # Runs on errors and on KeyboardInterrupt too, so the headless browser
    # is shut down even when the loop stops early.
    browser.quit()

KeyboardInterrupt: 

In [98]:
# Route the loop was working on when it stopped.
route_href

u'/v/like-honey/106918228'

In [99]:
# Position of that route in the list, used as the offset for resuming.
route_href_list.index(route_href)

3559

In [100]:
# Total number of route hrefs, to compare against the resume offset.
len(route_href_list)

3744

In [101]:
# The S3 handles are created but currently unused: the put_object uploads
# (keys under routes/, route_rating_by_user/ and users/) are disabled and the
# scraped data is handed to the add_to_*_database helpers instead.
aws = boto3.resource('s3')
ww_all = aws.Bucket('rockclimbingrecommender')
browser = selenium.webdriver.PhantomJS()#Firefox()
try:
    # Resume at index 3559. The route at that index is scraped again here.
    for route_href in route_href_list[3559:]:
        star_url, route_html, route_name = scrape_route_page(route_href, browser)
        route_html_dict = {'route': route_name, 'html': route_html}
        add_to_route_html_database(route_html_dict)

        user_url, rating_dict = scrape_ratings_by_user(star_url, browser)
        rating_dict['route'] = route_name
        add_to_rating_database(rating_dict)

        user_html, user_name = scrape_user(user_url, browser)
        user_html_dict = {'username': user_name, 'html': user_html}
        # TODO: check if name in database before storing
        add_to_user_html_database(user_html_dict)
finally:
    # Runs on errors and on KeyboardInterrupt too, so the headless browser
    # is shut down even when the loop stops early.
    browser.quit()

AttributeError: 'NoneType' object has no attribute 'find'

In [103]:
# Position in the list of the route the loop was on when it failed.
route_href_list.index(route_href)

3726

In [4]:
# Connect to the local MongoDB server and select the `route_html_collection` database.
client = MongoClient('mongodb://localhost:27017/')
db = client.route_html_collection

In [7]:
# Load every stored route-html document into memory, then show the ten
# documents at positions 830-839 (around resume offset 834) to check for
# routes that were stored twice.
route_html_list = list(db.route_html_collection.find())
pd.DataFrame(route_html_list[830:840])

Unnamed: 0,_id,html,route
0,58c8ab2240b44102a84b016d,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",Crystal Ship
1,58c8ab3140b44102a84b0173,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",Cutting Teeth
2,58c8ab4240b44102a84b0179,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",D
3,58c8ab4a40b44102a84b017f,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",Dad's Nuts
4,58c8ab5540b44102a84b0185,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",Damnation Crack
5,58c8c58940b44102a84b0187,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",Damnation Crack
6,58c8c59840b44102a84b018d,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",Day Tripper
7,58c8c5a540b44102a84b0193,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",Death By Jenga
8,58c8c5b440b44102a84b0199,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",Deception Crack
9,58c8c5c140b44102a84b019f,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",Delayed Adolescence


In [None]:
# Show the stored documents at positions 3558 and 3559 (around resume offset 3559).
pd.DataFrame(route_html_list[3558:3560])