In [46]:
# Trying to get hyperlinks for all states so we can run the scraping script on all states.

# Imports and get the html data from MountainProject. 
import pandas as pd
import numpy as np
import string
import requests
from bs4 import BeautifulSoup

url = 'https://www.mountainproject.com/route-guide'
page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')

# Put all hyperlinks on the page into a list.
data = []
for link in soup.find_all('a', class_= 'text-truncate float-xs-left'):
    data.append(link.get('href'))
    
# List of US States that we'll use to remove unwanted hyperlinks.
# Mississippi and Nebrasaka don't have pages, which makes sense,
# considering how flat they are.
states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New-Hampshire","New-Jersey","New-Mexico","New-York",
  "North-Carolina","North-Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode-Island","South-Carolina","South-Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West-Virginia","Wisconsin","Wyoming"]

# List of US States and their abbreviations for a dictionary.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New-Hampshire': 'NH',
    'New-Jersey': 'NJ',
    'New-Mexico': 'NM',
    'New-York': 'NY',
    'North-Carolina': 'NC',
    'North-Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Palau': 'PW',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode-Island': 'RI',
    'South-Carolina': 'SC',
    'South-Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West-Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

# Make the states lowercase to match hyperlinks. 
states = [x.lower() for x in states]
us_state_abbrev = dict((k.lower(),v) for k,v in us_state_abbrev.items())

# Loop through each hyperlink and see if it contains a state as a subsrting.
# If it does, add the link to a list containing links for each state page. 
US = []
for datum in data:
    for state in states:
        if state in datum:
            US.append([state,datum])
        
# Remove duplicate links, Vermont Ice+Mixed, and Tennessee Wall (they are duplicates!)
# Check length. Should be 48. 
res = [] 
[res.append(x) for x in US if x not in res] 
# US=list(set(US))
res.remove(['tennessee','https://www.mountainproject.com/area/105851828/the-tennessee-wall'])
res.remove(['vermont','https://www.mountainproject.com/area/107280521/vermont-ice-and-mixed'])
links = dict(res)
print(len(links))

48


In [2]:
# Trying to get number of crags on an individual state page.

import pandas as pd
import requests
from bs4 import BeautifulSoup

def getData(url):

    page = requests.get(url)

    # Getting the webpage, creating a Response object.
    response = requests.get(url)
 
    # Extracting the source code of the page.
    data = response.text
 
    # Passing the source code to Beautiful Soup to create a BeautifulSoup object for it.
    soup = BeautifulSoup(data, 'lxml')
 
    # Extracting all the <a> tags into a list. No ID or class or anything next to hyperlink in HTML
    titles = soup.findAll('a')
    climbs = soup.findAll('span',class_ ='text-warm')
 
    # Extracting text from the the <a> and put into list. 
    areas = [x.text for x in titles]
    climbs = [y.text for y in climbs ]
            
    return areas, climbs


# Define function to get location of a string that always occurs after end of list of crags.
def getIndexes(dfObj, value):
    ''' Get index positions of value in dataframe i.e. dfObj.'''
 
    listOfPos = list()
    # Get bool dataframe with True at positions where the given value exists
    result = dfObj.isin([value])
    # Get list of columns that contains the value
    seriesObj = result.any()
    columnNames = list(seriesObj[seriesObj == True].index)
    # Iterate over list of columns and fetch the rows indexes where value exists
    for col in columnNames:
        rows = list(result[col][result[col] == True].index)
        for row in rows:
            listOfPos.append((row, col))
    # Return a list of tuples indicating the positions of value in the dataframe
    return listOfPos

# Define function to remove the unwanted text from the dataframe. Pages with slight differences
# in their layout needed the cutOff point to be manually entered.
def adjustDF(url,state):

    areas, climbs = getData(url)
    
    # Cleaning up areas DataFrame
    areas = pd.DataFrame(list(filter(None,areas)))
    areas.columns = ['Crags']
    areas = areas[~areas['Crags'].str.contains('\n')]
    # Actual list of climbing areas starts 1 after 'Climbing Area map', and ends 2 before it.
    places = getIndexes(areas,'Climbing Area Map')
    cleaned_areas = areas.loc[places[0][0] + 1:places[1][0] - 2]
    
    # Cleaning up number of climbs DataFrame
    climbs = pd.DataFrame(list(filter(None,climbs)))
    climbs.columns = ['Climbs']
    climbs['Climbs'] = [s.strip() for s in climbs['Climbs']]
    
    # Make DataFrame combining fields. The climbs scrape starts with the number of climbs for
    # the first area, meaning it will line up with the proper crag. 
    df = pd.DataFrame(list(zip(cleaned_areas['Crags'],climbs['Climbs'])))

    # Add state column
    df = df.assign(ST = us_state_abbrev[state])
    return df

In [3]:
# Make combined dataframe for all crags in all states using gathered hyperlinks.
def makeDataFrame(links):
    # Instantiate empty dataframe
    df = pd.DataFrame()
    #Loop through links.
    for sta, link in links.items():
        # Add state dataframes on top of each other.
        df = df.append(adjustDF(link,sta))
    return df

In [113]:
all_df = makeDataFrame(links)
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1138 entries, 0 to 31
Data columns (total 3 columns):
0     1138 non-null object
1     1138 non-null object
ST    1138 non-null object
dtypes: object(3)
memory usage: 35.6+ KB


In [6]:
#all_df = all_df[all_df['Crags']!='Climbing Area Map']
#all_df = all_df[~all_df['Crags'].str.contains('\n')]

In [114]:
all_df.columns=['Crags', 'Climbs', 'ST']
# Remove Punctuation from Climbs
all_df['Climbs'] = [s.translate(str.maketrans('', '', string.punctuation)) for s in all_df['Climbs']]
# Convert Climbs values to integers
all_df['Climbs'] = [int(x) for x in all_df['Climbs']]
# Make dataframe to count number of crags per state. 
crag_counts = pd.DataFrame(all_df['ST'].value_counts())
crag_counts.columns = ['Crags']
crag_counts.head()

Unnamed: 0,Crags
MA,110
CO,74
NC,62
UT,53
WI,53


In [115]:
# Make dataframe for aggregating climb counts at the State level.
climb_counts = all_df.drop(columns = ['Crags'])
climb_counts = climb_counts.set_index('ST')
climb_counts = climb_counts.sum(level = 'ST')

Unnamed: 0,Climbs,ST
0,2,AL
1,10,AL
2,1,AL
3,32,AL
4,12,AL


In [120]:
# Make aggregate dataFrame.
agg_df = climb_counts.join(crag_counts)

In [6]:
import chart_studio.plotly as py
import plotly.graph_objs as go 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [132]:
data = dict(type='choropleth',
            colorscale = 'inferno',
            locations = list(agg_df.index.values),
            z = agg_df['Climbs'],
            locationmode = 'USA-states',
            #text = df['text'],
            text = agg_df['Crags'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 2)),
            colorbar = {'title':"Climbing"}
            ) 

In [136]:
layout = dict(title = 'Number of Climbs By State',
              geo = dict(scope='usa',
                         showlakes = True,
                         lakecolor = 'rgb(85,173,240)')
             )

In [134]:
choromap = go.Figure(data = [data],layout = layout)

In [135]:
iplot(choromap)

  # Iplot colorschemes
  ['aggrnyl', 'agsunset', 'algae', 'amp', 'armyrose', 'balance',
             'blackbody', 'bluered', 'blues', 'blugrn', 'bluyl', 'brbg',
             'brwnyl', 'bugn', 'bupu', 'burg', 'burgyl', 'cividis', 'curl',
             'darkmint', 'deep', 'delta', 'dense', 'earth', 'edge', 'electric',
             'emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens', 'greys',
             'haline', 'hot', 'hsv', 'ice', 'icefire', 'inferno', 'jet',
             'magenta', 'magma', 'matter', 'mint', 'mrybm', 'mygbm', 'oranges',
             'orrd', 'oryel', 'peach', 'phase', 'picnic', 'pinkyl', 'piyg',
             'plasma', 'plotly3', 'portland', 'prgn', 'pubu', 'pubugn', 'puor',
             'purd', 'purp', 'purples', 'purpor', 'rainbow', 'rdbu', 'rdgy',
             'rdpu', 'rdylbu', 'rdylgn', 'redor', 'reds', 'solar', 'spectral',
             'speed', 'sunset', 'sunsetdark', 'teal', 'tealgrn', 'tealrose',
             'tempo', 'temps', 'thermal', 'tropic', 'turbid', 'twilight',
             'viridis', 'ylgn', 'ylgnbu', 'ylorbr', 'ylorrd'].