In [3]:
#Trying to get hyperlinks for all states so we can run the scraping script on all states.

#Imports and get the html data from MountainProject. 
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

url = 'https://www.mountainproject.com/route-guide'
page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')

#Put all hyperlinks on the page into a list.
data = []
for link in soup.find_all('a', class_= 'text-truncate float-xs-left'):
    data.append(link.get('href'))
    
#List of US States that we'll use to remove unwanted hyperlinks.
#Mississippi and Nebrasaka don't have pages, which makes sense.
states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New-Hampshire","New-Jersey","New-Mexico","New-York",
  "North-Carolina","North-Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode-Island","South-Carolina","South-Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West-Virginia","Wisconsin","Wyoming"]

#List of US States and their abbreviations for a dictionary.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New-Hampshire': 'NH',
    'New-Jersey': 'NJ',
    'New-Mexico': 'NM',
    'New-York': 'NY',
    'North-Carolina': 'NC',
    'North-Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Palau': 'PW',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode-Island': 'RI',
    'South-Carolina': 'SC',
    'South-Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West-Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

#Make the states lowercase to match hyperlinks. 
states = [x.lower() for x in states]
us_state_abbrev = dict((k.lower(),v) for k,v in us_state_abbrev.items())

#Loop through each hyperlink and see if it contains a state as a subsrting.
#If it does, add the link to a list containing links for each state page. 
US = []
for datum in data:
    for state in states:
        if state in datum:
            US.append([state,datum])
        
#Remove duplicate links, Vermont Ice+Mixed, and Tennessee Wall (they are duplicates!)
#Check length. Should be 48. 
res = [] 
[res.append(x) for x in US if x not in res] 
#US=list(set(US))
res.remove(['tennessee','https://www.mountainproject.com/area/105851828/the-tennessee-wall'])
res.remove(['vermont','https://www.mountainproject.com/area/107280521/vermont-ice-and-mixed'])
links = dict(res)
print(len(links))

48


In [69]:
#Trying to get number of crags on an individual state page.

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.mountainproject.com/area/105708959/california'

def getData(url):

    page = requests.get(url)

    # Getting the webpage, creating a Response object.
    response = requests.get(url)
 
    # Extracting the source code of the page.
    data = response.text
 
    # Passing the source code to Beautiful Soup to create a BeautifulSoup object for it.
    soup = BeautifulSoup(data, 'lxml')
 
    # Extracting all the <a> tags into a list. No ID or class or anything next to hyperlink in HTML
    titles = soup.findAll('a')
 
    # Extracting text from the the <a> and put into list. 
    final = []
    for title in titles:
        final.append(title.text)
            
    df = pd.DataFrame(final)
    return df



#Define function to get location of a string that always occurs after end of list of crags.
def getIndexes(dfObj, value):
    ''' Get index positions of value in dataframe i.e. dfObj.'''
 
    listOfPos = list()
    # Get bool dataframe with True at positions where the given value exists
    result = dfObj.isin([value])
    # Get list of columns that contains the value
    seriesObj = result.any()
    columnNames = list(seriesObj[seriesObj == True].index)
    # Iterate over list of columns and fetch the rows indexes where value exists
    for col in columnNames:
        rows = list(result[col][result[col] == True].index)
        for row in rows:
            listOfPos.append((row, col))
    # Return a list of tuples indicating the positions of value in the dataframe
    return listOfPos

#Define function to remove the unwanted text from the dataframe. Pages with slight differences
#in their layout needed the cutOff point to be manually entered.
def adjustDF(url,state):

    df = pd.DataFrame()
    df = getData(url)

    #Find location of desired string (the first string after last crag), remove everything after it. 
    if 'delaware' in url:
        cutOff = [(48,0)]
    elif 'illinois' in url:
        cutOff = [(74,0)]
    elif 'kentucky' in url:
        cutOff = [(56,0)]
    elif 'louisiana' in url:
        cutOff = [(47,0)]
    elif 'new-jersey' in url:
        cutOff = [(51,0)]
    elif 'rhode-island' in url:
        cutOff = [(67,0)]
    else:
        cutOff = getIndexes(df,'\nPrevious\n')
        
    df.drop(df.index[cutOff[0][0]:(len(df))],inplace = True)


    #Remove text displayed before climbing spots.
    if 'new-jersey' in url:
        df.drop(df.index[0:45], inplace = True)
    else:
        df.drop(df.index[0:46], inplace = True)

    #Rename column to 'crags'
    df.columns = ['Crags']

    #Add state column
    df = df.assign(ST = us_state_abbrev[state])
    
    return df

In [74]:
#Make combined dataframe for all crags in all states using gathered hyperlinks.
def makeDataFrame(links):
    #Instantiate empty dataframe
    hope = pd.DataFrame()
    #Loop through links.
    for sta, link in links.items():
        #Add state dataframes on top of each other.
        hope = hope.append(adjustDF(link,sta))
    return hope

In [78]:
all_df = makeDataFrame(links)
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1167 entries, 46 to 77
Data columns (total 2 columns):
Crags    1167 non-null object
ST       1167 non-null object
dtypes: object(2)
memory usage: 27.4+ KB
