# Data Science in Practice - Scraping - 1002

## Adding Climber Data to Ascents

## Scraping climber data and adding info to their ascents. 

Author : Eoghan Cunningham

In [7]:
import os
import urllib.request

import numpy as np
import pandas as pd
import scrapy
from bs4 import BeautifulSoup
from loguru import logger
from scrapy.crawler import CrawlerProcess

Below are the necessary links to access route and climber details respectively. As you can see, I've set them up so they can be formatted with the route or climber ID, and I tested it quickly with my own ID.

In [8]:
# Local folders: raw scraped climber pages, and the per-climber stats derived from them.
ukc_climbers_dir = '../data/raw/'
ukc_climbers_stats_dir = '../data/stats/'

# UKC logbook graphs page; the '{}' placeholder is filled with a climber id.
graphs_link = 'https://www.ukclimbing.com/logbook/showgraph.php?id={}'

# British grade labels.
british_grades = [
    'S', 'HS', 'VS', 'HVS',
    'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8',
]

# Ascents gathered previously; loaded once and used by the cells below.
ascents_file = '../data/ascents_data.csv'
ascents = pd.read_csv(ascents_file)

In [9]:
# Preview the first few rows of the loaded ascents.
ascents.head()

Unnamed: 0.1,Unnamed: 0,name,date,style,comment,climber_id,grade,location
0,0,Hurricane,"15 Oct, 2018",Lead O/S,Another brilliant FH E2. Lead both pitches and...,57132,E2,Fair Head
1,2,Hurricane,"19 Jul, 2018",Lead O/S,Led both pitch,233566,E2,Fair Head
2,3,Hurricane,"7 Jul, 2018",AltLd O/S,with Robert Duran,133563,E2,Fair Head
3,4,Hurricane,"25 Jun, 2018",AltLd O/S,Great splitter climbing.with Mike Hutton,12933,E2,Fair Head
4,5,Hurricane,"4 Jun, 2018",AltLd O/S,Amazing! Led p2with Aggie,158515,E2,Fair Head


The ascents file contains an old index. We can drop this. 

In [4]:
# Drop the stale index column that was saved into the CSV.
# errors='ignore' keeps this cell re-runnable: `ascents` is reassigned here, so a
# second run would otherwise raise KeyError because the column is already gone.
ascents = ascents.drop(columns=['Unnamed: 0'], errors='ignore')
ascents.sample(10)

Unnamed: 0,name,date,style,comment,climber_id,grade,location
42606,Great Portland Street,"11 Jun, 2011",Lead O/S,"Take lots of small wires. Found the crux okay,...",100531,HVS,Millstone Edge
19654,Army Dreamers,"24 Apr, 2005",Lead O/S,with Patrick Stockreisser,24102,HVS,St. Govan's Head
50337,The Plum,"13 Jun, 1993",Lead O/S,,51446,E1,Craig Bwlch y Moch (Tremadog)
36964,Knightsbridge,"28 Aug, 2018",Lead O/S,,123606,E2,Millstone Edge
35783,Eros,"9 Dec, 2015",Lead rpt,mad hotaches!!with JemG,155392,E1,Millstone Edge
46379,The Corner,"3 Jul, 2018",AltLd O/S,,148727,HVS,Clogwyn Du'r Arddu (Cloggy)
51489,One Step in the Clouds,"28 May, 2017",AltLd O/S,"I led P1 & P3, Tim led P2with tim_mcd",116604,VS,Craig Bwlch y Moch (Tremadog)
38577,Embankment 2,"?Oct, 2015",Lead O/S,,199099,VS,Millstone Edge
28559,Wall Climb,"8 Jun, 2015",Lead O/S,with mlok2000,117651,VS,Curbar Edge
56614,Rienetta,"11 Mar, 2000",AltLd O/S,with Dan Bailey,43516,HS,Craig Bwlch y Moch (Tremadog)


In [5]:
# Size of the dataset: number of ascent rows and number of distinct climbers.
n_rows = len(ascents)
n_climbers = ascents['climber_id'].nunique()
print('Data contains ', n_rows, 'and ', n_climbers, 'unique climbers')

Data contains  58953 and  5419 unique climbers


Each ascent is associated with a climber_id. We can use this to obtain further information about a climber. From the numbers above, we can see our average climber is responsible for about 10 ascents. By storing this climber data locally we can reduce our requests to UKC by a factor of 10. 

In [6]:
# The following Spider takes each climber id in the ascents dataframe and checks if it has already been stored locally,
# If not it is downloaded.

class UKCSpider(scrapy.Spider):
    """Download the UKC logbook graphs page of every climber found in `ascents`.

    Each page is saved as ``<ukc_climbers_dir><climber_id>.html``. Climbers whose
    file already exists are skipped, so an interrupted crawl can be resumed
    without requesting the same page again.
    """

    name = "ukc_spider"

    def start_requests(self):
        """Yield one request per distinct climber that has no local file yet."""
        # parse() writes into this directory; create it up front so a missing
        # folder does not make every response callback fail.
        os.makedirs(ukc_climbers_dir, exist_ok=True)

        # Iterate distinct ids only: the same climber appears on many ascent rows.
        for climber_id in ascents['climber_id'].unique():
            filename = '{}{}.html'.format(ukc_climbers_dir, climber_id)

            if os.path.isfile(filename):
                # Already stored locally.
                continue

            yield scrapy.Request(
                url=graphs_link.format(climber_id),
                callback=self.parse,
                # The target filename travels with the request so parse() knows
                # where to store the response.
                meta={'climber_id': climber_id, 'filename': filename},
            )

    def parse(self, response):
        """Write the response text to the file chosen in start_requests."""
        filename = response.meta['filename']
        with open(filename, 'w') as f:
            f.write(response.text)
        self.log('Saved file %s' % filename)
        
# Create a crawler process; no settings are passed, so Scrapy's defaults apply.
process = CrawlerProcess()

# Schedule the spider, then run the crawl.
# NOTE(review): process.start() presumably blocks until the crawl has finished, and
# the underlying Twisted reactor cannot be restarted, so re-running this cell
# likely requires a fresh kernel — confirm against the Scrapy docs.
process.crawl(UKCSpider)
process.start() 

2019-05-06 14:36:27 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2019-05-06 14:36:27 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 18.9.0, Python 3.7.1 (default, Dec 14 2018, 13:28:58) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1a  20 Nov 2018), cryptography 2.4.2, Platform Darwin-15.6.0-x86_64-i386-64bit
2019-05-06 14:36:27 [scrapy.crawler] INFO: Overridden settings: {}
2019-05-06 14:36:27 [scrapy.extensions.telnet] INFO: Telnet Password: 7bb13105d231447a
2019-05-06 14:36:27 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2019-05-06 14:36:27 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.D

The function **add_climber_data** (defined further below) takes a dataframe containing ascents and calls a helper function, **read_stats**, to retrieve and add some info to each ascent. The helper takes the climber's ID and the year of the ascent and returns information on where they are local to and what grades they climbed that year. First, **produce_stats_for_climber** extracts these per-year stats from each climber's saved page and stores them as a CSV.


In [7]:
def produce_stats_for_climber(climber_id):
    """Extract a climber's per-year grade stats from their saved page and store them as CSV.

    Reads ``<ukc_climbers_dir><climber_id>.html`` and writes
    ``<ukc_climbers_stats_dir><climber_id>.csv`` with the columns
    ``year`` (index), ``avg_grade``, ``max_grade`` and ``local_to``.

    If the page is missing or does not contain the expected elements, a
    'no climber data' message is logged and no CSV is written. Errors raised
    while writing the CSV are not suppressed, so a missing output directory is
    not mistaken for missing climber data.

    Parameters
    ----------
    climber_id
        Climber id; used to build both the input and the output file name.

    Returns
    -------
    None
    """
    logger.debug('producing stats for climber {}'.format(climber_id))

    filename = '{}{}.html'.format(ukc_climbers_dir, climber_id)
    try:
        # Context manager so the file handle is always closed.
        with open(filename, "r") as f:
            source = f.read()
    except FileNotFoundError:
        # No page was saved for this climber; treat it like a page without data
        # instead of aborting the caller's loop over all climbers.
        logger.info('no climber data for {}'.format(climber_id))
        return

    soup = BeautifulSoup(source, 'html.parser')
    try:
        # the crag (location) that a climber visits most is the first text field found after the divider with the id 'crag'
        local_to = soup.find("div", {"id": "crag"}).find('a').text

        # average and max grades are stored in the table that is accessed below.
        # 'gradetype2' is their title for trad climbing and 'British' indicates the british grading system.
        heading = soup.find("div", {"id": "gradetype2"}).find("h5", string='British')
        table = heading.next_sibling.next_sibling
        rows = table.find_all('tr')

        # Skip the first row; from each remaining row keep the first cell (year)
        # and the last two cells (average and max grade).
        stats = []
        for row in rows[1:]:
            tds = row.find_all('td')
            stats.append([tds[0].string, tds[-2].string, tds[-1].string, local_to])
    except Exception:
        # Page lacks the expected structure. `Exception` (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        logger.info('no climber data for {}'.format(climber_id))
        return

    df = pd.DataFrame(stats, columns=['year', 'avg_grade', 'max_grade', 'local_to']).set_index('year')
    df.to_csv("{}{}.csv".format(ukc_climbers_stats_dir, climber_id))

    return

In [8]:
# Distinct climber ids appearing in the ascents; their count is shown as the cell output.
climber_ids = ascents['climber_id'].unique()
len(climber_ids)

5419

In [9]:
# Build the per-climber stats CSV for every distinct climber id.
for climber_id in climber_ids:
    produce_stats_for_climber(climber_id)

2019-04-20 13:18:35.874 | DEBUG    | __main__:produce_stats_for_climber:2 - producing stats for climber 57132
2019-04-20 13:18:36.134 | DEBUG    | __main__:produce_stats_for_climber:2 - producing stats for climber 233566
2019-04-20 13:18:36.237 | DEBUG    | __main__:produce_stats_for_climber:2 - producing stats for climber 133563
2019-04-20 13:18:36.488 | DEBUG    | __main__:produce_stats_for_climber:2 - producing stats for climber 12933
2019-04-20 13:18:36.802 | DEBUG    | __main__:produce_stats_for_climber:2 - producing stats for climber 158515
2019-04-20 13:18:37.004 | DEBUG    | __main__:produce_stats_for_climber:2 - producing stats for climber 151893
2019-04-20 13:18:37.358 | DEBUG    | __main__:produce_stats_for_climber:2 - producing stats for climber 199444
2019-04-20 13:18:37.856 | DEBUG    | __main__:produce_stats_for_climber:2 - producing stats for climber 35159
2019-04-20 13:18:38.079 | DEBUG    | __main__:produce_stats_for_climber:2 - producing stats for climber 41303
2019-

In [19]:
def add_climber_data(route):
    """Add each climber's stats for the year of the ascent to every row of `route`.

    For every ascent the climber's stats CSV
    (``<ukc_climbers_stats_dir><climber_id>.csv``) is looked up at the year
    taken from the last four characters of the ``date`` column. Rows for which
    no value can be obtained (no stats file, year not present, unparsable
    date, ...) get NaN in all three new columns.

    Parameters
    ----------
    route : pandas.DataFrame
        Ascents with at least the columns ``climber_id`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        The same object as `route`, with the columns ``local_to``,
        ``avg_grade`` and ``max_grade`` added in place.
    """
    # Stats frames already loaded, keyed by climber id. A climber has many
    # ascents, so each CSV is read and parsed once instead of once per row.
    # None marks a climber whose stats file could not be loaded.
    stats_by_climber = {}

    def load_stats(climber_id):
        # Return the cached stats frame for the climber, loading it on first use.
        if climber_id not in stats_by_climber:
            filename = '{}{}.csv'.format(ukc_climbers_stats_dir, climber_id)
            try:
                stats_by_climber[climber_id] = pd.read_csv(filename).set_index('year')
            except Exception:
                stats_by_climber[climber_id] = None
        return stats_by_climber[climber_id]

    def read_stats(climber_id, year, count):
        # Look up (local_to, avg_grade, max_grade) for one climber and year.
        logger.debug('climber : {}, year : {}, count : {}'.format(climber_id, year, count))
        stats = load_stats(climber_id)
        if stats is None:
            raise LookupError('no stats available for climber {}'.format(climber_id))
        row = stats.loc[int(year)]
        return row['local_to'], row['avg_grade'], row['max_grade']

    local_tos, avg_grades, max_grades = [], [], []
    for count, (_, row) in enumerate(route.iterrows()):
        try:
            local_to, avg_grade, max_grade = read_stats(row['climber_id'], row['date'][-4:], count)
        except Exception:
            # Best effort: any lookup failure becomes missing values. Catching
            # `Exception` (not a bare except) keeps the loop interruptible.
            local_to, avg_grade, max_grade = np.nan, np.nan, np.nan
        local_tos.append(local_to)
        avg_grades.append(avg_grade)
        max_grades.append(max_grade)

    route['local_to'] = local_tos
    route['avg_grade'] = avg_grades
    route['max_grade'] = max_grades

    return route


In [20]:
# add_climber_data adds its columns to the frame it is given and returns that same
# object, so `ascents_complete` and `ascents` refer to one DataFrame after this cell.
ascents_complete = add_climber_data(ascents)

2019-04-20 13:49:39.759 | DEBUG    | __main__:read_stats:7 - climber : 57132, year : 2018, count : 0
2019-04-20 13:49:39.766 | DEBUG    | __main__:read_stats:7 - climber : 233566, year : 2018, count : 1
2019-04-20 13:49:39.772 | DEBUG    | __main__:read_stats:7 - climber : 133563, year : 2018, count : 2
2019-04-20 13:49:39.780 | DEBUG    | __main__:read_stats:7 - climber : 12933, year : 2018, count : 3
2019-04-20 13:49:39.788 | DEBUG    | __main__:read_stats:7 - climber : 158515, year : 2018, count : 4
2019-04-20 13:49:39.795 | DEBUG    | __main__:read_stats:7 - climber : 151893, year : 2018, count : 5
2019-04-20 13:49:39.804 | DEBUG    | __main__:read_stats:7 - climber : 199444, year : 2018, count : 6
2019-04-20 13:49:39.813 | DEBUG    | __main__:read_stats:7 - climber : 35159, year : 2017, count : 7
2019-04-20 13:49:39.821 | DEBUG    | __main__:read_stats:7 - climber : 41303, year : 2017, count : 8
2019-04-20 13:49:39.830 | DEBUG    | __main__:read_stats:7 - climber : 199444, year : 

In [21]:
# Persist the enriched ascents.
# NOTE(review): no `index` argument is passed, so the DataFrame index is written as
# an extra unnamed column — presumably how the 'Unnamed: 0' columns seen earlier
# arose. Consider index=False after confirming no downstream reader relies on it.
ascents_complete.to_csv("../data/ascents_data_with_climber_info.csv")