# forked from metaodi/fis-alpine-skiing-scraper
# scraper.py — 90 lines (83 loc), 3.76 KB
DATABASE_NAME = 'data.sqlite'
import os
os.environ['SCRAPERWIKI_DATABASE_NAME'] = DATABASE_NAME
import scraperwiki
import lxml.html
import urlparse
from pprint import pprint
# Template describing one race-result row; every field starts out unset (None).
# NOTE(review): this schema is not referenced elsewhere in the visible code.
result_schema = dict.fromkeys((
    'rank',
    'bib',
    'fis_code',
    'name',
    'year_of_birth',
    'nation',
    'run_1',
    'run_2',
    'total_time',
    'diff',
    'fis_points',
))
# Generator of FIS race links.
def race_link_results(url):
    """Yield ``(race_link, extra)`` tuples for every race reachable from *url*.

    Scrapes the FIS results overview page, follows each event link found in
    the status column, and for each race row on the event page yields:

    - ``race_link`` -- the href of the race-results link (third element of
      the lxml ``iterlinks()`` tuple);
    - ``extra`` -- a dict with date, place, country, codex and discipline
      pulled from the row's cells.

    Performs one HTTP request per event page via ``scraperwiki.scrape``.
    """
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    # First fisfootable on the page is assumed to be the results overview
    # -- TODO confirm against the live markup.
    result_table = root.cssselect("table.fisfootable")[0]
    status_cells = result_table.cssselect("td.status")
    for status_cell in status_cells:
        result_div = status_cell.cssselect("div")[0]
        # iterlinks() yields (element, attribute, link, pos) tuples.
        for element, attribute, link, pos in result_div.iterlinks():
            event_page = scraperwiki.scrape(link)
            event_root = lxml.html.fromstring(event_page)
            event_rows = event_root.cssselect("table.footable tbody tr")
            for event_row in event_rows:
                # The last cell of a row holds the link(s) to the race results.
                last_cell = event_row.cssselect("td:last-child")[0]
                for race_links in last_cell.iterlinks():
                    print get_cell_value(event_row.cssselect("td")[2], "span a")
                    # Column layout assumed: 1=date, 2=place, 3=country,
                    # 4=codex, 5=discipline -- verify against the event page.
                    extra = {
                        'date': get_cell_value(event_row.cssselect("td")[1], "span a"),
                        'place': get_cell_value(event_row.cssselect("td")[2], "span a"),
                        'country': get_cell_value(event_row.cssselect("td")[3], "a span"),
                        'codex': get_cell_value(event_row.cssselect("td")[4], "a"),
                        'discipline': get_cell_value(event_row.cssselect("td")[5], "a"),
                    }
                    # race_links is (element, attribute, link, pos); [2] is the href.
                    yield (race_links[2], extra)
def get_cell_value(element, css):
    """Return the text content of the first node under *element* matching *css*."""
    matches = element.cssselect(css)
    first_match = matches[0]
    return first_match.text_content()
# Overview page listing FIS cross-country race results.
FIS_URL = "http://data.fis-ski.com/cross-country/results.html"

# Walk every race linked from the overview, scrape its result table and
# persist one row per (event, athlete) plus one row per race.
for link, raceinfo in race_link_results(FIS_URL):
    html = scraperwiki.scrape(link)
    root = lxml.html.fromstring(html)
    result_table = root.cssselect("table.footable")[0]
    result_cells = result_table.cssselect("tr")
    for result_cell in result_cells:
        print(result_cell.text_content())
        # Hoist the cell lookup: it was queried once per column before.
        cells = result_cell.cssselect("td")
        # Rows with <= 2 cells are header/separator rows -- skip them.
        if len(cells) > 2:
            athlete = cells[1]
            print(athlete.text_content())
            athlete_url = athlete.cssselect("a")[0].get("href")
            print(athlete_url)
            parsed = urlparse.urlparse(athlete_url)
            # BUGFIX: parse_qs returns a *list* per key; previously the whole
            # list was stored as competitor_id, which sqlite can't hold cleanly.
            athlete_id = urlparse.parse_qs(parsed.query)['competitorid'][0]
            result = {
                'event': raceinfo['codex'],
                'rank': cells[0].text_content(),
                'athlete': cells[1].text_content(),
                'competitor_id': athlete_id,
                'yob': cells[2].text_content(),
                'nation': cells[3].text_content(),
                'time': cells[4].text_content(),
                'behind': cells[5].text_content(),
                'points': cells[6].text_content()
            }
            print(result)
            # BUGFIX: keying on 'athlete' alone overwrote an athlete's row
            # every time a later race was scraped; key on (event, athlete)
            # so each race keeps its own result per athlete.
            scraperwiki.sqlite.save(unique_keys=['event', 'athlete'], data=result, table_name="result")
    print(link)
    print(raceinfo['date'])
    scraperwiki.sqlite.save(unique_keys=['codex'], data=raceinfo, table_name="data")
# # Write out to the sqlite database using scraperwiki library
# data = {"name": "susan", "occupation": "software developer"}
# scraperwiki.sqlite.save(unique_keys=['name'], data=data, table_name="data")
#
# An arbitrary query against the database
# pprint(scraperwiki.sql.select("* from data where 'name'='susan'"))