This repository has been archived by the owner on Nov 23, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
heise_spider.py
65 lines (49 loc) · 2.19 KB
/
heise_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import Selector
from harvester.items import Comment
import time
import calendar
import re
class HeiseSpider(CrawlSpider):
    """Crawl one heise.de forum thread and emit a ``Comment`` per posting.

    The crawl starts at the thread URL in ``start_urls``, follows every link
    matching ``/posting-<id>/show`` and passes each fetched posting page to
    ``parse_item``.
    """

    name = "heise"
    allowed_domains = ["www.heise.de"]
    start_urls = [
        "http://www.heise.de/forum/Telepolis/Kommentare/Ohne-Vorratsdatenspeicherung-sterben-vermisste-Kinder-und-Suizidale/forum-242979/"
    ]

    # The trailing comma is required: without it the parentheses are plain
    # grouping and ``rules`` would be a single Rule instead of an iterable
    # of rules.
    rules = (
        # Disabled rule kept for reference:
        # Rule(LinkExtractor(allow=('/tp/foren/[^/]+/forum-[0-9]+/list'))),
        Rule(LinkExtractor(allow=r'/posting-[0-9]+/show'), callback='parse_item'),
    )

    # German month names in calendar order; position + 1 is the month number.
    _MONTHS = (
        u'Januar', u'Februar', u'M\u00e4rz', u'April', u'Mai', u'Juni',
        u'Juli', u'August', u'September', u'Oktober', u'November',
        u'Dezember',
    )

    # "<day>. <month name> <year> <HH>:<MM>". ``[^\W\d_]`` together with
    # re.UNICODE matches any letter, so the umlaut in "März" is accepted
    # (a plain [A-Za-z] class would reject it).
    _DATE_RE = re.compile(
        r'([0-9]{1,2})\. ([^\W\d_]+) ([0-9]{4}) ([0-9]{2}):([0-9]{2})',
        re.UNICODE,
    )

    def clean_str(self, val):
        """Return ``val`` with non-breaking spaces replaced and ends stripped."""
        return val.replace(u'\xa0', u' ').strip()

    def to_str(self, arr):
        """Join the text fragments in ``arr`` and clean the result."""
        return self.clean_str(''.join(arr))

    def parse_date(self, val):
        """Convert a German date such as ``"3. Mai 2015 14:07"`` to a timestamp.

        Args:
            val: Text containing a date in the form
                ``<day>. <German month name> <year> <HH>:<MM>``. Text around
                the date is ignored. Presumably always a unicode string as
                produced by ``extract()`` -- confirm if called from elsewhere.

        Returns:
            Seconds since the epoch as an int. The date is interpreted as UTC
            because ``calendar.timegm`` is used.
            NOTE(review): the site presumably shows German local time, which
            would make the result off by the UTC offset -- confirm.

        Raises:
            ValueError: If no date is found, the month name is not a German
                month, or the numeric fields do not form a valid date.
        """
        match = self._DATE_RE.search(val)
        if match is None:
            raise ValueError('no date found in %r' % (val,))

        day, month_name, year, hour, minute = match.groups()
        wanted = month_name.lower()
        # Months are numbered from 1, hence the explicit enumerate start.
        for month_number, name in enumerate(self._MONTHS, 1):
            if name.lower() == wanted:
                break
        else:
            raise ValueError(
                'unknown month name %r in %r' % (month_name, val))

        # Re-assemble a purely numeric date so strptime validates the fields
        # independently of the process locale.
        normalized = '%s. %d %s %s:%s' % (day, month_number, year, hour, minute)
        return calendar.timegm(time.strptime(normalized, '%d. %m %Y %H:%M'))

    def parse_item(self, response):
        """Build a ``Comment`` item from a single posting page.

        Args:
            response: The Scrapy response for a ``/posting-<id>/show`` page.

        Returns:
            A ``Comment`` with ``text``, ``url``, ``parent``, ``level``,
            ``thread``, ``author`` and ``date`` filled in. ``parent`` is
            always ``'unknown'`` and ``level`` always ``0`` for now.

        Raises:
            ValueError: If the posting date cannot be parsed.
        """
        # NOTE(review): root postings presumably show exactly six entries in
        # the second navigation list -- confirm against the live markup.
        is_root = len(response.xpath("//ul[@class='forum_navi'][2]/li")) == 6
        if not is_root:
            # find parent
            parent = response.xpath("//span[@class='active_post']/../../../parent::ul[@class='nextlevel_line']/preceding-sibling::div[@class='hover_line']")
            # get link
            link = parent.xpath(".//div[@class='thread_title']/a")
            # TODO: extract the parent id from the href of ``link``; until
            # that exists the item's parent stays 'unknown'.

        subject = self.to_str(
            response.xpath("//h3[@class='posting_subject']/text()").extract())
        body = self.to_str(
            response.xpath("//p[@class='posting_text']/text()").extract())
        thread_match = re.search(r'forum-([0-9]+)', response.url)

        item = Comment()
        item['text'] = subject + body
        item['url'] = response.url
        item['parent'] = 'unknown'
        item['level'] = 0
        # Fall back to the same sentinel used for ``parent`` instead of
        # crashing on a URL that carries no forum id.
        item['thread'] = thread_match.group(1) if thread_match else 'unknown'
        item['author'] = self.to_str(
            response.xpath("//div[@class='user_info']/i//text()").extract())
        item['date'] = self.parse_date(self.to_str(
            response.xpath("//div[@class='posting_date']/text()").extract()))
        return item