single_crawler.py
"""
This script should only be executed by the news-please initial script itself.
This script starts a crawler.
"""
import hashlib
import logging
import shutil
import sys
from ast import literal_eval
import os
from typing import Type
from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.log import configure_logging
cur_path = os.path.dirname(os.path.realpath(__file__))
par_path = os.path.dirname(cur_path)
sys.path.append(cur_path)
sys.path.append(par_path)
from newsplease.config import CrawlerConfig
from newsplease.config import JsonConfig
from newsplease.helper import Helper
from newsplease.helper_classes.class_loader import ClassLoader
from newsplease.crawler.items import NewscrawlerItem
from newsplease.crawler.spiders.newsplease_spider import NewspleaseSpider
try:
    from _thread import start_new_thread
except ImportError:
    from thread import start_new_thread
from twisted.internet.error import ReactorAlreadyRunning
class SingleCrawler(object):
"""
This class is called when this script is executed.
It starts a single crawler, that is passed along to this script.
"""
    cfg = None
    json = None
    log = None
    crawler_name = None
    process = None
    helper = None
    cfg_file_path = None
    json_file_path = None
    cfg_crawler = None
    __scrapy_options = None
    __default_spider_modules = "newsplease.crawler.spiders"
    site_number = None
    shall_resume = False
    daemonize = False

    @classmethod
    def create_as_library(cls, url):
        """
        Creates a single crawler in library mode. Crawling starts immediately.

        :param url: the URL to crawl
        :return: the SingleCrawler instance
        """
        site = {
            "crawler": "Download",
            "url": url
        }
        cfg_file_path = os.path.dirname(__file__) + os.path.sep + 'config' + os.path.sep + 'config_lib.cfg'
        return cls(cfg_file_path, site, 0, False, False, True)
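    # Illustrative library-mode call (the URL is a placeholder; in practice this
    # classmethod is invoked by news-please itself rather than called by hand):
    #
    #   crawler = SingleCrawler.create_as_library("https://example.com/some-article.html")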

    def __init__(self, cfg_file_path, json_file_path,
                 site_index, shall_resume, daemonize, library_mode=False):
        # Set up logging before the level is defined via the config file;
        # this setting is overwritten (and lower levels become visible as
        # well) once the config has been read.
        configure_logging({"LOG_LEVEL": "CRITICAL"})
        self.log = logging.getLogger(__name__)

        self.cfg_file_path = cfg_file_path
        self.json_file_path = json_file_path
        self.site_number = int(site_index)
        self.shall_resume = shall_resume \
            if isinstance(shall_resume, bool) else literal_eval(shall_resume)
        self.daemonize = daemonize \
            if isinstance(daemonize, bool) else literal_eval(daemonize)

        # set up the config file
        self.cfg = CrawlerConfig.get_instance()
        self.cfg.setup(self.cfg_file_path)
        self.log.debug("Config initialized - Further initialisation.")

        self.cfg_crawler = self.cfg.section("Crawler")

        # Load the URL-input JSON file or - if in library mode - take
        # json_file_path as the site information itself (kind of hacky..)
        if not library_mode:
            self.json = JsonConfig.get_instance()
            self.json.setup(self.json_file_path)
            sites = self.json.get_site_objects()
            site = sites[self.site_number]
        else:
            sites = [json_file_path]
            site = json_file_path

        if "ignore_regex" in site:
            ignore_regex = "(%s)|" % site["ignore_regex"]
        else:
            ignore_regex = "(%s)|" % \
                self.cfg.section('Crawler')['ignore_regex']

        # Get the default crawler. The crawler can be overwritten by fallbacks.
        if "additional_rss_daemon" in site and self.daemonize:
            self.crawler_name = "RssCrawler"
        elif "crawler" in site:
            self.crawler_name = site["crawler"]
        else:
            self.crawler_name = self.cfg.section("Crawler")["default"]

        # Get the real crawler class (already "fallen back")
        crawler_class = self.get_crawler(self.crawler_name, site["url"])

        if not self.cfg.section('Files')['relative_to_start_processes_file']:
            relative_to_path = os.path.dirname(self.cfg_file_path)
        else:
            # absolute dir this script is in
            relative_to_path = os.path.dirname(__file__)

        news_item_class_name = self.cfg.section("Scrapy").get("item_class", None)
        if not news_item_class_name:
            news_item_class = NewscrawlerItem
        else:
            news_item_class = ClassLoader.from_string(news_item_class_name)
            if not issubclass(news_item_class, NewscrawlerItem):
                raise ImportError("ITEM_CLASS must be a subclass of NewscrawlerItem")

        self.helper = Helper(self.cfg.section('Heuristics'),
                             self.cfg.section("Files")["local_data_directory"],
                             relative_to_path,
                             self.cfg.section('Files')['format_relative_path'],
                             sites,
                             crawler_class,
                             news_item_class,
                             self.cfg.get_working_path())

        self.__scrapy_options = self.cfg.get_scrapy_options()

        self.update_jobdir(site)

        # Make sure the crawler does not resume crawling
        # if not stated otherwise in the arguments passed to this script.
        self.remove_jobdir_if_not_resume()

        self.load_crawler(crawler_class,
                          site["url"],
                          ignore_regex)

        # Start the job. In library mode, do not stop the reactor after this
        # job has finished, so that further jobs can be executed. The job also
        # needs to run in a thread since reactor.run does not seem to return.
        # In addition, Scrapy will attempt to start a new reactor, which fails
        # with an exception, but the code continues to run; that exception is
        # caught in the function 'start_process'.
        if library_mode:
            start_new_thread(start_process, (self.process, False,))
        else:
            self.process.start()
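    # For orientation, a single entry in the URL-input JSON roughly looks like
    # the dict below; only "url" is used unconditionally here, the other keys
    # are optional and the values are purely illustrative:
    #
    #   {
    #       "url": "https://example.com/",
    #       "crawler": "RssCrawler",
    #       "ignore_regex": "/sports/"
    #   }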

    def update_jobdir(self, site):
        """
        Update the JOBDIR in __scrapy_options for the crawler,
        so each crawler gets its own jobdir.

        :param object site: a site dict extracted from the json file
        """
        working_path = self.cfg.get_working_path()
        if not working_path.endswith("/"):
            working_path += "/"

        jobdirname = self.__scrapy_options["JOBDIRNAME"]
        if not jobdirname.endswith("/"):
            jobdirname += "/"

        site_string = ''.join(site["url"]) + self.crawler_name
        hashed = hashlib.md5(site_string.encode('utf-8'))

        self.__scrapy_options["JOBDIR"] = working_path + jobdirname + hashed.hexdigest()

    def get_crawler(self, crawler: str, url: str):
        """
        Checks if a crawler supports a website (the website offers e.g. RSS
        or sitemap) and falls back to the fallbacks defined in the config if
        the site is not supported.

        :param str crawler: Crawler-string (from the crawler-module)
        :param str url: the url this crawler is supposed to be loaded with
        :rtype: crawler-class or None
        """
        check_crawler_has_urls_to_scan = self.cfg_crawler.get('check_crawler_has_urls_to_scan')
        checked_crawlers = []
        while crawler is not None and crawler not in checked_crawlers:
            checked_crawlers.append(crawler)

            current = self.get_crawler_class(crawler)
            if not issubclass(current, NewspleaseSpider):
                self.log.warning("The crawler %s has no supports_site-method defined", crawler)
                return current

            try:
                crawler_supports_site = current.supports_site(url)
            except Exception as e:
                self.log.info(f'Crawler not supported due to: {str(e)}', exc_info=True)
                crawler_supports_site = False

            if crawler_supports_site:
                if check_crawler_has_urls_to_scan and not current.has_urls_to_scan(url):
                    self.log.warning(f"Crawler {crawler} has no url to scan for {url}")
                else:
                    self.log.debug("Using crawler %s for %s.", crawler, url)
                    return current

            fallbacks = self.cfg_crawler["fallbacks"]
            if crawler in fallbacks and fallbacks[crawler] is not None:
                self.log.warning("Crawler %s not supported by %s. Trying to fall back.", crawler, url)
                crawler = fallbacks[crawler]
            else:
                self.log.error("No crawlers (incl. fallbacks) found for url %s.", url)
                raise RuntimeError("No crawler found. Quit.")

        self.log.error("Could not fall back since you created a fall back "
                       "loop for %s in the config file.", crawler)
        sys.exit(1)
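    # The fallback chain comes from the "fallbacks" mapping in the Crawler
    # config section, e.g. (illustrative only)
    # {"RssCrawler": "SitemapCrawler", "SitemapCrawler": "RecursiveCrawler"};
    # the loop above stops as soon as a crawler either supports the site or
    # reappears in the chain (a fallback loop).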

    def get_crawler_class(self, crawler: str) -> Type[Spider]:
        """
        Searches through the modules in spider_modules for a crawler with
        the name passed along.

        :param str crawler: Name of the crawler to load
        :rtype: crawler-class
        """
        spider_modules = self.cfg.section("Scrapy").get("spider_modules", [self.__default_spider_modules])
        settings = Settings()
        settings.set("SPIDER_MODULES", spider_modules)
        spider_loader = SpiderLoader(settings)
        return spider_loader.load(crawler)
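    # Sketch of how names resolve, assuming a hypothetical custom spider module
    # is configured in the Scrapy section:
    #
    #   spider_modules = ["newsplease.crawler.spiders", "my_project.spiders"]
    #
    # SpiderLoader can then load a crawler defined in either module by name.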

    def load_crawler(self, crawler, url, ignore_regex):
        """
        Loads the given crawler with the given url.

        :param class crawler: class of the crawler to load
        :param str url: url to start the crawler with
        :param regex ignore_regex: to be able to ignore urls that match this
                                   regex code
        """
        self.process = CrawlerProcess(self.cfg.get_scrapy_options())
        self.process.crawl(
            crawler,
            self.helper,
            url=url,
            config=self.cfg,
            ignore_regex=ignore_regex)

    def remove_jobdir_if_not_resume(self):
        """
        This method ensures that there's no JOBDIR (with the name and path
        stated in the config file) any crawler would automatically resume
        crawling with if '--resume' isn't passed to this script.
        """
        jobdir = self.__scrapy_options["JOBDIR"]

        if (not self.shall_resume or self.daemonize) \
                and os.path.exists(jobdir):
            shutil.rmtree(jobdir)

            self.log.info("Removed " + jobdir + " since '--resume' was not passed to"
                          " initial.py or this crawler was daemonized.")
def start_process(process, stop_after_job):
    try:
        process.start(stop_after_job)
    except ReactorAlreadyRunning:
        pass
if __name__ == "__main__":
SingleCrawler(cfg_file_path=sys.argv[1],
json_file_path=sys.argv[2],
site_index=sys.argv[3],
shall_resume=sys.argv[4],
daemonize=sys.argv[5])
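# Invocation sketch (normally done by the news-please initial script, not by
# hand; paths and values below are placeholders):
#
#   python single_crawler.py <cfg_file_path> <sitelist.json> <site_index> <shall_resume> <daemonize>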