From b4109cc73078b2a392cfcd729607805dee8b56e9 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Thu, 13 Jul 2023 20:21:52 +0800
Subject: [PATCH 1/2] feat: added crawl

---
 main.py              |  6 ++--
 webspot/cmd/crawl.py | 69 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/main.py b/main.py
index 22f5b99..8c6d5da 100644
--- a/main.py
+++ b/main.py
@@ -18,10 +18,8 @@
 )
 
 crawl_parser = subparsers.add_parser('crawl')
-crawl_parser.add_argument('--domain', '-D', help='domain to crawl', required=True)
-crawl_parser.add_argument('--urls', '-U', help='urls to crawl, comma separated')
-crawl_parser.add_argument('--url-paths', '-P', help='root directory to store data')
-crawl_parser.add_argument('--data-root-dir', '-d', help='root directory to store data')
+crawl_parser.add_argument('--url', '-U', help='url to crawl')
+crawl_parser.add_argument('--output', '-o', help='output file path')
 crawl_parser.set_defaults(func=cmd_crawl)
 
 web_parser = subparsers.add_parser('web')
diff --git a/webspot/cmd/crawl.py b/webspot/cmd/crawl.py
index fbe2371..c3f84c8 100644
--- a/webspot/cmd/crawl.py
+++ b/webspot/cmd/crawl.py
@@ -1,9 +1,68 @@
-from webspot.crawler.actions.run_crawler import run_crawler
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+from pandas import DataFrame
+
+from webspot.constants.detector import DETECTOR_PLAIN_LIST, DETECTOR_PAGINATION
+from webspot.extract.extract_results import extract_rules
 
 
 def cmd_crawl(args):
-    run_crawler(
-        args.domain,
-        [f'https://{args.domain}'] if args.urls is None else args.urls.split(','),
-        args.data_root_dir,
+    url = args.url
+
+    results, execution_time, html_requester, graph_loader, detectors = extract_rules(
+        url=url,
     )
+
+    # plain list
+    result_plain_list = results.get(DETECTOR_PLAIN_LIST)
+    if result_plain_list is None or len(result_plain_list) == 0:
+        return
+
+    # items
+    full_items = result_plain_list[0].get('selectors').get('full_items')
+    items_selector = full_items.get('selector')
+    print(items_selector)
+
+    # fields
+    fields = result_plain_list[0].get('fields')
+    print(fields)
+
+    # pagination
+    result_pagination = results.get(DETECTOR_PAGINATION)
+    pagination_selector = None
+    if result_pagination is not None and len(result_pagination) > 0:
+        pagination_selector = result_pagination[0].get('selectors').get('next').get('selector')
+    print(pagination_selector)
+
+    res = crawl_page(url, items_selector, fields, pagination_selector)
+    print(DataFrame(list(res)))
+
+
+def crawl_page(url, items_selector, fields, pagination_selector):
+    print(f'requesting {url}')
+    res = requests.get(url)
+    soup = BeautifulSoup(res.content)
+
+    for el_item in soup.select(items_selector):
+        row = {}
+        for f in fields:
+            try:
+                if f.get('attribute'):
+                    row[f.get('name')] = el_item.select_one(f.get('selector')).attrs.get(f.get('attribute'))
+                else:
+                    row[f.get('name')] = el_item.select_one(f.get('selector')).text.strip()
+            except:
+                pass
+        yield row
+
+    if pagination_selector is not None:
+        try:
+            href = soup.select_one(pagination_selector).attrs.get('href')
+            next_url = urljoin(url, href)
+
+            for row in crawl_page(next_url, items_selector, fields, pagination_selector):
+                yield row
+        except:
+            pass
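
A quick way to smoke-test the new subcommand from Python, as a sketch rather than part of the patch: SimpleNamespace stands in for the argparse namespace built in main.py, the target URL is only an example, and the new --output flag is parsed but not yet consumed by cmd_crawl at this point in the series.

    from types import SimpleNamespace

    from webspot.cmd.crawl import cmd_crawl

    # Mimic the namespace that `main.py crawl --url ...` would produce.
    # `output` mirrors the new --output flag; cmd_crawl ignores it for now.
    args = SimpleNamespace(url='https://quotes.toscrape.com', output=None)

    # Prints the detected item selector, fields, pagination selector,
    # and finally a DataFrame of the crawled rows.
    cmd_crawl(args)
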
From 3c74061f19cc9d671fcce586eab61db8aae58288 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Thu, 13 Jul 2023 20:31:02 +0800
Subject: [PATCH 2/2] feat: updated score calculation

---
 webspot/cmd/crawl.py                   | 8 ++++----
 webspot/detect/detectors/plain_list.py | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/webspot/cmd/crawl.py b/webspot/cmd/crawl.py
index c3f84c8..935fc95 100644
--- a/webspot/cmd/crawl.py
+++ b/webspot/cmd/crawl.py
@@ -23,18 +23,18 @@ def cmd_crawl(args):
     # items
     full_items = result_plain_list[0].get('selectors').get('full_items')
     items_selector = full_items.get('selector')
-    print(items_selector)
+    # print(items_selector)
 
     # fields
     fields = result_plain_list[0].get('fields')
-    print(fields)
+    # print(fields)
 
     # pagination
     result_pagination = results.get(DETECTOR_PAGINATION)
     pagination_selector = None
     if result_pagination is not None and len(result_pagination) > 0:
         pagination_selector = result_pagination[0].get('selectors').get('next').get('selector')
-    print(pagination_selector)
+    # print(pagination_selector)
 
     res = crawl_page(url, items_selector, fields, pagination_selector)
     print(DataFrame(list(res)))
@@ -43,7 +43,7 @@ def cmd_crawl(args):
 def crawl_page(url, items_selector, fields, pagination_selector):
     print(f'requesting {url}')
     res = requests.get(url)
-    soup = BeautifulSoup(res.content)
+    soup = BeautifulSoup(res.content, features='lxml')
 
     for el_item in soup.select(items_selector):
         row = {}
diff --git a/webspot/detect/detectors/plain_list.py b/webspot/detect/detectors/plain_list.py
index ab374de..31529fb 100644
--- a/webspot/detect/detectors/plain_list.py
+++ b/webspot/detect/detectors/plain_list.py
@@ -362,7 +362,8 @@ def _filter(
         logger.debug(f'score_item_count: {score_item_count}')
 
         # score
-        score = score_text_richness + score_complexity + score_item_count
+        # score = score_text_richness + score_complexity + score_item_count
+        score = score_text_richness + score_item_count
         logger.debug(f'score: {score}')
 
         # skip score less than threshold
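
The plain_list.py change drops score_complexity from the candidate score, so threshold filtering now rests on text richness and item count alone. A toy illustration of the effect, with made-up numbers (neither the component values nor the threshold come from the source):

    # Hypothetical component scores for one detected list candidate.
    score_text_richness = 0.9
    score_complexity = 0.5
    score_item_count = 0.7
    score_threshold = 1.8  # "skip score less than threshold"

    old_score = score_text_richness + score_complexity + score_item_count  # 2.1
    new_score = score_text_richness + score_item_count                     # 1.6

    # Under the old formula this candidate passes the threshold;
    # under the new one it is skipped.
    print(old_score >= score_threshold, new_score >= score_threshold)  # True False
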