Develop #46

Merged 2 commits on Jul 13, 2023
main.py (6 changes: 2 additions & 4 deletions)
@@ -18,10 +18,8 @@
 )
 
 crawl_parser = subparsers.add_parser('crawl')
-crawl_parser.add_argument('--domain', '-D', help='domain to crawl', required=True)
-crawl_parser.add_argument('--urls', '-U', help='urls to crawl, comma separated')
-crawl_parser.add_argument('--url-paths', '-P', help='root directory to store data')
-crawl_parser.add_argument('--data-root-dir', '-d', help='root directory to store data')
+crawl_parser.add_argument('--url', '-U', help='url to crawl')
+crawl_parser.add_argument('--output', '-o', help='output file path')
 crawl_parser.set_defaults(func=cmd_crawl)
 
 web_parser = subparsers.add_parser('web')
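
For context, here is a rough sketch of how the reworked crawl subcommand could be wired and invoked after this change; the surrounding parser setup and the import path for cmd_crawl are assumptions, and only the two arguments plus the set_defaults call come from the diff above.

import argparse

from webspot.cmd.crawl import cmd_crawl  # assumed import path; the handler lives in webspot/cmd/crawl.py

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()

crawl_parser = subparsers.add_parser('crawl')
crawl_parser.add_argument('--url', '-U', help='url to crawl')
crawl_parser.add_argument('--output', '-o', help='output file path')
crawl_parser.set_defaults(func=cmd_crawl)

# e.g. `python main.py crawl --url https://example.com/list`
args = parser.parse_args(['crawl', '--url', 'https://example.com/list'])
args.func(args)

Note that --output is declared but not consumed by cmd_crawl in this PR; the command prints the resulting DataFrame to stdout.
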
webspot/cmd/crawl.py (69 changes: 64 additions & 5 deletions)
@@ -1,9 +1,68 @@
-from webspot.crawler.actions.run_crawler import run_crawler
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+from pandas import DataFrame
+
+from webspot.constants.detector import DETECTOR_PLAIN_LIST, DETECTOR_PAGINATION
+from webspot.extract.extract_results import extract_rules
 
 
 def cmd_crawl(args):
-    run_crawler(
-        args.domain,
-        [f'https://{args.domain}'] if args.urls is None else args.urls.split(','),
-        args.data_root_dir,
+    url = args.url
+
+    results, execution_time, html_requester, graph_loader, detectors = extract_rules(
+        url=url,
     )
+
+    # plain list
+    result_plain_list = results.get(DETECTOR_PLAIN_LIST)
+    if result_plain_list is None or len(result_plain_list) == 0:
+        return
+
+    # items
+    full_items = result_plain_list[0].get('selectors').get('full_items')
+    items_selector = full_items.get('selector')
+    # print(items_selector)
+
+    # fields
+    fields = result_plain_list[0].get('fields')
+    # print(fields)
+
+    # pagination
+    result_pagination = results.get(DETECTOR_PAGINATION)
+    pagination_selector = None
+    if result_pagination is not None and len(result_pagination) > 0:
+        pagination_selector = result_pagination[0].get('selectors').get('next').get('selector')
+    # print(pagination_selector)
+
+    res = crawl_page(url, items_selector, fields, pagination_selector)
+    print(DataFrame(list(res)))
+
+
+def crawl_page(url, items_selector, fields, pagination_selector):
+    print(f'requesting {url}')
+    res = requests.get(url)
+    soup = BeautifulSoup(res.content, features='lxml')
+
+    for el_item in soup.select(items_selector):
+        row = {}
+        for f in fields:
+            try:
+                if f.get('attribute'):
+                    row[f.get('name')] = el_item.select_one(f.get('selector')).attrs.get(f.get('attribute'))
+                else:
+                    row[f.get('name')] = el_item.select_one(f.get('selector')).text.strip()
+            except:
+                pass
+        yield row
+
+    if pagination_selector is not None:
+        try:
+            href = soup.select_one(pagination_selector).attrs.get('href')
+            next_url = urljoin(url, href)
+
+            for row in crawl_page(next_url, items_selector, fields, pagination_selector):
+                yield row
+        except:
+            pass
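
As a rough illustration of the new crawl_page generator, a hypothetical standalone call is sketched below; the URL, selectors, and field definitions are made up for the example, whereas in cmd_crawl they come from extract_rules.

from pandas import DataFrame

from webspot.cmd.crawl import crawl_page

# Illustrative inputs only; extract_rules normally produces the selectors and fields.
items_selector = 'ul.results > li'
fields = [
    {'name': 'title', 'selector': 'a.title'},                      # text field
    {'name': 'link', 'selector': 'a.title', 'attribute': 'href'},  # attribute field
]
pagination_selector = 'a.next'  # selector for the "next page" link, or None for a single page

rows = crawl_page('https://example.com/list', items_selector, fields, pagination_selector)
print(DataFrame(list(rows)))

Because crawl_page follows the pagination link recursively and yields one dict per list item, materializing it with list() walks every page before the DataFrame is built.
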
webspot/detect/detectors/plain_list.py (3 changes: 2 additions & 1 deletion)
@@ -362,7 +362,8 @@ def _filter(
         logger.debug(f'score_item_count: {score_item_count}')
 
         # score
-        score = score_text_richness + score_complexity + score_item_count
+        # score = score_text_richness + score_complexity + score_item_count
+        score = score_text_richness + score_item_count
         logger.debug(f'score: {score}')
 
         # skip score less than threshold
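
For reference, a small sketch of what the scoring change amounts to; the component values below are hypothetical, whereas in _filter they are computed per candidate list.

# Hypothetical component scores; in _filter they are computed per candidate list.
score_text_richness = 0.8
score_complexity = 0.3
score_item_count = 0.5

score_before = score_text_richness + score_complexity + score_item_count  # 1.6
score_after = score_text_richness + score_item_count                      # 1.3, complexity no longer contributes
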