Adds a mechanism for accepting or rejecting articles (#184)
* Starts with adding html classification

* Changes the html classification to be function based

* Implements url based filtering for the ndr

* Changes the classifier to be optional

* Finishes merge with main

* Fixes isort

* Improves the readability of the article classification and addresses some of the comments from @MaxDall

* Addresses the comments from @MaxDall

* Fixes a variable name

* Updates the typing of the classifier

* Moved classification to a new file and reworked the classifier a bit

* Added some documentation and switched the order of the html/url parameters

---------

Co-authored-by: MaxDall <max.dallabetta@googlemail.com>
Weyaaron and MaxDall authored May 9, 2023
1 parent fda9de4 commit 35e15fe
Showing 5 changed files with 57 additions and 3 deletions.
5 changes: 4 additions & 1 deletion src/fundus/publishers/base_objects.py
@@ -1,8 +1,9 @@
 from dataclasses import dataclass, field
 from enum import Enum, unique
-from typing import Any, Dict, Iterator, List, Optional, Type
+from typing import Any, Callable, Dict, Iterator, List, Optional, Type

 from fundus.parser import BaseParser
+from fundus.scraping.scraper import ArticleClassifier


 @dataclass(frozen=True)
@@ -11,6 +12,7 @@ class PublisherSpec:
     parser: Type[BaseParser]
     rss_feeds: List[str] = field(default_factory=list)
     sitemaps: List[str] = field(default_factory=list)
+    article_classifier: Optional[ArticleClassifier] = field(default=None)
     news_map: Optional[str] = field(default=None)

     def __post_init__(self):
@@ -34,6 +36,7 @@ def __init__(self, spec: PublisherSpec):
         self.sitemaps = spec.sitemaps
         self.news_map = spec.news_map
         self.parser = spec.parser
+        self.article_classifier = spec.article_classifier

     def supports(self, source_type: Optional[str]) -> bool:
         if source_type == "rss":
2 changes: 2 additions & 0 deletions src/fundus/publishers/de/__init__.py
@@ -1,6 +1,7 @@
 from datetime import datetime

 from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
+from fundus.scraping.classification import regex_classifier

 from .berliner_zeitung import BerlinerZeitungParser
 from .die_welt import DieWeltParser
@@ -127,6 +128,7 @@ class DE(PublisherEnum):
         news_map="https://www.ndr.de/sitemap112-newssitemap.xml",
         sitemaps=["https://www.ndr.de/sitemap112-sitemap.xml"],
         parser=NDRParser,
+        article_classifier=lambda url, html: not regex_classifier("podcast[0-9]{4}")(url),
     )

     Taz = PublisherSpec(
9 changes: 9 additions & 0 deletions src/fundus/scraping/classification.py
@@ -0,0 +1,9 @@
+import re
+from typing import Callable
+
+
+def regex_classifier(regex: str) -> Callable[[str], bool]:
+    def classify(url: str):
+        return bool(re.search(regex, url))
+
+    return classify
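
Editor's note: a minimal usage sketch of the new regex_classifier factory added above; the URLs are made up for illustration and only show how the returned predicate behaves:

from fundus.scraping.classification import regex_classifier

# regex_classifier builds a predicate reporting whether the pattern occurs in a URL.
is_podcast_url = regex_classifier("podcast[0-9]{4}")

print(is_podcast_url("https://www.ndr.de/nachrichten/podcast4696.html"))   # True
print(is_podcast_url("https://www.ndr.de/nachrichten/some-article.html"))  # False

# The NDR spec above wraps such a predicate in a lambda with the (url, html)
# signature expected by ArticleClassifier.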
22 changes: 20 additions & 2 deletions src/fundus/scraping/pipeline.py
@@ -1,4 +1,15 @@
-from typing import Iterator, List, Literal, Optional, Set, Tuple, Type, Union
+from typing import (
+    Any,
+    Callable,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)

 import more_itertools

@@ -71,7 +82,14 @@ def crawl(
                 sources.append(SitemapSource(spec.news_map, publisher=spec.name))

             if sources:
-                scrapers.append(Scraper(*sources, parser=spec.parser(), extraction_filter=extraction_filter))
+                scrapers.append(
+                    Scraper(
+                        *sources,
+                        parser=spec.parser(),
+                        article_classifier=spec.article_classifier,
+                        extraction_filter=extraction_filter,
+                    )
+                )

         if scrapers:
             pipeline = Pipeline(*scrapers)
22 changes: 22 additions & 0 deletions src/fundus/scraping/scraper.py
@@ -11,6 +11,22 @@ def __call__(self, extracted: Dict[str, Any]) -> bool:
         ...


+class ArticleClassifier(Protocol):
+    """Classifies a website, represented by a given <url> and <html>, as an article.
+
+    When called with (<url>, <html>), an object satisfying this protocol should return
+    the truth value of a binary classification of the website represented by
+    <url> and <html>.
+
+    Returns:
+        <True>: The represented website is considered to be an article.
+        <False>: The represented website is considered not to be an article.
+    """
+
+    def __call__(self, url: str, html: str) -> bool:
+        ...
+
+
 class Requires:
     def __init__(self, *required_attributes: str) -> None:
         self.required_attributes = set(required_attributes)
@@ -27,10 +43,12 @@ def __init__(
         *sources: Source,
         parser: BaseParser,
         extraction_filter: Optional[ExtractionFilter] = None,
+        article_classifier: Optional[ArticleClassifier] = None,
     ):
         self.sources = list(sources)
         self.parser = parser
         self.extraction_filter = extraction_filter
+        self.article_classifier = article_classifier

         if isinstance(extraction_filter, Requires):
             supported_attributes = set(parser.attributes().names)
@@ -50,7 +68,11 @@ def scrape(self, error_handling: Literal["suppress", "catch", "raise"], batch_si
         for crawler in self.sources:
             for article_source in crawler.fetch(batch_size):
                 try:
+                    if self.article_classifier and self.article_classifier(article_source.url, article_source.html):
+                        continue
+
                     extraction = self.parser.parse(article_source.html, error_handling)
+
                     if self.extraction_filter and not self.extraction_filter(extraction):
                         continue
                 except Exception as err:

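Editor's note: as a closing illustration of the ArticleClassifier protocol introduced in scraper.py, here is a minimal, hypothetical class-based classifier; the class name and the HTML heuristic are invented for this sketch and are not part of the commit:

from fundus.scraping.classification import regex_classifier


class PodcastAwareClassifier:
    """Hypothetical ArticleClassifier: rejects pages whose URL matches a podcast
    pattern, otherwise falls back to a crude HTML check (illustrative only)."""

    def __init__(self) -> None:
        self._is_podcast_url = regex_classifier("podcast[0-9]{4}")

    def __call__(self, url: str, html: str) -> bool:
        # Per the protocol docstring: return True if the page is considered an article.
        if self._is_podcast_url(url):
            return False
        return "<article" in html  # made-up heuristic for the sketch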