Adds a mechanism for accepting or rejecting articles (#184)
* Starts with adding html classification

* Changes the html classification to be function based

* Implements url based filtering for the ndr

* Changes the classifier to be optional

* Finishes merge with main

* Fixes isort

* Improves the readability of the article classification and addresses some of the comments from @MaxDall

* Addresses the comments from @MaxDall

* Fixes a variable name

* Updates the typing of the classifier

* Moved classification to a new file and reworked the classifier a bit

* Added some documentation and switched the order of the html/url parameters

---------

Co-authored-by: MaxDall <max.dallabetta@googlemail.com>
Weyaaron and MaxDall authored May 9, 2023
1 parent fda9de4 commit 35e15fe
Showing 5 changed files with 57 additions and 3 deletions.
5 changes: 4 additions & 1 deletion src/fundus/publishers/base_objects.py
@@ -1,8 +1,9 @@
 from dataclasses import dataclass, field
 from enum import Enum, unique
-from typing import Any, Dict, Iterator, List, Optional, Type
+from typing import Any, Callable, Dict, Iterator, List, Optional, Type

 from fundus.parser import BaseParser
+from fundus.scraping.scraper import ArticleClassifier


 @dataclass(frozen=True)
@@ -11,6 +12,7 @@ class PublisherSpec:
     parser: Type[BaseParser]
     rss_feeds: List[str] = field(default_factory=list)
     sitemaps: List[str] = field(default_factory=list)
+    article_classifier: Optional[ArticleClassifier] = field(default=None)
     news_map: Optional[str] = field(default=None)

     def __post_init__(self):
@@ -34,6 +36,7 @@ def __init__(self, spec: PublisherSpec):
         self.sitemaps = spec.sitemaps
         self.news_map = spec.news_map
         self.parser = spec.parser
+        self.article_classifier = spec.article_classifier

     def supports(self, source_type: Optional[str]) -> bool:
         if source_type == "rss":
2 changes: 2 additions & 0 deletions src/fundus/publishers/de/__init__.py
@@ -1,6 +1,7 @@
 from datetime import datetime

 from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
+from fundus.scraping.classification import regex_classifier

 from .berliner_zeitung import BerlinerZeitungParser
 from .die_welt import DieWeltParser
@@ -127,6 +128,7 @@ class DE(PublisherEnum):
         news_map="https://www.ndr.de/sitemap112-newssitemap.xml",
         sitemaps=["https://www.ndr.de/sitemap112-sitemap.xml"],
         parser=NDRParser,
+        article_classifier=lambda url, html: not regex_classifier("podcast[0-9]{4}")(url),
     )

     Taz = PublisherSpec(
9 changes: 9 additions & 0 deletions src/fundus/scraping/classification.py
@@ -0,0 +1,9 @@
+import re
+from typing import Callable
+
+
+def regex_classifier(regex: str) -> Callable[[str], bool]:
+    def classify(url: str):
+        return bool(re.search(regex, url))
+
+    return classify
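
Editor's note: a minimal usage sketch of the new regex_classifier factory added above; the URLs are made up for illustration and only show how the returned predicate behaves:

from fundus.scraping.classification import regex_classifier

# regex_classifier builds a predicate reporting whether the pattern occurs in a URL.
is_podcast_url = regex_classifier("podcast[0-9]{4}")

print(is_podcast_url("https://www.ndr.de/nachrichten/podcast4696.html"))   # True
print(is_podcast_url("https://www.ndr.de/nachrichten/some-article.html"))  # False

# The NDR spec above wraps such a predicate in a lambda with the (url, html)
# signature expected by ArticleClassifier.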
22 changes: 20 additions & 2 deletions src/fundus/scraping/pipeline.py
@@ -1,4 +1,15 @@
-from typing import Iterator, List, Literal, Optional, Set, Tuple, Type, Union
+from typing import (
+    Any,
+    Callable,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)

 import more_itertools

@@ -71,7 +82,14 @@ def crawl(
                 sources.append(SitemapSource(spec.news_map, publisher=spec.name))

             if sources:
-                scrapers.append(Scraper(*sources, parser=spec.parser(), extraction_filter=extraction_filter))
+                scrapers.append(
+                    Scraper(
+                        *sources,
+                        parser=spec.parser(),
+                        article_classifier=spec.article_classifier,
+                        extraction_filter=extraction_filter,
+                    )
+                )

         if scrapers:
             pipeline = Pipeline(*scrapers)
22 changes: 22 additions & 0 deletions src/fundus/scraping/scraper.py
@@ -11,6 +11,22 @@ def __call__(self, extracted: Dict[str, Any]) -> bool:
         ...


+class ArticleClassifier(Protocol):
+    """Classifies a website, represented by a given <url> and <html>, as an article.
+
+    When called with (<url>, <html>), an object satisfying this protocol should return
+    the truth value of a binary classification of the website represented by
+    <url> and <html>.
+
+    Returns:
+        <True>: The represented website is considered to be an article.
+        <False>: The represented website is considered not to be an article.
+    """
+
+    def __call__(self, url: str, html: str) -> bool:
+        ...
+
+
 class Requires:
     def __init__(self, *required_attributes: str) -> None:
         self.required_attributes = set(required_attributes)
@@ -27,10 +43,12 @@ def __init__(
         *sources: Source,
         parser: BaseParser,
         extraction_filter: Optional[ExtractionFilter] = None,
+        article_classifier: Optional[ArticleClassifier] = None,
     ):
         self.sources = list(sources)
         self.parser = parser
         self.extraction_filter = extraction_filter
+        self.article_classifier = article_classifier

         if isinstance(extraction_filter, Requires):
             supported_attributes = set(parser.attributes().names)
@@ -50,7 +68,11 @@ def scrape(self, error_handling: Literal["suppress", "catch", "raise"], batch_si
         for crawler in self.sources:
             for article_source in crawler.fetch(batch_size):
                 try:
+                    if self.article_classifier and self.article_classifier(article_source.url, article_source.html):
+                        continue
+
                     extraction = self.parser.parse(article_source.html, error_handling)
+
                     if self.extraction_filter and not self.extraction_filter(extraction):
                         continue
                 except Exception as err:

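Editor's note: as a closing illustration of the ArticleClassifier protocol introduced in scraper.py, here is a minimal, hypothetical class-based classifier; the class name and the HTML heuristic are invented for this sketch and are not part of the commit:

from fundus.scraping.classification import regex_classifier


class PodcastAwareClassifier:
    """Hypothetical ArticleClassifier: rejects pages whose URL matches a podcast
    pattern, otherwise falls back to a crude HTML check (illustrative only)."""

    def __init__(self) -> None:
        self._is_podcast_url = regex_classifier("podcast[0-9]{4}")

    def __call__(self, url: str, html: str) -> bool:
        # Per the protocol docstring: return True if the page is considered an article.
        if self._is_podcast_url(url):
            return False
        return "<article" in html  # made-up heuristic for the sketch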