In [1]:
import nbimporter
from utils_os import UtilsOS
from scraper_data_reader import ReaderScrapedData

import pycountry
import itertools

Importing Jupyter notebook from utils_os.ipynb
Importing Jupyter notebook from scraper_data_reader.ipynb
Importing Jupyter notebook from scraper_config_reader.ipynb


In [2]:
class URLCleaner:
    """Heuristic URL filter.

    Every URL is split on "/" into segments, and a URL is dropped when any of
    the rejection rules implemented by the private predicates below matches.
    The private predicates all take the *segment list* (``url.split("/")``),
    not the raw URL string.
    """

    def __init__(self):
        # Path segments that cause a URL to be rejected (exact segment match).
        self._banned_words = set(["category", "video", "page", "live", "about", "search", "contact-us",
                                  "terms", "author", "login", "submit", "sgs", "tagged", "tag", "feeds",
                                  "topic", "topics", "signin", "policy", "comments", "feed"])
        # Extensions that cause a URL to be rejected when found in its last segment.
        self._extensions_banned = set([".jpg", ".png", ".zip", ".xml"])

        # Locale-like segments of the form "xx-xx", where "xx" is the first two
        # letters of each pycountry language's alpha_3 code (e.g. "ita" -> "it-it").
        # Stored as a set: it is only used for membership tests, and a set also
        # removes the duplicates that the two-letter truncation produces.
        self._lang_codes = set("{0}-{0}".format(lang.alpha_3.lower()[:2])
                               for lang in pycountry.languages)
        # "en-en" is deliberately not treated as a rejected language code.
        # discard() (unlike list.remove) does not raise if the code is absent.
        self._lang_codes.discard("en-en")

    def _contains_lang_code(self, url):
        """Returns true if any segment of the url is one of the "xx-xx" language codes"""
        return any(segment in self._lang_codes for segment in url)

    def _contains_website_name(self, url, website):
        """Returns true if at least one segment of the url contains the website name.

        An empty ``website`` is a substring of every segment, so it matches any url.
        """
        return any(website in segment for segment in url)

    def _contains_banned_extension(self, url):
        """Returns true if the last segment of the url contains a banned extension.

        This is a substring test, not a strict suffix test, so a last segment
        such as "photo.jpg?size=large" also matches.
        """
        if not url:
            return False
        last_segment = url[-1]
        return any(extension in last_segment for extension in self._extensions_banned)

    def _contains_banned_word(self, url):
        """Returns true if any segment of the url is exactly equal to a banned word"""
        return any(segment in self._banned_words for segment in url)

    def _does_end_with_number(self, url):
        """Returns true if the last segment of the url consists only of digits.

        A url with a trailing "/" has an empty last segment, for which this
        returns False.
        """
        return bool(url) and url[-1].isdigit()

    def _should_keep(self, segments, website):
        """Returns true if the segment list passes every filtering rule"""
        return (not self._contains_lang_code(segments)
                and self._contains_website_name(segments, website)
                and not self._contains_banned_extension(segments)
                and not self._contains_banned_word(segments)
                and not self._does_end_with_number(segments))

    def filter_urls(self, urls, website):
        """Returns the urls that survive all filtering rules, in their original order.

        Args:
            urls: iterable of url strings.
            website: string that must appear inside at least one "/"-separated
                segment of a url for it to be kept; pass "" to disable this rule.

        Returns:
            A new list with the kept url strings, unchanged.
        """
        # "/".join(url.split("/")) == url, so the original strings are returned
        # directly instead of being re-assembled from their segments.
        return [url for url in urls if self._should_keep(url.split("/"), website)]

# Test

In [3]:
if __name__ == "__main__":
    # Smoke test: report how many URLs of each labelled set the cleaner removes.

    # get urls that are not articles
    url_not_article = UtilsOS.read_json("url_not_article.json")

    # get urls that are articles
    # NOTE(review): assumes read_data returns {website: {domain: [sample, ...]}}
    # where every sample has a "url" key -- this is how it is indexed below.
    data = ReaderScrapedData.read_data("scraper_configs.json")
    dataset = [sample
               for website in data
               for domain in data[website]
               for sample in data[website][domain]]
    url_article = [sample["url"] for sample in dataset]

    url_cleaner = URLCleaner()

    # The same report is produced for both sets. website="" disables the
    # website-name rule, because "" is a substring of every URL segment.
    for header, urls in (("URLs without article: {0}", url_not_article),
                         ("URLs with article: {0}", url_article)):
        total = len(urls)
        num_filtered = total - len(url_cleaner.filter_urls(urls, ""))
        # Guard against an empty set so the report does not raise ZeroDivisionError.
        percentage = num_filtered / total * 100 if total else 0.0
        print(header.format(total))
        print("\tFiltered: {0} --> {1:.2f}%".format(num_filtered, percentage))

URLs without article: 322
	Filtered: 131 --> 40.68%
URLs with article: 2971
	Filtered: 74 --> 2.49%
