In [1]:
import nbimporter
from utils_os import UtilsOS
from scraper_data_reader import ReaderScrapedData
from constants import Constants

import pycountry
import itertools
from alphabet_detector import AlphabetDetector

Importing Jupyter notebook from utils_os.ipynb
Importing Jupyter notebook from scraper_data_reader.ipynb
Importing Jupyter notebook from scraper_config_reader.ipynb
Importing Jupyter notebook from constants.ipynb


In [62]:
class URLCleaner:
    """Heuristic filter that drops URLs failing any of a fixed series of checks.

    Each URL is split on "/" and the resulting list of segments is handed to
    the private checks below (language code, website name, file extension,
    banned word, alphabet/encoding, trailing number, host shape, disallowed
    ending). Only URLs that pass every check are returned by `filter_urls`.
    """

    def __init__(self):
        # Matched against whole "/"-separated segments, not substrings.
        self._banned_words = set(["video", "page", "live", "about", "search", "contact-us",
                        "terms", "author", "login", "submit", "sgs",
                        "signin", "policy", "comments", "register"])
        self._extensions_banned = set([".jpg", ".png", ".zip", ".xml", ".jpeg"])

        # Locale-style segments such as "it-it": the first two letters of every
        # pycountry alpha_3 code, doubled around a dash. Kept as a set because it
        # is only ever used for membership tests.
        prefixes = set(lang.alpha_3.lower()[:2] for lang in pycountry.languages)
        self._lang_codes = set(prefix + "-" + prefix for prefix in prefixes)
        # "en-en" is taken off the blocklist (presumably English pages should be kept).
        self._lang_codes.discard("en-en")

        self._not_allowed_endings = set(["mailto"])

        self._ad = AlphabetDetector()

    @staticmethod
    def _last_segment(url):
        """Returns the last segment of the split url, or an empty string if there are no segments"""
        return url[-1] if url else ""

    def _contains_lang_code(self, url):
        """Returns true if one of the url segments is a language code"""
        return any(segment in self._lang_codes for segment in url)

    def _contains_website_name(self, url, website):
        """Returns true if the website name appears inside at least one url segment"""
        return any(website in segment for segment in url)

    def _contains_banned_extension(self, url):
        """Returns true if the last url segment contains a banned extension"""
        last = self._last_segment(url)
        return any(extension in last for extension in self._extensions_banned)

    def _contains_banned_word(self, url):
        """Returns true if one of the url segments is exactly a banned word"""
        return any(word in url for word in self._banned_words)

    def _contains_foreign_alphabet(self, url):
        """Returns true if the url contains a character that is not LATIN or looks badly encoded (3 or more % chars)"""
        joined = "".join(url)
        has_not_latin = not self._ad.only_alphabet_chars(joined, "LATIN")
        has_bad_encoding = joined.count("%") >= 3
        return has_not_latin or has_bad_encoding

    def _website_part_is_ok(self, url):
        """Returns true if the website part looks like www.xxxxxxxx.aaa, where www and aaa are at least 3 characters long"""
        # For "scheme://host/path" split on "/", the host is at index 2. Shorter
        # lists (e.g. "mailto:..." or relative links) have no website part, so
        # they are rejected instead of raising IndexError.
        if len(url) < 3:
            return False
        labels = url[2].split(".")
        return len(labels[0]) >= 3 and len(labels[-1]) >= 3

    def _does_end_with_number(self, url):
        """Returns true if the last url segment is made only of digits"""
        return self._last_segment(url).isdigit()

    def _ends_with_something_not_allowed(self, url):
        """Returns true if the last url segment contains a disallowed ending"""
        last = self._last_segment(url)
        return any(end in last for end in self._not_allowed_endings)

    def _passes_all_checks(self, url, website):
        """Returns true if the split url survives every check"""
        # Same order as the checks were originally applied; `and` stops at the
        # first failing check.
        return (not self._contains_lang_code(url)
                and self._contains_website_name(url, website)
                and not self._contains_banned_extension(url)
                and not self._contains_banned_word(url)
                and not self._contains_foreign_alphabet(url)
                and not self._does_end_with_number(url)
                and self._website_part_is_ok(url)
                and not self._ends_with_something_not_allowed(url))

    def filter_urls(self, urls, website):
        """Returns the urls that pass every check, in their original order

        urls: iterable of url strings.
        website: name that must appear in at least one segment of a url; an
            empty string matches every url.
        """
        split_urls = (url.split("/") for url in urls)
        return ["/".join(segments) for segments in split_urls
                if self._passes_all_checks(segments, website)]

# Test

In [64]:
# Smoke test: run a single URL through the cleaner with the site name it must
# contain; the cell displays whatever the cleaner returns for it.
url_cleaner = URLCleaner()
url_cleaner.filter_urls(
    urls=["https://www.bigdatanews.datasciencecentral.com/"],
    website="datasciencecentral",
)

['https://www.bigdatanews.datasciencecentral.com/']

In [43]:
if __name__ == "__main__":
    # Report how many URLs of each kind the cleaner discards.

    # get urls that are not articles
    url_not_article = UtilsOS.read_json(Constants.path_to_url_not_article)

    # get urls that are articles
    # The loops below index `data` as data[website][domain] -> iterable of
    # samples, each exposing a "url" entry.
    data = ReaderScrapedData.read_data(Constants.path_to_scraper_config, Constants.path_to_articles)
    dataset_nested = [data[website][domain] for website in data.keys() for domain in data[website].keys()]
    dataset = list(itertools.chain.from_iterable(dataset_nested))
    url_article = [sample["url"] for sample in dataset]

    url_cleaner = URLCleaner()

    # (urls, header template, detail template) for each report stanza.
    report_specs = [
        (url_not_article,
         "URLs without article: {0}",
         "\tFiltered: {0} --> {1:.2f}% (better close to 100%)"),
        (url_article,
         "URLs with article: {0}",
         "\tFiltered: {0} --> {1:.2f}% (better close to 0%, but I may be removing wrong articles so it's ok)"),
    ]
    for urls, header, detail in report_specs:
        num_total = len(urls)
        # An empty website name matches every URL, so only the other checks apply.
        num_filtered = num_total - len(url_cleaner.filter_urls(urls, ""))
        # Guard against ZeroDivisionError when a list is empty; report 0% then.
        pct_filtered = num_filtered / num_total * 100 if num_total else 0.0
        print(header.format(num_total))
        print(detail.format(num_filtered, pct_filtered))

URLs without article: 20
	Filtered: 2 --> 10.00% (better close to 100%)
URLs with article: 32
	Filtered: 2 --> 6.25% (better close to 0%, but I may be removing wrong articles so it's ok)
