# Scraper

In [1]:
import nbimporter

from scraper_config_reader import ScraperConfigReader
from scraper_requests import ScraperRequests
from scraper_data_reader import ReaderScrapedData
from model_article_url_discriminator import URLCleaner
from utils_soup import UtilsSoup
from utils_os import UtilsOS
from constants import Constants

Importing Jupyter notebook from scraper_config_reader.ipynb
Importing Jupyter notebook from scraper_requests.ipynb
Importing Jupyter notebook from scraper_data_reader.ipynb
Importing Jupyter notebook from utils_os.ipynb
Importing Jupyter notebook from model_article_url_discriminator.ipynb
Importing Jupyter notebook from constants.ipynb
Importing Jupyter notebook from utils_soup.ipynb


In [2]:
import time
import json
from bs4 import BeautifulSoup
import random
import os
from datetime import datetime
from alphabet_detector import AlphabetDetector

In [3]:
class Scraper:
    """Incremental crawler that stores article pages of a website as JSON files.

    Starting from a seed URL (or from URLs of previously scraped articles),
    the scraper follows the links found on every visited page, extracts a
    title and a content with the selectors of the site configuration and
    writes every page that looks like an article to
    ``<path_to_articles>/<website>/<domain>/<n>.json``.
    """

    # File extensions that identify links which are not HTML pages.
    _extensions_banned = [".jpg", ".png", ".zip", ".xml"]

    # Characters after which the rest of a URL is discarded by _normalize_url.
    _url_cut_markers = ("?", "#", "&", "@")

    def __init__(self):
        pass

    def _manage_article_directory(self, directory, start_from_zero):
        """Make sure `directory` exists, wiping it first if `start_from_zero` is set."""
        exists_directory = UtilsOS.directory_exists(directory)
        if start_from_zero and exists_directory:
            UtilsOS.directory_remove(directory)
        UtilsOS.directory_maybe_create(directory)

    def _check_link_extension(self, link):
        """Return False when `link` ends with a banned extension, True otherwise.

        The comparison is case-sensitive. This helper is not called by the
        other methods of this class.
        """
        return not link.endswith(tuple(self._extensions_banned))

    def _normalize_url(self, url):
        """Strip parameters, fragments and one trailing slash from `url`.

        Everything from the first "?", "#", "&" or "@" on is dropped. An
        empty result is returned as an empty string instead of raising.
        """
        for marker in self._url_cut_markers:
            url = url.split(marker)[0]
        # endswith() is safe on the empty string, unlike url[-1]
        if url.endswith("/"):
            url = url[:-1]
        return url

    def _to_absolute(self, link, prefix_url):
        """Turn a root-relative, protocol-relative or bare link into an absolute URL."""
        if link.startswith("//"):
            # protocol-relative link ("//host/path"): reuse the scheme of the site
            link = prefix_url.split("//")[0] + link
        elif link.startswith("/"):
            # fix "/index.html"
            link = prefix_url + link
        if "//" not in link:
            # fix "index.html"
            link = prefix_url + "/" + link
        return link

    def _clean_links(self, website, links, prefix_url):
        """Return the distinct, absolute, normalized versions of `links`.

        Args:
            website: unused, kept for interface compatibility.
            links: raw ``href`` values.
            prefix_url: scheme and host of the site, e.g. ``"https://x.com"``.

        Returns:
            A list without duplicates (in arbitrary order). Links that become
            empty after normalization are dropped.
        """
        normalized = (self._normalize_url(self._to_absolute(link, prefix_url)) for link in links)
        return list({link for link in normalized if link})

    def _ok_title(self, title, ad):
        """Tell whether `title` has 2 to 20 space-separated words and passes the "LATIN" check of `ad`."""
        num_words = len(title.split(" "))
        ok_num_words = 2 <= num_words <= 20
        ok_alphabet = ad.only_alphabet_chars(title, "LATIN")
        return ok_num_words and ok_alphabet

    def _ok_content(self, content, ad):
        """Tell whether `content` has at least 100 space-separated words and passes the "LATIN" check of `ad`."""
        num_words = len(content.split(" "))
        ok_num_words = num_words >= 100
        ok_alphabet = ad.only_alphabet_chars(content, "LATIN")
        return ok_num_words and ok_alphabet

    def _get_random_url_by_domain(self, data):
        """Pick one random url for each domain of `data`.

        Args:
            data: mapping whose values are sequences of dicts having a
                ``"url"`` key.

        Returns:
            A list with one url per non-empty domain; domains without entries
            are skipped instead of making ``random.choice`` fail.
        """
        return [random.choice(entries)["url"] for entries in data.values() if entries]

    def _extract_links(self, soup, website, prefix_url, url_cleaner):
        """Collect the ``href`` of every anchor in `soup`, cleaned and filtered by `url_cleaner`."""
        anchors = soup.find_all("a")
        print(len(anchors))
        links = [tag["href"] for tag in anchors if tag.has_attr("href")]
        links = self._clean_links(website, links, prefix_url)
        links = url_cleaner.filter_urls(links, website)
        print(len(links))
        return links

    def _build_article_data(self, url, soup, website, config):
        """Build the record that is saved for a visited page."""
        data = {"url": url, "html": str(soup)}
        data["title_html"], data["title"] = UtilsSoup.get_with_selector(
            config[ScraperConfigReader.title_selector_key], soup)
        data["content_html"], data["content"] = UtilsSoup.get_with_selector(
            config[ScraperConfigReader.content_selector_key], soup)
        data["timestamp_scraper"] = datetime.today().timestamp()
        data["website"] = website
        return data

    def scrape_website_incremental(self, website, config, path_to_articles, path_to_url_not_article,
                                   max_scraped=-1, max_tries=-1, start_from_zero=False):
        """Incremental scraping of a website, according to a site configuration.

        Args:
            website: name of the website; also the name of its directory
                under `path_to_articles`.
            config: site configuration, read through the keys of
                ``ScraperConfigReader`` (first url, title and content selectors).
            path_to_articles: root directory of the scraped articles.
            path_to_url_not_article: JSON file holding the list of urls
                already known not to be articles; created when missing.
            max_scraped: stop after this many new articles (-1: no limit).
            max_tries: stop after this many successfully processed pages
                (-1: no limit).
            start_from_zero: remove the existing articles of the website
                before starting.

        Errors raised while processing a page are printed and the page is
        skipped.
        """
        directory = path_to_articles + "/" + website

        # eventually clean "article" directory
        self._manage_article_directory(directory, start_from_zero)

        # create request handler, url cleaner and alphabet detector
        scraper_requests = ScraperRequests()
        url_cleaner = URLCleaner()
        ad = AlphabetDetector()

        # parse already scraped data
        scraped_data = ReaderScrapedData.read_data_of_website(website, path_to_articles)
        titles = set(ReaderScrapedData.get_titles(scraped_data))
        # one url for each domain of the selected website
        urls_by_domain = self._get_random_url_by_domain(scraped_data)
        urls = ReaderScrapedData.get_urls(scraped_data)
        if not UtilsOS.file_exists(path_to_url_not_article):
            UtilsOS.write_to_json([], path_to_url_not_article)
        url_not_article = UtilsOS.read_json(path_to_url_not_article)  # list of strings
        # set mirror of the list above, for constant-time membership tests
        known_not_article = set(url_not_article)

        # decides from which url we start scraping
        first_url = config[ScraperConfigReader.first_url_key][0]
        queue = urls_by_domain if len(urls_by_domain) > 0 else [first_url]

        # extract prefix url ("scheme://host")
        prefix_url = "/".join(first_url.split("/")[:3])

        # create the url black list
        already_considered = set(urls)
        already_considered.add(first_url)

        counter = 0
        counter_added = 0
        counter_delta_incremental = len(urls)

        while queue:
            # get url to visit
            url = queue.pop(0)

            # if we already know that this link does not correspond to an article, we don't
            # visit it, unless it is the last url left (its links may refill the queue)
            if url in known_not_article and queue:
                continue

            print("Visiting " + url)
            print("URLs in queue: {0}".format(len(queue)))

            # visit url
            try:
                # make request
                response = scraper_requests.make_get(url)
                soup = BeautifulSoup(response.text, "html.parser")

                # get all outer links from url, clean them and add to queue
                for link in self._extract_links(soup, website, prefix_url, url_cleaner):
                    if link not in already_considered:
                        already_considered.add(link)
                        queue.append(link)

                # fill data
                data = self._build_article_data(url, soup, website, config)

                ok_title = self._ok_title(data["title"], ad)
                ok_content = self._ok_content(data["content"], ad)

                # eventually save article
                if ok_title and ok_content:  # we save only if we got the necessary info
                    if data["title"] not in titles:
                        titles.add(data["title"])
                        counter_added += 1
                        article_number = counter_delta_incremental + counter_added
                        print("{0} - Extracted article: ".format(article_number) + data["title"])

                        # eventually create domain directory
                        sub_directory = directory + "/" + url.split("/")[2]
                        UtilsOS.directory_maybe_create(sub_directory)

                        # write to file
                        UtilsOS.write_to_json(data, sub_directory + "/" + str(article_number) + '.json')

                        # eventually end scraping
                        if counter_added == max_scraped:
                            break
                    else:
                        print("Article already extracted: {0}".format(data["title"]))
                elif data["url"] not in known_not_article:
                    # remember the url, without storing it twice
                    known_not_article.add(data["url"])
                    url_not_article.append(data["url"])
                    UtilsOS.write_to_json(url_not_article, path_to_url_not_article)
                counter += 1
                if counter == max_tries:
                    break

                # sleep...
                time.sleep(0.2)
            except Exception as e:
                print(e)
                print("------------")
                time.sleep(1)  # time to escape by KeyboardInterrupt
                continue

            # shuffle queue
            random.shuffle(queue)

            print("------------")

    def incremental_scraping_from_configs(self, configs, path_to_articles, path_to_url_not_article, max_scraped=10, max_tries=30):
        """Scrape every configured website in turn, over and over.

        Args:
            configs: mapping from website name to its site configuration.
            path_to_articles: root directory of the scraped articles.
            path_to_url_not_article: JSON file with the urls known not to be
                articles.
            max_scraped: maximum number of new articles per website and round.
            max_tries: maximum number of processed pages per website and round.

        The websites are visited in a random order that is fixed once at the
        start. The method returns immediately when `configs` is empty and
        otherwise loops until it is interrupted.
        """
        items = list(configs.items())
        if not items:
            # nothing to scrape: avoid spinning forever on an empty loop
            return
        random.shuffle(items)
        while True:
            for website, conf in items:
                self.scrape_website_incremental(website, conf, path_to_articles, path_to_url_not_article,
                                                max_scraped=max_scraped, max_tries=max_tries)

# Run scraping

In [None]:
if __name__ == "__main__":
    # Load the site configurations and scrape all of them in rounds
    # (the call below keeps running until the cell is interrupted).
    configs = ScraperConfigReader.get_configs(
        Constants.path_to_scraper_config,
        need_javascript=False,
    )
    scraper = Scraper()
    scraper.incremental_scraping_from_configs(
        configs,
        Constants.path_to_articles,
        Constants.path_to_url_not_article,
        max_tries=40,
    )

In [None]:
if __name__ == "__main__":
    # Scrape a single website, with no limit on articles or visited pages.
    configs = ScraperConfigReader.get_configs(
        Constants.path_to_scraper_config,
        need_javascript=False,
    )
    scraper = Scraper()
    website = "techcrunch"
    scraper.scrape_website_incremental(
        website,
        configs[website],
        Constants.path_to_articles,
        Constants.path_to_url_not_article,
    )

# Tests

In [28]:
import requests

# Manual check of the title selector on one live article page.
website = "vox"
url = "https://www.vox.com/2019/5/3/18307660/climate-change-green-new-deal-bill-mckibben-falter4"
configs = ScraperConfigReader.get_configs(Constants.path_to_scraper_config, need_javascript=False)

# download and parse the page
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# the second element of the returned pair is kept as the title
title_selector = configs[website][ScraperConfigReader.title_selector_key]
title = UtilsSoup.get_with_selector(title_selector, soup)[1]
title

'Bill McKibben has been sounding the climate alarm for decades. Here’s his best advice.  '

In [29]:
# Same check for the content selector, on the page parsed in the previous cell.
content_selector = configs[website][ScraperConfigReader.content_selector_key]
UtilsSoup.get_with_selector(content_selector, soup)[1]

'\n One of the first writers to sound the alarm on climate change was Bill McKibben.   \n His 1989 book,  The End of Nature   , introduced a mainstream audience to the problem of rising greenhouse gas emissions, and propelled him to eventually form the international environmental group  350.org   in 2007.   \n McKibben’s latest book,  Falter   ,   is a depressing vindication of his first one. Thirty years ago, he warned that human beings were altering the planet in such a way that we would imperil our own existence. Today, he says, “we are even deeper in the hole.”  \n There is now no conceivable way to stop climate change; at best, it’s a battle to mitigate its impact — and we’re quickly running out of time. Even worse, McKibben writes, “because of the way power and wealth are currently distributed on our planet ... we’re uniquely ill-prepared to cope with the emerging challenges.”  \n Despite the gloomy tone, McKibben’s book ends with an affirmation of the power of activism and nonvi

In [None]:
# Display the parsed page, to inspect its markup when tuning the selectors.
soup