# Scraper

In [None]:
import nbimporter

from scraper_config_reader import ScraperConfigReader
from scraper_requests import ScraperRequests
from scraper_data_reader import ReaderScrapedData
from utils_soup import UtilsSoup
from utils_os import UtilsOS

In [None]:
import time
import json
from bs4 import BeautifulSoup
import random
import os

In [None]:
class Scraper:
    """Breadth-first, incremental scraper that saves extracted articles as JSON.

    Starting from a configured first URL (or from one already-scraped URL per
    domain when previous results exist), pages are visited in FIFO order. Every
    page contributes its outgoing links to the queue, and pages for which both
    the title and the content selector yield text are written to
    ``../articles/<website>/<domain>/<n>.json``.
    """

    # Links whose (normalized) URL ends with one of these extensions are skipped.
    _extensions_banned = [".jpg", ".png", ".zip"]

    def __init__(self):
        pass

    def _manage_article_directory(self, directory, start_from_zero):
        """Make sure ``directory`` exists, wiping it first if requested.

        Args:
            directory: path of the article directory of a website.
            start_from_zero: when true, an existing directory is removed
                before being (re)created.
        """
        if start_from_zero and UtilsOS.directory_exists(directory):
            UtilsOS.directory_remove(directory)
        UtilsOS.directory_maybe_create(directory)

    def _check_link_extension(self, link):
        """Return ``False`` if ``link`` ends with a banned extension, else ``True``.

        The comparison ignores case, so ``.JPG`` is rejected like ``.jpg``.
        """
        banned = tuple(extension.lower() for extension in self._extensions_banned)
        return not link.lower().endswith(banned)

    def _normalize_url(self, url):
        """Cut ``url`` at the first ``?``, ``#``, ``&`` or ``@`` and drop one trailing ``/``.

        An empty string is returned unchanged instead of raising ``IndexError``.
        """
        for separator in ("?", "#", "&", "@"):
            url = url.split(separator)[0]
        # endswith() is safe on "", unlike url[-1]
        if url.endswith("/"):
            url = url[:-1]
        return url

    def _absolutize_link(self, link, prefix_url):
        """Turn a site-relative ``link`` into an absolute URL based on ``prefix_url``."""
        if link.startswith("//"):
            # protocol-relative link ("//host/path"): only the scheme is missing
            scheme, separator, _ = prefix_url.partition("//")
            if separator:
                return scheme + link
        if link.startswith("/"):
            return prefix_url + link  # fix "/index.html"
        if "//" not in link:
            return prefix_url + "/" + link  # fix "index.html"
        return link

    def _clean_links(self, website, links, prefix_url):
        """Filter and normalize the raw ``href`` values found on a page.

        Args:
            website: substring every kept link must contain.
            links: iterable of raw ``href`` strings.
            prefix_url: ``scheme://host`` prepended to relative links.

        Returns:
            A list of unique, normalized, absolute URLs, in first-seen order.
        """
        cleaned = []
        for link in links:
            link = self._absolutize_link(link, prefix_url)
            if website not in link:  # must contain the name of the website
                continue
            # Normalize BEFORE the extension check so that "img.png?size=2"
            # is recognised as a banned ".png" link.
            link = self._normalize_url(link)
            if self._check_link_extension(link):
                cleaned.append(link)
        # remove duplicates while keeping a deterministic order
        return list(dict.fromkeys(cleaned))

    def _get_random_url_by_domain(self, data):
        """Return one randomly chosen URL for each domain in ``data``.

        Args:
            data: mapping ``domain -> list of records``; each record is a
                mapping with a ``"url"`` key.

        Returns:
            A list with one URL per domain; domains with no records are skipped
            (``random.choice`` would raise ``IndexError`` on an empty list).
        """
        return [random.choice(records)["url"] for records in data.values() if records]

    def scrape_website_incremental(self, website, config, max_scraped=-1, start_from_zero=False):
        """Incremental scraping of a website, according to a site configuration.

        Args:
            website: website name; used as the folder name under
                ``../articles/`` and as the substring followed links must contain.
            config: site configuration indexed with the ``ScraperConfigReader``
                keys ``first_url_key`` (a sequence whose first item is the start
                URL), ``title_selector_key`` and ``content_selector_key``.
            max_scraped: stop once this many new articles have been saved in
                this call. The default ``-1`` never equals the counter, so the
                crawl only ends when the queue is empty.
            start_from_zero: when true, the existing article directory of the
                website is removed before scraping.
        """
        directory = "../articles/" + website

        # eventually clean "article" directory
        self._manage_article_directory(directory, start_from_zero)

        # create request handler
        scraper_requests = ScraperRequests()

        # parse already scraped data
        scraped_data = ReaderScrapedData.read_data_of_website(website)
        titles = set(ReaderScrapedData.get_titles(scraped_data))
        urls_by_domain = self._get_random_url_by_domain(scraped_data)  # one url for each domain of the selected website
        urls = ReaderScrapedData.get_urls(scraped_data)

        # decide from which url(s) we start scraping: resume from known pages
        # when there are any, otherwise from the configured first url
        first_url = config[ScraperConfigReader.first_url_key][0]
        queue = urls_by_domain if len(urls_by_domain) > 0 else [first_url]

        # extract prefix url ("scheme://host")
        prefix_url = "/".join(first_url.split("/")[:3])

        # create the url black list
        already_considered = set(urls)
        already_considered.add(first_url)

        counter = 0  # articles saved during this call
        counter_delta_incremental = len(urls)  # offset so file numbers continue after previous runs

        while len(queue) > 0:
            # get url to visit
            url = queue.pop(0)
            print("Visiting " + url)
            print("URLs in queue: {0}".format(len(queue)))

            # visit url
            try:
                # make request
                response = scraper_requests.make_get(url)
                soup = BeautifulSoup(response.text, "html.parser")

                # get all outer links from url
                links = [tag["href"] for tag in soup.find_all("a") if tag.has_attr("href")]

                # clean links
                links = self._clean_links(website, links, prefix_url)

                # add links to queue if not already considered
                for link in links:
                    if link not in already_considered:
                        already_considered.add(link)
                        queue.append(link)

                # fill data
                data = {"url": url, "html": str(soup)}
                data["title_html"], data["title"] = UtilsSoup.get_with_selector(config[ScraperConfigReader.title_selector_key], soup)
                data["content_html"], data["content"] = UtilsSoup.get_with_selector(config[ScraperConfigReader.content_selector_key], soup)

                # eventually save article: only if we got the necessary info
                # and the title has not been stored before
                if data["title"] != "" and data["content"] != "" and data["title"] not in titles:
                    titles.add(data["title"])
                    counter += 1
                    article_number = counter_delta_incremental + counter
                    print("{0} - Extracted article: ".format(article_number) + data["title"])

                    # eventually create domain directory (the folder the file
                    # is written into, not the website directory created above)
                    sub_directory = directory + "/" + url.split("/")[2]
                    UtilsOS.directory_maybe_create(sub_directory)

                    # write to file
                    UtilsOS.write_to_json(sub_directory + "/" + str(article_number) + '.json', data)

                    # eventually end scraping
                    if counter == max_scraped:
                        break

                # sleep...
                time.sleep(0.2)
            except Exception as e:
                # best effort: report the failure and move on to the next url
                print(e)
                print("------------")
                time.sleep(1)  # time to escape by KeyboardInterrupt
                continue

            print("------------")

    def incremental_scraping_from_configs(self, configs, max_scraped=10):
        """Cycle forever over all configured websites, scraping a batch from each.

        Args:
            configs: mapping ``website name -> site configuration``.
            max_scraped: maximum number of new articles saved per website on
                each pass.

        The websites are shuffled once, then visited round-robin. Returns
        immediately when ``configs`` is empty, since looping would only spin
        without doing any work.
        """
        items = list(configs.items())
        if not items:
            return
        random.shuffle(items)
        while True:
            for website, conf in items:
                self.scrape_website_incremental(website, conf, max_scraped=max_scraped)

# Run scraping

In [None]:
if __name__ == "__main__":
    # Runs only when this notebook/module is executed directly, not on import.
    # Read the per-website scraping settings from "scraper_configs.json".
    configs = ScraperConfigReader.get_configs("scraper_configs.json")
    scraper = Scraper()
    # To scrape a single website instead of all of them, use for example:
    #   scraper.scrape_website_incremental("tutorialspoint", configs["tutorialspoint"], max_scraped=5, start_from_zero=False)
    # Default mode: scrape every configured website in turn.
    scraper.incremental_scraping_from_configs(configs=configs)