# Scraper

In [1]:
import nbimporter

from scraper_config_reader import ScraperConfigReader
from scraper_requests import ScraperRequests
from scraper_data_reader import ReaderScrapedData
from model_article_url_discriminator import URLCleaner
from utils_soup import UtilsSoup
from utils_os import UtilsOS

Importing Jupyter notebook from scraper_config_reader.ipynb
Importing Jupyter notebook from scraper_requests.ipynb
Importing Jupyter notebook from scraper_data_reader.ipynb
Importing Jupyter notebook from utils_os.ipynb
Importing Jupyter notebook from model_article_url_discriminator.ipynb
Importing Jupyter notebook from utils_soup.ipynb


In [2]:
import time
import json
from bs4 import BeautifulSoup
import random
import os

In [3]:
class Scraper:
    """Incremental crawler that saves the pages of a website that look like articles.

    Starting from a seed URL (or from URLs scraped in a previous run) the
    crawler follows the links found on every visited page.  A page is stored
    as an article when both the title and the content CSS selectors of the
    site configuration yield a non-empty text; otherwise its URL is recorded
    in a JSON blacklist so that it is not requested again.
    """
    # Links ending with one of these extensions are not HTML pages.
    _extensions_banned = [".jpg", ".png", ".zip", ".xml"]

    def __init__(self):
        pass

    def _manage_article_directory(self, directory, start_from_zero):
        """Make sure ``directory`` exists, wiping it first when ``start_from_zero`` is true."""
        exists_directory = UtilsOS.directory_exists(directory)
        if start_from_zero and exists_directory:
            UtilsOS.directory_remove(directory)
        UtilsOS.directory_maybe_create(directory)

    def _check_link_extension(self, link):
        """Return False when ``link`` ends with a banned extension (case-insensitive), True otherwise."""
        return not link.lower().endswith(tuple(self._extensions_banned))

    def _normalize_url(self, url):
        """Cut ``url`` at the first "?", "#", "&" or "@" and drop one trailing slash.

        Returns an empty string when nothing is left (for instance for the
        href "?page=2"); callers are expected to discard empty results.
        """
        for separator in ("?", "#", "&", "@"):
            url = url.split(separator)[0]
        # endswith() is safe on the empty string, unlike url[-1].
        if url.endswith("/"):
            url = url[:-1]
        return url

    def _clean_links(self, website, links, prefix_url):
        """Turn raw ``href`` values into absolute, normalized, de-duplicated URLs.

        Args:
            website: name of the website (currently unused, kept for interface compatibility).
            links: iterable of raw ``href`` strings.
            prefix_url: scheme and host of the site, e.g. "https://example.com".

        Returns:
            A list of unique URLs, in no particular order.
        """
        # "https:" for "https://example.com"; empty when the prefix carries no scheme.
        scheme = prefix_url.split("//")[0] if "//" in prefix_url else ""

        absolute = []
        for link in links:
            if scheme and link.startswith("//"):
                # protocol-relative link ("//cdn.example.com/x"): only the scheme is missing
                link = scheme + link
            elif link.startswith("/"):
                # fix "/index.html"
                link = prefix_url + link
            if "//" not in link:
                # fix "index.html"
                link = prefix_url + "/" + link
            absolute.append(link)

        # remove http parameters (after ?), then duplicates and empty leftovers
        normalized = (self._normalize_url(link) for link in absolute)
        return list({link for link in normalized if link})

    def _get_random_url_by_domain(self, data):
        """Return one randomly chosen URL for each domain of ``data``.

        ``data`` maps a domain to a list of dicts having a "url" key.
        Domains without any entry are skipped.
        """
        return [random.choice(entries)["url"] for entries in data.values() if entries]

    def scrape_website_incremental(self, website, config, max_scraped=-1, max_tries=-1, start_from_zero=False):
        """Incremental scraping of a website, according to a site configuration.

        Args:
            website: name of the website; articles are stored under "../articles/<website>".
            config: site configuration, indexed with the ``ScraperConfigReader``
                keys for the first URL (a list whose first element is the seed),
                the title selector and the content selector.
            max_scraped: stop once this many new articles have been saved in
                this call.  The default -1 never matches, i.e. no limit.
            max_tries: stop once this many pages have been processed without
                raising.  The default -1 never matches, i.e. no limit.
            start_from_zero: when true, an existing article directory is
                removed before scraping.
        """
        directory = "../articles/" + website

        # eventually clean "article" directory
        self._manage_article_directory(directory, start_from_zero)

        # create request handler and url cleaner
        scraper_requests = ScraperRequests()
        url_cleaner = URLCleaner()

        # parse already scraped data
        scraped_data = ReaderScrapedData.read_data_of_website(website)
        titles = set(ReaderScrapedData.get_titles(scraped_data))
        urls_by_domain = self._get_random_url_by_domain(scraped_data)  # one url for each domain of the selected website
        urls = ReaderScrapedData.get_urls(scraped_data)

        # load (or create) the list of urls already known not to be articles
        urls_not_articles_filename = "url_not_article.json"
        if not UtilsOS.file_exists(urls_not_articles_filename):
            UtilsOS.write_to_json([], urls_not_articles_filename)
        url_not_article = UtilsOS.read_json(urls_not_articles_filename)  # list of strings
        # The list is what gets written back to disk; the set gives fast membership tests.
        url_not_article_lookup = set(url_not_article)

        # decides from which url we start scraping
        first_url = config[ScraperConfigReader.first_url_key][0]
        if len(urls_by_domain) > 0:
            queue = urls_by_domain
        else:
            queue = [first_url]

        # extract prefix url ("https://host" out of "https://host/some/path")
        prefix_url = "/".join(first_url.split("/")[:3])

        # create the url black list
        already_considered = set(urls)
        already_considered.add(first_url)

        counter = 0
        counter_added = 0
        # articles are numbered after the ones saved by previous runs
        counter_delta_incremental = len(urls)

        while len(queue) > 0:
            # get url to visit
            url = queue.pop(0)

            # if we already know that this link does not correspond to an article, we don't visit it
            if url in url_not_article_lookup:
                continue

            print("Visiting " + url)
            print("URLs in queue: {0}".format(len(queue)))

            # visit url
            try:
                # make request
                response = scraper_requests.make_get(url)
                soup = BeautifulSoup(response.text, "html.parser")

                # get all outer links from url
                links = soup.find_all("a")
                links = [tag["href"] for tag in links if tag.has_attr("href")]

                # clean links
                links = self._clean_links(website, links, prefix_url)
                links = url_cleaner.filter_urls(links, website)

                # add links to queue if not already considered
                for link in links:
                    if link not in already_considered:
                        already_considered.add(link)
                        queue.append(link)

                # fill data
                data = {"url": url, "html": str(soup)}
                data["title_html"], data["title"] = UtilsSoup.get_with_selector(config[ScraperConfigReader.title_selector_key], soup)
                data["content_html"], data["content"] = UtilsSoup.get_with_selector(config[ScraperConfigReader.content_selector_key], soup)

                # eventually save article
                if data["title"] != "" and data["content"] != "":  # we save only if we got the necessary info
                    if data["title"] not in titles:
                        titles.add(data["title"])
                        counter_added += 1
                        print("{0} - Extracted article: ".format(counter_delta_incremental + counter_added) + data["title"])

                        # eventually create domain directory (the host part of the url)
                        sub_directory = directory + "/" + url.split("/")[2]
                        UtilsOS.directory_maybe_create(sub_directory)

                        # write to file
                        UtilsOS.write_to_json(data, sub_directory + "/" + str(counter_delta_incremental + counter_added) + '.json')

                        # eventually end scraping
                        if counter_added == max_scraped:
                            break
                    else:
                        print("Article already extracted: {0}".format(data["title"]))
                else:
                    # not an article: remember it so that later runs skip the request
                    url_not_article.append(data["url"])
                    url_not_article_lookup.add(data["url"])
                    UtilsOS.write_to_json(url_not_article, urls_not_articles_filename)

                counter += 1
                if counter == max_tries:
                    break

                # sleep...
                time.sleep(0.2)
            except Exception as e:
                # best effort: report the failure and move on to the next url
                print(e)
                print("------------")
                time.sleep(1)  # time to escape by KeyboardInterrupt
                continue

            print("------------")

    def incremental_scraping_from_configs(self, configs, max_scraped=10, max_tries=30):
        """Scrape every configured website in turn, over and over, until interrupted.

        Args:
            configs: mapping from website name to its site configuration.
            max_scraped: maximum number of new articles saved per website on each pass.
            max_tries: maximum number of pages processed per website on each pass.

        The websites are visited in a random order that is fixed once for all
        passes.  The method only returns when ``configs`` is empty.
        """
        items = list(configs.items())
        if not items:
            # nothing to scrape: avoid spinning forever on an empty loop
            return
        random.shuffle(items)
        while True:
            for website, conf in items:
                self.scrape_website_incremental(website, conf, max_scraped=max_scraped, max_tries=max_tries)

# Run scraping

In [4]:
if __name__ == "__main__":
    # Load the per-website scraper configurations from "scraper_configs.json"
    # (path relative to the current working directory).
    configs = ScraperConfigReader.get_configs("scraper_configs.json")
    scraper = Scraper()
    # Single-site variant, kept for manual experiments:
    #scraper.scrape_website_incremental("tutorialspoint", configs["tutorialspoint"], max_scraped=5, start_from_zero=False)
    # Cycle over all configured websites; the run is meant to be stopped by
    # interrupting the kernel (see the KeyboardInterrupt in the captured output).
    scraper.incremental_scraping_from_configs(configs)

Visiting https://medium.com
URLs in queue: 0
------------
Visiting https://medium.com/jobs-at-medium/work-at-medium-959d1a85284e
URLs in queue: 16
1 - Extracted article: Work at Medium  
------------
Visiting https://about.medium.com
URLs in queue: 24
------------
Visiting https://medium.com/s/story/done-with-the-ding-e8331d66b64e
URLs in queue: 33
2 - Extracted article: How to Silence the Persistent Ding of Modern Life  
------------
Visiting https://medium.com/3minread
URLs in queue: 34
------------
Visiting https://medium.com/elemental-by-medium
URLs in queue: 52
------------
Visiting https://help.medium.com
URLs in queue: 62
------------
Visiting https://humanparts.medium.com/riding-in-cars-with-black-boys-1b850402a7e9
URLs in queue: 61
3 - Extracted article: Riding in Cars With Black Boys  
------------
Visiting https://medium.com/creators
URLs in queue: 68
------------
Visiting https://medium.com/human-parts
URLs in queue: 75
------------
Visiting https://medium.com/one-zero
URLs

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



------------
Visiting https://www.tutorialspoint.com/scala/index.htm
URLs in queue: 1087
Traceback (most recent call last):
  File "/anaconda3/envs/education/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-b925f50f7376>", line 5, in <module>
    scraper.incremental_scraping_from_configs(configs)
  File "<ipython-input-3-97512358c713>", line 162, in incremental_scraping_from_configs
    self.scrape_website_incremental(website, conf, max_scraped=max_scraped)
  File "<ipython-input-3-97512358c713>", line 98, in scrape_website_incremental
    response = scraper_requests.make_get(url)
  File "scraper_requests.ipynb", line 52, in make_get
    "        session.proxies = {'http':  'socks5://127.0.0.1:9050',\n",
  File "/anaconda3/envs/education/lib/python3.5/site-packages/requests/sessions.py", line 525, in get
    return self.request('GET', url, **kwargs)
  File "/anaconda3/env

KeyboardInterrupt: 

# Tests

In [None]:
#scraped_data = ReaderScrapedData.read_data_of_website("wikihow")
#titles = set(ReaderScrapedData.get_titles(scraped_data))

In [None]:

import requests

# Ad-hoc check of a candidate configuration: fetch one live article and see
# whether the title selector extracts something sensible.
# NOTE(review): the dict keys are presumed to match the ScraperConfigReader
# *_key constants used below - confirm against scraper_config_reader.
config = {
    "first_url": ["https://mashable.com"],
    "title_selector": [".article-header > .title"],
    "content_selector": [".article-content p"]
}

# requests has no default timeout; without one an unresponsive server would
# block this cell (and the kernel) indefinitely.
response = requests.get("https://mashable.com/roundup/best-cheap-laptops-under-500", timeout=10)
soup = BeautifulSoup(response.text, "html.parser")
# get_with_selector returns a pair; the second element is the one Scraper stores as "title".
title = UtilsSoup.get_with_selector(config[ScraperConfigReader.title_selector_key], soup)[1]
title

In [None]:
#title in titles