**Async and Multi-process crawing is much much faster**. I initially wrote the engadget crawer as a single threaded class. Because the python `requests` library is synchronous, the crawler spent virtually all time waiting for the `GET` requests.
    
    This could be made a *lot* faster by parallelizing the crawling, or use proper async pattern. 

    This thought came to me pretty late during the second crawl so I did not implement it. But for future work, parallel and async crawler is going to be on the todo list.
    
    
## TODO

- [ ] use async pattern for the requests, so that we don't spend 90% of the time waiting for `GET` request to finish.
- [ ] use multiple-threads to craw.

In [1]:
from bs4 import BeautifulSoup
import requests
from IPython.display import HTML
import re
import pickle

from IPython.display import clear_output

In [2]:
page = requests.get('https://www.engadget.com/2017/03/24/razers-paid-to-play-program-bribes-gamers-to-use-its-cortex-s/')
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
soup.prettify()

In [4]:
html = ""
for n in soup.findAll('div', attrs={"class":"article-text"}):
    html += str(n)
    
HTML(html)

In [None]:
for a in soup.findAll('a'):
    mask = re.compile('https:\/\/www.engadget.com\/\d\d\d\d\/\d\d\/\d\d\/(.*)');
    # print(mask.match(a['href'] ) )
    if mask.match(a['href']) is not None:
        print(a)

In [None]:
class Scrape():
    def __init__(self, folder):
        self.cache_file_name = folder + ".scrape.progress"
        self.post_url_mask = re.compile('https:\/\/www.engadget.com\/(\d\d\d\d\/\d\d\/\d\d)\/([^?\/:]*)(.*)');
        self.folder = folder
        self.active = []
        self.done = []
        self.i = 0
        
        
    def scrape(self, url):
        if url in self.done: return
        self.done.append(url)
        
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        
        if self.post_url_mask.match(url):
            html = ""
            for n in soup.findAll('div', attrs={"class":"article-text"}):
                html += str(n)

            match = self.post_url_mask.match(url)
            title = match.group(2)
            date = match.group(1)

            print(date + ":  " + title)

            with open(self.folder + title + '.txt', 'w+') as f:
                soup = BeautifulSoup(html.replace('</p>', '\n\n</p>').replace('  ', ' '), 'html.parser')
                f.write(soup.getText())
        
        for a in soup.findAll('a'):
            try: 
                if mask.match(a['href']) is not None:
                    self.add_link(a['href'])
            except KeyError:
                print('Warning: anchor has empty href.')
            
    def craw(self, url, once=False):
        if url is not None:
            self.active.append(url)

        while len(self.active) > 0:
            self.i += 1
            url = self.active.pop(0)
            self.scrape(url)
            self.done.append(url)
            
            if once:
                break
                
            if self.i%50 == 49:
                clear_output(wait=True)
                print("... #{:d}".format(self.i))
        
    def add_link(self, link):
        if link in self.done or link in self.active:
            return
        self.active.append(link)
            
    def __enter__(self, *args):
        try:
            with open(self.cache_file_name, 'rb') as f:
                
                print(self.cache_file_name)
                cache = pickle.load(f)
                self.active = cache['active'] or []
                self.done = cache['done'] or []
                self.i = cache['i'] or 0
                
        except FileNotFoundError:
            print('no progress file found')
        except EOFError:
            print('Warning: file is empty')
        except AttributeError as err:
            print('cache does not have key ' + str(err))
        
        return self
            
    def __exit__(self, *args):
        print('scrape has exited')
        with open(self.cache_file_name, 'wb') as f:
            print(self.active, self.done)
            pickle.dump({
                "active": self.active,
                "done": self.done,
                "i": self.i
            }, f)
        print('scrape state has been saved')
        
with Scrape('./engadget_data/') as s:
    
    s.craw('https://www.engadget.com')
    #s.craw('https://www.engadget.com/2010/06/23/google-wins-youtube-copyright-case-against-viacom/')

... #50749
2008/10/15:  panasonic-bringing-blu-ray-recorder-to-us-in-2009
2008/10/23:  humax-announces-320gb-freesat-foxsat-hdr-for-uk-market
2008/11/21:  humaxs-320gb-freesat-foxsat-hdr-now-on-sale
2008/03/07:  future-shop-offers-trade-in-credit-for-hd-dvd-players-will-dona
2007/12/24:  queens-christmas-day-speech-broadcast-in-hd-on-youtube
2008/03/20:  viddyou-takes-online-video-sharing-to-1080p
2008/03/19:  time-warner-cable-signs-up-for-march-madness-vod
2008/03/03:  comcast-to-offer-up-ncaa-march-madness-more-films-on-hd-vod
2008/02/28:  bring-it-cbs-to-offer-march-madness-hd-vod
2008/04/17:  universals-blu-ray-release-plans-uncovered-details-later-today
2008/04/17:  nbc-universal-brings-heroes-to-blu-ray-august-26
2006/10/10:  sooloos-to-store-your-lossless-tunes
2008/03/12:  frances-numericable-rolls-docsis-3-0-network-to-five-new-cities
2008/10/01:  panasonics-ez-touch-multitouch-remote-control-concept-hands-on
2008/02/20:  panasonic-intros-dmr-br500-dmr-xw320-and-dmr-xw120-hdd

In [None]:
# with open("./.scrape.progress", 'wb') as f:
#     cache = pickle.dump({"active":[], 'done':[]}, f)
#     print(cache)

In [None]:
with open("./engadget_data/.scrape.progress", 'rb') as f:
    cache = pickle.load(f)
    print(cache)

{'active': [], 'done': ['https://www.engadget.com', 'https://www.engadget.com', 'https://www.engadget.com/2017/03/24/how-to-adult-at-security/', 'https://www.engadget.com/2017/03/24/how-to-adult-at-security/', 'https://www.engadget.com/2016/02/19/hospital-ransomware-a-chilling-wake-up-call/', 'https://www.engadget.com/2016/02/19/hospital-ransomware-a-chilling-wake-up-call/', 'https://www.engadget.com/2016/10/28/that-time-your-smart-toaster-broke-the-internet/', 'https://www.engadget.com/2016/10/28/that-time-your-smart-toaster-broke-the-internet/', 'https://www.engadget.com/2017/02/24/how-used-cars-became-a-security-nightmare/', 'https://www.engadget.com/2017/02/24/how-used-cars-became-a-security-nightmare/', 'https://www.engadget.com/2016/02/15/hollywood-hospital-ransomware-attack/', 'https://www.engadget.com/2016/02/15/hollywood-hospital-ransomware-attack/', 'https://www.engadget.com/2016/01/08/you-say-advertising-i-say-block-that-malware/', 'https://www.engadget.com/2016/01/08/you-sa