In [51]:
import hashlib
import json
import os
from collections import deque
from urllib.parse import urljoin
from urllib.parse import urlparse

import bs4
import requests
from tqdm.notebook import tqdm

In [53]:
def extract_all_links(page_soup, news_domain, site):
    """Collect the news-article links found on a parsed page.

    An anchor is kept when its ``href`` contains ``news_domain`` and ends with
    ``.html``. Every kept href is resolved against ``site``, so root-relative
    (``/news/a.html``, ``/sport/news/a.html``) and protocol-relative
    (``//host/news/a.html``) links come back as absolute URLs. Hrefs that are
    already absolute are returned unchanged.

    Args:
        page_soup: Parsed page exposing ``find_all('a')`` whose items expose
            ``get('href')`` (e.g. a BeautifulSoup document).
        news_domain: Substring an href must contain, e.g. ``'/news/'``.
        site: ``scheme://netloc`` of the page the links were found on.

    Returns:
        list[str]: Matching links in document order; duplicates are kept.
    """
    page_links = []
    for link in page_soup.find_all('a'):
        href = link.get('href')
        if not href:
            # Anchors without a target (or with an empty one) are ignored.
            continue
        if news_domain in href and href.endswith('.html'):
            # urljoin leaves absolute URLs alone and anchors everything else
            # to the site root, so the caller always gets a fetchable URL.
            page_links.append(urljoin(site, href))
    return page_links

In [48]:
def parse_page(url, news_domain, parent_path, save=True, timeout=10):
    """Download one page, extract its text and news links, and optionally save it.

    Args:
        url: Address of the page to fetch.
        news_domain: Substring identifying news links; forwarded to
            ``extract_all_links``.
        parent_path: Directory under which a per-page folder (named after the
            MD5 hash of ``url``) is created when ``save`` is true.
        save: When true, write the document to
            ``<parent_path>/<hash>/news.json``.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        tuple: ``(doc, path)``. ``doc`` is a dict with the keys ``url``,
        ``title``, ``hash``, ``text`` and ``links``; ``path`` is the folder the
        document was written to, or ``None`` when ``save`` is false. Returns
        ``(None, None)`` when the request fails or the status code is not 200.
    """
    # Fetch exactly once, inside the try block, so that every network error
    # (including timeouts) is reported and skipped rather than propagated.
    try:
        resp = requests.get(url, timeout=timeout)
    except Exception as e:
        print(f"An exception[{e}] occurred during fetching of '{url}'")
        return None, None
    if resp.status_code != 200:
        return None, None

    soup = bs4.BeautifulSoup(resp.content, 'html.parser')

    parsed_url = urlparse(url)
    site = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # Pages without a <title> element get an empty title instead of crashing.
    title_tag = soup.title
    doc = {
        'url': url,
        'title': title_tag.text if title_tag is not None else '',
        'hash': hashlib.md5(url.encode('utf-8')).hexdigest(),
        'text': soup.get_text('\n'),
        'links': extract_all_links(soup, news_domain, site),
    }

    # Stays None unless the document is actually written to disk.
    path = None
    if save:
        path = os.path.join(parent_path, doc['hash'])
        os.makedirs(path, exist_ok=True)
        doc_path = os.path.join(path, 'news.json')
        with open(doc_path, 'w', encoding='utf-8') as doc_file:
            json.dump(doc, doc_file, indent=4)
    return doc, path

In [46]:
def link_search(url, news_domain, parent_path, max_depth, max_amount, mode='dfs'):
    """Crawl the pages reachable from ``url`` and return everything fetched.

    Args:
        url: Page the crawl starts from (depth 0).
        news_domain: Link filter forwarded to ``parse_page``.
        parent_path: Output directory forwarded to ``parse_page``.
        max_depth: Links found on a page are followed only while their depth
            stays below this value; ``None`` disables the limit.
        max_amount: Upper bound on the number of distinct URLs attempted
            (failed fetches count too); ``None`` disables the limit.
        mode: ``'dfs'`` takes the most recently queued link next; any other
            value takes the oldest queued link (breadth-first).

    Returns:
        dict: Maps every attempted URL to its parsed document, or to ``None``
        when the page could not be fetched.
    """
    queue = deque([(url, 0)])
    visited = dict()
    pbar = tqdm(total=len(queue))
    try:
        # The cap is checked before every fetch: links queued by earlier pages
        # must not push the number of attempted URLs past max_amount.
        while queue and (max_amount is None or len(visited) < max_amount):
            u, depth = queue.pop() if mode == 'dfs' else queue.popleft()
            if u in visited:
                pbar.update(1)
                continue

            doc, _ = parse_page(u, news_domain, parent_path)
            pbar.update(1)
            visited[u] = doc
            if doc is None:
                continue

            new_depth = depth + 1
            if max_depth is not None and new_depth >= max_depth:
                continue

            new_links = [(link, new_depth) for link in doc['links']]
            if max_amount is not None:
                # Never queue more links than the remaining budget allows.
                new_links = new_links[:max(max_amount - len(visited), 0)]
            queue.extend(new_links)
            pbar.total += len(new_links)
            pbar.refresh()

        if queue:
            # Stopped by max_amount: entries left in the queue will never be
            # processed, so remove them from the progress-bar total.
            pbar.total = pbar.n
            pbar.refresh()
    finally:
        # Close the bar even when a page raises unexpectedly.
        pbar.close()
    return visited

In [10]:
def crawl_news_site(mode='dfs', max_depth=5, news_path="news", max_amount=None,
                    start_url='https://www.dailymail.co.uk/news/index.html',
                    news_domain='/news/'):
    """Crawl a news site with ``link_search``.

    Args:
        mode: Traversal order forwarded to ``link_search`` (``'dfs'`` for
            depth-first, any other value for breadth-first).
        max_depth: Depth limit forwarded to ``link_search``.
        news_path: Directory handed to ``link_search`` as the output location.
        max_amount: Page-count limit forwarded to ``link_search``; ``None``
            means unlimited.
        start_url: Page the crawl starts from.
        news_domain: Substring a link must contain to be followed.

    Returns:
        None.
    """
    # `mode` must be passed on explicitly; otherwise link_search falls back to
    # its own default and a requested 'bfs' crawl silently runs depth-first.
    link_search(start_url, news_domain, news_path, max_depth, max_amount, mode=mode)

In [56]:
# Depth-first crawl (the default mode) with max_depth=3; pages are saved under "news".
crawl_news_site(max_depth=3)

  0%|          | 0/1 [00:00<?, ?it/s]

In [57]:
# Request a breadth-first crawl with max_depth=3, saving pages under "news_bfs".
# NOTE(review): `mode` only has an effect if crawl_news_site forwards it to link_search — confirm.
crawl_news_site(max_depth=3, mode='bfs', news_path="news_bfs")

  0%|          | 0/1 [00:00<?, ?it/s]