In [48]:
# STANDARD
from pprint import pprint

# PyPI
from bs4 import BeautifulSoup as BS
import requests
from tqdm import tqdm

# LOCAL
from interface_db import db_interface_sqlite as data

In [49]:
class insert_results:
    """Tally of what happened to the records handled by one insert run.

    The three counters are plain public attributes that callers adjust
    directly (e.g. ``results.inserted += 1``).
    """

    def __init__(self, inserted=0, not_inserted=0, exceptions=0):
        """Start a tally, optionally seeded with non-zero counts.

        Arguments:
            inserted: starting value of the ``inserted`` counter
            not_inserted: starting value of the ``not_inserted`` counter
            exceptions: starting value of the ``exceptions`` counter

        """
        # One tuple assignment; every attribute mirrors its argument.
        (self.inserted,
         self.not_inserted,
         self.exceptions) = inserted, not_inserted, exceptions

In [50]:
def parse_html(url: str, timeout: float = 30.0) -> "BS":
    """Download a webpage and return it parsed by BeautifulSoup.

    HTTP error statuses are not raised here: whatever body the server
    sends back is parsed like any other page.

    Arguments:
        url (str): webpage address to be processed
        timeout (float): seconds to wait for the server before giving up;
            without a timeout ``requests`` can block indefinitely

    Return
        BeautifulSoup: parsed document tree (not a plain string), built
            with the ``html.parser`` backend

    Raises
        requests.exceptions.RequestException: if the request fails or
            times out

    """
    response = requests.get(url, timeout=timeout)
    # Pass the raw bytes so BeautifulSoup does its own encoding detection.
    return BS(response.content, 'html.parser')


def check_url_new(table: str, url: str) -> bool:
    """Report whether a URL is new, i.e. not yet stored in a database table.

    Arguments:
        table (str): database table to check against; it is interpolated
            into the SQL text as-is, so it must be a trusted identifier
        url (str): webpage address to look up in the table's ``url`` column

    Return
        bool: True if the url is NOT in the table yet (it is new);
            False if a matching row already exists.

    """
    # URLs come from scraped pages, so treat them as untrusted: doubling
    # single quotes keeps the value inside the SQL string literal.
    # NOTE(review): if data.database.execute() accepts bound parameters,
    # prefer them over manual escaping -- its signature is not visible here.
    escaped_url = str(url).replace("'", "''")
    with data.database() as db:
        existing_url = db.execute(
            f"SELECT url FROM {table} WHERE url = '{escaped_url}'"
        )
        return len(existing_url) == 0


def remove_smart_quotes(text):
    """Straighten curly single quotes and drop every double quote.

    Curly single quotes (U+2018, U+2019) become a plain apostrophe.
    Curly double quotes (U+201C, U+201D) are first turned into a plain
    ``"``, and then every plain ``"`` is removed from the text.
    """
    # Applied in order: the last rule also strips the straight double
    # quotes produced by the two rules before it.
    replacements = (
        ("\u2018", "'"),
        ("\u2019", "'"),
        ("\u201c", '"'),
        ("\u201d", '"'),
        ('"', ''),
    )
    cleaned = text
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [51]:
def fetch_and_insert_categories_from_website(website_name: str) -> insert_results:
    """Scrape a website's category links and store the ones not yet known.

    Looks the website up by name, downloads its page, and takes the first
    link inside every channel-list block as a category URL. The category
    name is the fourth ``/``-separated piece of that URL. A category is
    inserted only when its URL is not already in the ``categories`` table.

    Arguments:
        website_name (str): name handed to ``db.query_websites`` to find
            the site record (its ``name``, ``url`` and ``id`` are used)

    Return
        insert_results: ``inserted`` counts newly stored categories,
            ``not_inserted`` counts URLs that were already stored, and
            ``exceptions`` counts blocks skipped because they had no
            usable link (no ``<a href>`` or a URL too short to name).

    """
    results = insert_results()
    with data.database() as db:
        website = db.query_websites(website_name)
        print(f'Pulling categories for {website.name}')
        parsed_html = parse_html(website.url)
        for item in parsed_html.find_all('div', attrs={"class":"related-content clearfix related-content-sm decorated channel-list"}):
            link = item.find('a')
            href = link.get('href') if link is not None else None
            # 'https://host/buddhist-blogs' splits into
            # ['https:', '', 'host', 'buddhist-blogs'], so index 3 is the name.
            url_parts = href.split("/") if href else []
            if len(url_parts) < 4:
                # Malformed block: skip it instead of crashing the whole run.
                results.exceptions += 1
                continue

            # Fresh record per category so rows never share mutable state.
            category = data.category()
            category.url = href
            category.name = url_parts[3]
            category.website_id = website.id
            if check_url_new('categories', category.url):
                db.insert_category(category)
                results.inserted += 1
            else:
                # <placeholder for logging>
                results.not_inserted += 1
    return results

In [52]:
def fetch_and_insert_blogs_from_category(category_name: str) -> insert_results:
    """Scrape a category page's blog listings and store the ones not yet known.

    Looks the category up by name, downloads its page, and reads one blog
    from every ``author-info`` block: the title link gives the name and
    URL, the by-line link gives the author. A blog is inserted only when
    its URL is not already in the ``blogs`` table.

    Arguments:
        category_name (str): name handed to ``db.query_categories`` to find
            the category record (its ``name``, ``url`` and ``id`` are used)

    Return
        insert_results: ``inserted`` counts newly stored blogs,
            ``not_inserted`` counts URLs that were already stored, and
            ``exceptions`` counts entries skipped because they had no
            title link to take a URL from.

    """
    results = insert_results()
    with data.database() as db:
        category = db.query_categories(category_name)
        print(category.url)
        print(f'Pulling blogs for {category.name}')
        parsed_html = parse_html(category.url)
        for item in parsed_html.find_all('div', attrs={"class":"author-info"}):
            # If an entry carries several title links, the last usable one wins.
            title_link = None
            for title_html in item.find_all('div', attrs={"class":"title"}):
                candidate = title_html.find('a')
                if candidate is not None and candidate.get('href'):
                    title_link = candidate
            if title_link is None:
                # No URL to store; skip rather than reuse a previous entry's data.
                results.exceptions += 1
                continue

            # Fresh record per entry: a shared object would let a missing
            # by-line silently inherit the previous blog's author.
            # NOTE(review): without a by-line link, blog.author keeps whatever
            # data.blog() initialises it to -- confirm that default is acceptable.
            blog = data.blog()
            blog.name = remove_smart_quotes(title_link.get_text())
            blog.url = title_link['href']
            for by_line_html in item.find_all('div', attrs={"class":"by-line"}):
                author_link = by_line_html.find('a')
                if author_link is not None:
                    blog.author = author_link.get_text()
            blog.category_id = category.id
            # Extra diagnostic output for this one category page only.
            if category.url == 'https://www.patheos.com/new-visions-blogs':
                print(blog.name, blog.url, blog.author)
            if check_url_new('blogs', blog.url):
                db.insert_blog(blog)
                results.inserted += 1
            else:
                # <placeholder for logging>
                results.not_inserted += 1
    return results

In [54]:
# Refresh the stored categories for the site, then read back every category
# name currently in the database.
with data.database() as db:
    fetch_and_insert_categories_from_website('Patheos Blogs')
    category_names = db.execute('SELECT name FROM categories')

# Each result row is indexed at position 0 to get the ``name`` column.
names = [row[0] for row in category_names]

# First list every category ...
for category_name in names:
    print(category_name)

# ... then scrape each one, reporting "<inserted> <not_inserted>" per category.
for category_name in names:
    results = fetch_and_insert_blogs_from_category(category_name)
    print(results.inserted, results.not_inserted)

Pulling categories for Patheos Blogs
buddhist-blogs
catholic-blogs
contemplative-blogs
evangelical-blogs
general-christian-blogs
hindu-blogs
jewish-blogs
latter-day-saint-blogs
muslim-blogs
new-visions-blogs
nonreligious-blogs
pagan-blogs
progressive-christian-blogs
more-voices-blogs
sixseeds-family-blogs
politics-blue-blogs
politics-red-blogs
entertainment-blogs
faith-and-work-blogs
patheos-partner-blogs
https://www.patheos.com/buddhist-blogs
Pulling blogs for buddhist-blogs
5 0
https://www.patheos.com/catholic-blogs
Pulling blogs for catholic-blogs
88 0
https://www.patheos.com/contemplative-blogs
Pulling blogs for contemplative-blogs
8 0
https://www.patheos.com/evangelical-blogs
Pulling blogs for evangelical-blogs
83 0
https://www.patheos.com/general-christian-blogs
Pulling blogs for general-christian-blogs
18 0
https://www.patheos.com/hindu-blogs
Pulling blogs for hindu-blogs
4 0
https://www.patheos.com/jewish-blogs
Pulling blogs for jewish-blogs
6 0
https://www.patheos.com/latter-d