In [None]:
from bs4 import BeautifulSoup
from collections import Counter
from datetime import datetime
import json
import numpy as np
import pickle
import requests

## Create Functions for Scraping

In [None]:
def make_request(url, params=None, timeout=30):
    """
    Performs a GET request and parses the response body. Returns HTML.

    url: the address to request.
    params: optional dictionary of query-string parameters forwarded to requests.get.
    timeout: number of seconds to wait on the server before requests raises a
        Timeout exception. Without it a stalled connection would block forever.

    Returns the response content parsed by BeautifulSoup with 'html.parser'.
    """
    # A browser-like User-Agent header is sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    # Request the HTML and parse the content.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    return BeautifulSoup(response.content, 'html.parser')

In [None]:
def request_containers(author, year):
    """
    Fetch an author's listing page filtered to a single year and collect the article cards.

    Returns a list where each item is the HTML container holding one article's metadata.
    """
    # The year is sent as the 'selected_year' query parameter of the author page.
    author_page = 'https://www.econlib.org/author/{}/'.format(author)
    query = {'selected_year': year}
    listing = make_request(url=author_page, params=query)

    # Each post sits in a div carrying the 'min-card-posts-container' class.
    return listing.find_all('div', {'class': 'min-card-posts-container'})

In [None]:
def request_article(url):
    """
    Request a single article and pull out its body paragraphs and its label.

    Returns a tuple: (list of the 'p' tags found inside the 'post-content' div,
    stripped text of the 'article-label' div, or '' when that div is absent).

    Raises AttributeError when the page has no 'post-content' div, because
    find() returns None in that case.
    """
    html = make_request(url=url)

    # Post content is a list of paragraph <p> tags.
    article_content = html.find('div', attrs={"class": "post-content"}).find_all('p')

    # The label is optional. Check for the missing div explicitly instead of
    # hiding every possible error behind a bare except.
    label_tag = html.find('div', attrs={"class": "article-label"})
    article_label = label_tag.text.strip() if label_tag is not None else ''

    return article_content, article_label

In [None]:
def extract_authors():
    """
    Collect every author listed on the EconLog author index page.

    Returns a dictionary that maps the text of each author link (for example
    'Caplan, Bryan') to the fragment following '#' in that link's href.
    """
    index_page = make_request(url='https://www.econlib.org/econlog-author')

    authors = {}
    for cell in index_page.find_all('div', {'class': 'title-cell'}):
        # The first anchor in the cell carries both the display name and the user name.
        link = cell.find('a')
        user_name = link.get('href').split('#')[1]
        authors[link.text] = user_name

    return authors

In [None]:
def extract_years(author):
    """
    List the years offered in the year dropdown of an author's page.

    Returns a list of integers, one per link in the first dropdown menu, in page order.
    """
    profile = make_request(url='https://www.econlib.org/author/{}'.format(author))

    # Only the first matching dropdown menu is read; each of its links is a year.
    year_menu = profile.find_all('div', {'class': "dropdown-menu dropdown-menu-right"})[0]

    return [int(link.text) for link in year_menu.find_all('a')]

In [None]:
def extract_metadata(container):
    """
    Read the URL, title and posting date out of one article container.

    The date text is parsed with the '%b %d %Y' format and re-emitted as a
    'mm/dd/YYYY' string. Returns a dictionary with the keys 'url', 'title' and 'date'.
    """
    # The first anchor in the container holds both the link and the title.
    link = container.find('a')

    # The posting date lives in the span with the 'min-card-date' class.
    posted = container.find('span', {'class': 'min-card-date'}).text
    parsed = datetime.strptime(posted, '%b %d %Y')

    return {
        'url': link.get('href'),
        'title': link.text,
        'date': parsed.strftime("%m/%d/%Y"),
    }

In [None]:
def extract_urls(article):
    """
    Gather the link targets found inside an article's 'p' tags.

    Returns a set holding the 'href' value of every anchor, so duplicates collapse
    and an anchor without an href contributes None.
    """
    return {
        anchor.get('href')
        for paragraph in article
        for anchor in paragraph.find_all('a')
    }

In [None]:
def extract_text(article):
    """
    Extract the article's text from the 'p' tags. Return a document (string).

    article: iterable of paragraph tags; the `.text` of each one is read.

    Returns the paragraph texts joined with single spaces ('' for an empty article).
    """
    # Iterate over the argument itself rather than a module-level variable, so
    # the function works no matter what the caller named its data.
    return " ".join(p_tag.text for p_tag in article)

In [None]:
def count_words(document):
    """
    Count the words in a document.

    Apostrophes are removed, the text is lower-cased and then split on whitespace.
    Returns a tuple: (total number of words, list of (word, count) pairs ordered
    from most to least common).
    """
    tokens = document.replace("'", '').lower().split()

    # most_common() with no argument returns every distinct word with its frequency.
    word_list = Counter(tokens).most_common()

    # The total is the sum of the per-word frequencies.
    word_count = np.sum([frequency for _, frequency in word_list])

    return word_count, word_list

### Scrape EconLog

In [None]:
# Build the {author name: HTML user-name} lookup for everyone who has published on EconLog.
published_authors = extract_authors()

# Pick out Bryan Caplan's HTML user-name; names are keyed as 'Last, First'.
author = published_authors['Caplan, Bryan']

# With the user-name in hand, list the years offered on the author's page.
published_years = extract_years(author)

In [None]:
# Build {year: [metadata dictionary, ...]} for every year the author published in.
# Each HTML container yields one dictionary with the article's URL, title and date.
article_metadata = {}

for year in published_years:
    html_containers = request_containers(author=author, year=year)

    article_metadata[year] = [
        extract_metadata(container=container) for container in html_containers
    ]

In [None]:
for year, articles in article_metadata.items():
    # Print the year as a simple progress indicator.
    print(year)
    for article in articles:
        # Download the post, returning a tuple: (list of HTML 'p' tags, the article's label).
        content, label = request_article(url=article['url'])

        # Derive the features from the paragraph tags: links, plain text, word statistics.
        embedded_urls = extract_urls(article=content)
        text = extract_text(article=content)
        word_count, word_list = count_words(document=text)

        # `article` is the very dictionary stored in article_metadata[year], so
        # updating it in place adds the features to the stored record.
        article.update({
            'label': label,
            'word_count': word_count,
            'num_embedded_urls': len(embedded_urls),
            'word_list': word_list,
            'embedded_urls': embedded_urls,
            'document': text,
        })

2021
2020
2019
2018
2017
2016
2015
2014
2013
2012
2011
2010
2009
2008
2007
2006
2005


In [None]:
# Export article content.

# Note on JSON: JSON forces values to str, but it is human readable. Pass the
# "default=str" option to avoid issues with JSON serializing datetime objects.
# with open('article_content.json', 'w') as f:
#     json.dump(article_metadata, f)#, default=str)

# Note on Pickle: pickle maintains the data types, although BS4 objects cause
# issues. All BS4 objects have been removed in this iteration of the program.
with open('../data/article_content.pkl', 'wb') as pickle_file:
    pickle.dump(article_metadata, pickle_file)