In [None]:
from bs4 import BeautifulSoup
import requests
import newspaper
import re
import os
import pandas as pd
from collections import namedtuple
import tqdm

In [None]:
def get_page_html(url):
    """Fetch ``url`` and return its body parsed into a BeautifulSoup tree (lxml parser)."""
    response = requests.get(url)
    page_markup = response.text
    return BeautifulSoup(page_markup, "lxml")

def get_article_recommendations(next_article):
    """
    Extract the recommended article links from an article page.

    Parameters
    ----------
    next_article : str
        URL of the article page to fetch.

    Returns
    -------
    tuple (list of str, bool)
        The absolute URLs of the recommended articles that are not rejected
        by ``is_article_excluded``, and a flag that is True when the page
        contains no ``<div class="sponsors">`` element.

    Raises
    ------
    AssertionError
        If the page contains no ``<div id="article-rec">`` element.
    """
    from urllib.parse import urljoin

    article_html = get_page_html(next_article)
    recommended_for_you = article_html.find("div", attrs={"id": "article-rec"})
    # Raised explicitly rather than through an ``assert`` statement so the
    # check still runs when Python is started with -O; callers catch
    # AssertionError to detect pages without recommendations.
    if recommended_for_you is None:
        raise AssertionError("no article-rec block found in " + next_article)

    # href=True skips anchors that carry no href attribute (indexing them
    # would raise KeyError).  urljoin resolves relative hrefs against the site
    # root and leaves already-absolute hrefs untouched, instead of gluing a
    # second scheme and host in front of them.
    articles = [
        urljoin("https://spectrum.ieee.org", a["href"])
        for a in recommended_for_you.find_all("a", href=True)
    ]

    is_not_sponsored = article_html.find("div", attrs={"class": "sponsors"}) is None

    return [a for a in articles if not is_article_excluded(a)], is_not_sponsored

# quick check to weed out obvious articles that don't fit the requirements
def is_article_excluded(url):
    """
    Check the URL for a few quick things that indicate that this
    article doesn't fit what we're looking for.

    Parameters
    ----------
    url : str
        The URL to screen.

    Returns
    -------
    bool
        True when the URL does not contain ``//spectrum.ieee.org/`` or when
        it contains one of the path segments ``/whitepaper/``, ``/static/``,
        ``/video/``, ``/webinar/`` or ``/podcast/``; False otherwise.
    """
    # Raw strings: "\." inside a plain string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    is_url_wrong = re.search(r"//spectrum\.ieee\.org/", url) is None
    is_whitepaper = re.search(r"/whitepaper/", url) is not None
    is_static = re.search(r"/static/", url) is not None
    is_media = re.search(r"/video/|/webinar/|/podcast/", url) is not None
    return is_media or is_whitepaper or is_static or is_url_wrong

# figure out which category an article belongs to, for reference when we try clustering
# the articles
def get_article_type(url, categories=None):
    """
    Determine the category of the article from its URL path.

    Parameters
    ----------
    url : str
        Article URL.
    categories : collection of str, optional
        Recognised category names.  Defaults to the module-level
        ``ARTICLE_CATEGORIES``.

    Returns
    -------
    str
        The first path segment (reading left to right, not counting whatever
        follows the final "/") that is a recognised category.  An empty
        string is returned when the URL does not have the form
        ``https://spectrum.ieee.org/<path>/<tail>`` or when none of its
        segments is a recognised category.
    """
    if categories is None:
        categories = ARTICLE_CATEGORIES

    ieee_article_regex = r"^https://spectrum\.ieee\.org/(.*)/.*?$"
    article_type_string = re.match(ieee_article_regex, url)
    if article_type_string is None:
        return ""

    # group(1) is everything between the host and the last "/" of the URL.
    article_types = article_type_string.group(1).split("/")
    # The "" default covers URLs with no recognised segment; indexing [0] on
    # the filtered list raised IndexError in that case.
    return next((atype for atype in article_types if atype in categories), "")

In [None]:
# Category names that get_article_type looks for among an article URL's path segments.
ARTICLE_CATEGORIES = [
    "aerospace",
    "at-work",
    "biomedical",
    "computing",
    "energy",
    "consumer-electronics",
    "geek-life",
    "green-tech",
    "tech-history",
    "robotics",
    "semiconductors",
    "telecom",
    "transportation",
]

# File that previously collected articles are read from and that results are written to.
IEEE_ARTICLE_FILE = "article_df.csv"

In [None]:
# Start from an empty three-column table unless an article file from an
# earlier run is present, in which case load it so new rows can be appended.
# The intent was to let the collection grow over many runs.
if not os.path.isfile(IEEE_ARTICLE_FILE):
    empty_columns = {"URL": [], "Category": [], "Article_Text": []}
    article_df = pd.DataFrame(empty_columns)[["URL", "Category", "Article_Text"]]
    old_articles = []
else:
    # The file is tab-separated despite its .csv name.
    article_df = pd.read_csv(IEEE_ARTICLE_FILE, sep="\t")
    old_articles = article_df["URL"].tolist()

In [None]:
# Build a newspaper source for IEEE Spectrum; the articles it discovers are
# read from ieee_spectrum.articles further down.
# memoize_articles=False presumably turns off newspaper's cache of articles
# returned by earlier builds -- confirm against the newspaper docs.
ieee_spectrum = newspaper.build(
    "https://spectrum.ieee.org/",
    memoize_articles=False,
)

In [None]:
# Collect the article urls into a list, but filter out old, duplicate and
# other problematic ones.
# The "www." prefix is stripped *before* comparing against old_articles:
# URLs are stored without it, so filtering first would let a "www." link to
# an already-stored article slip through.
new_urls = [re.sub(r"://www\.", "://", a.url) for a in ieee_spectrum.articles]
# dict.fromkeys drops repeated URLs while keeping first-seen order, so no
# page is crawled (and stored) twice.
new_urls = list(dict.fromkeys(new_urls))
stored_urls = set(old_articles)
new_urls = [
    nu for nu in new_urls
    if nu not in stored_urls and not is_article_excluded(nu)
]
len(new_urls)

In [None]:
# Check each article found by newspaper for recommendations.  Anything new should be
# added to the list of articles to check.  Continue until all new articles are
# checked.

seen_articles = []

# Every URL that is already stored, already queued, or already visited.  It is
# kept as one persistent set (instead of being rebuilt on each pass) so that
# pages which were visited but not stored -- sponsored pages, or pages without
# a recommendation block -- are never queued a second time.
all_articles = set(old_articles) | set(new_urls)

while len(new_urls) > 0:
    print(f"There are {len(new_urls)} unprocessed articles and {len(seen_articles)} new articles that have been stored.")

    next_article = new_urls.pop(0)
    next_article = re.sub(r"://www\.", "://", next_article)
    all_articles.add(next_article)
    print("Processing page " + next_article)
    try:
        new_articles, article_is_good = get_article_recommendations(next_article)
    except AssertionError:
        print("***No recommendations in this article - moving on...***")
        continue

    if article_is_good:
        seen_articles.append(next_article)
    # Queue only the recommendations found on *this* page.  Marking each one
    # as known right away also keeps a link that appears twice on the page
    # from being queued twice.
    for recommended_article in new_articles:
        if recommended_article not in all_articles:
            all_articles.add(recommended_article)
            new_urls.append(recommended_article)
print("Done.")

In [None]:
# Download articles and extract the text.
# NOTE: article.download() intermittently failed for unclear reasons; the loop below
# is structured so that this cell can simply be re-run after a failure: a URL is
# removed from seen_articles only once its text has been stored, and results
# gathered by earlier runs of this cell are kept.

ArticleTuple = namedtuple("ArticleTuple",["URL","Category","Article_Text"])

# Create the result list only on the first run.  Resetting it on every run
# would throw away the articles fetched before a failed download.
if "list_of_article_tuples" not in globals():
    list_of_article_tuples = []
# URLs whose text is already stored, so a rebuilt seen_articles list does not
# produce duplicate rows.
downloaded_urls = {stored.URL for stored in list_of_article_tuples}

for _ in tqdm.trange(len(seen_articles)):
    # Always work on the head of the list; it is popped only after success.
    article_url = seen_articles[0]
    if article_url not in downloaded_urls:
        category = get_article_type(article_url)
        article = newspaper.Article(article_url)
        article.download()
        article.parse()
        article_tuple = ArticleTuple(URL = article_url, Category = category, Article_Text = article.text)
        list_of_article_tuples.append(article_tuple)
        downloaded_urls.add(article_url)
    seen_articles.pop(0)

In [None]:
# Convert to a dataframe and write to a (tab-separated) csv.
# Passing the column names explicitly keeps the frame well-formed even when no
# article was downloaded.
new_articles = pd.DataFrame(list_of_article_tuples, columns = ["URL", "Category", "Article_Text"])
# article_df is overwritten with the concatenation, so running this cell twice
# would append the same rows again; dropping repeated URLs (keeping the first
# occurrence) makes the cell safe to re-run.
article_df = (
    pd.concat([article_df, new_articles], axis = 0)
    .drop_duplicates(subset = "URL", keep = "first")
    .reset_index(drop = True)
)
article_df.to_csv(IEEE_ARTICLE_FILE, sep = "\t", index = False)