In [1]:
from bs4 import BeautifulSoup
import requests
import newspaper
import re
import os
import pandas as pd
from collections import namedtuple
import tqdm

In [2]:
def get_page_html(url, timeout=30):
    """Download ``url`` and return it parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely, which
        would stall the whole crawl on a single unresponsive page.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``lxml`` parser.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

def get_article_recommendations(next_article):
    """Fetch an article page and collect the articles it recommends.

    Parameters
    ----------
    next_article : str
        URL of the article page to inspect.

    Returns
    -------
    (list of str, bool)
        The recommended article URLs (absolute, de-duplicated, and with
        everything rejected by ``is_article_excluded`` removed), and a flag
        that is True when the page contains no ``<div class="sponsors">``.

    Raises
    ------
    AssertionError
        If the page has no ``<div id="article-rec">`` block. Raised
        explicitly (rather than via ``assert``) so the check survives
        ``python -O``; callers that catch ``AssertionError`` keep working.
    """
    from urllib.parse import urljoin

    article_html = get_page_html(next_article)
    recommended_for_you = article_html.find("div", attrs={"id": "article-rec"})
    if recommended_for_you is None:
        raise AssertionError("no 'article-rec' block found in " + next_article)

    articles = []
    # href=True skips anchors without an href attribute, which would
    # otherwise raise KeyError on a["href"].
    for anchor in recommended_for_you.find_all("a", href=True):
        # urljoin resolves site-relative links against the site root and
        # leaves already-absolute links untouched (no double prefix).
        article_url = urljoin("https://spectrum.ieee.org", anchor["href"])
        if article_url not in articles and not is_article_excluded(article_url):
            articles.append(article_url)

    is_this_article_good = article_html.find("div", attrs={"class": "sponsors"}) is None

    return articles, is_this_article_good

# quick check to weed out obvious articles that don't fit the requirements
def is_article_excluded(url):
    """Return True when ``url`` should be skipped.

    A URL is excluded when it does not contain ``//spectrum.ieee.org/`` or
    when its path contains one of the ``/whitepaper/``, ``/static/``,
    ``/video/``, ``/webinar/`` or ``/podcast/`` segments.

    Parameters
    ----------
    url : str
        The URL to check.

    Returns
    -------
    bool
    """
    # Raw strings: the regex escapes must reach `re` untouched instead of
    # being parsed (and warned about) as invalid Python string escapes.
    is_url_wrong = re.search(r"//spectrum\.ieee\.org/", url) is None
    is_whitepaper = re.search(r"/whitepaper/", url) is not None
    is_static = re.search(r"/static/", url) is not None
    is_media = re.search(r"/video/|/webinar/|/podcast/", url) is not None
    return is_media or is_whitepaper or is_static or is_url_wrong

# figure out which category an article belongs to, for reference when we try clustering
# the articles
def get_article_type(url):
    """Return the first path segment of ``url`` that is a known category.

    Parameters
    ----------
    url : str
        An ``https://spectrum.ieee.org/...`` article URL.

    Returns
    -------
    str
        The first path segment (excluding the final one) that appears in
        ``ARTICLE_CATEGORIES``, or ``""`` when the URL does not match the
        expected pattern or none of its segments is a known category.
    """
    # Raw string so the regex escapes are not parsed as Python string escapes.
    ieee_article_regex = r"^https://spectrum\.ieee\.org/(.*)/.*?$"
    article_type_string = re.match(ieee_article_regex, url)
    if article_type_string is None:
        return ""

    article_types = article_type_string.group(1).split("/")
    # next(..., "") avoids an IndexError when no segment is a known category
    # (e.g. a blog-only path).
    return next((atype for atype in article_types if atype in ARTICLE_CATEGORIES), "")

In [3]:
# URL path segments that get_article_type recognises as article categories.
ARTICLE_CATEGORIES = [
    "aerospace",
    "at-work",
    "biomedical",
    "computing",
    "energy",
    "consumer-electronics",
    "geek-life",
    "green-tech",
    "tech-history",
    "robotics",
    "semiconductors",
    "telecom",
    "transportation",
]

# Tab-separated file in which scraped articles are kept between runs.
IEEE_ARTICLE_FILE = "article_df.csv"

In [5]:
# Load the articles saved by earlier runs (if any) and keep their URLs apart
# from newly found ones, so only the new ones have to be scraped.

if not os.path.isfile(IEEE_ARTICLE_FILE):
    # Nothing saved yet: start from an empty frame with the expected columns.
    empty_columns = {"URL":[],"Category":[],"Article_Text":[]}
    article_df = pd.DataFrame(empty_columns)
    article_df = article_df[["URL","Category","Article_Text"]]
    old_articles = []
else:
    article_df = pd.read_csv(IEEE_ARTICLE_FILE, sep = "\t")
    old_articles = article_df["URL"].tolist()

num_old_articles = len(old_articles)

In [7]:
# Build a newspaper source for the IEEE Spectrum front page; the article
# cache is turned off via memoize_articles=False.
ieee_spectrum = newspaper.build(
    "https://spectrum.ieee.org/",
    memoize_articles=False,
)

In [10]:
# Candidate URLs found on the front page.
# Strip "www." *before* comparing against the stored URLs: the crawl loop
# stores URLs without "www.", so comparing first would let "www." variants of
# already-stored articles slip through.
normalized_urls = [re.sub(r"://www\.", "://", a.url) for a in ieee_spectrum.articles]

# A set makes the membership test cheap.
old_article_set = set(old_articles)

# dict.fromkeys drops repeated URLs while keeping first-seen order.
new_urls = [
    nu
    for nu in dict.fromkeys(normalized_urls)
    if nu not in old_article_set and not is_article_excluded(nu)
]
len(new_urls)

105

In [11]:
# Crawl outward from the front-page URLs: each processed page contributes its
# recommended articles to the queue until the queue runs dry.
seen_articles = []

# Every URL that is already on file, queued, or visited. Tracking visited pages
# here (not just stored ones) stops pages that were skipped - sponsored pages
# or pages without recommendations - from being queued and fetched again.
known_urls = set(old_articles) | set(new_urls)

while len(new_urls) > 0:
    print(f"There are {len(new_urls)} unprocessed articles and {len(seen_articles)} new articles that have been stored.")

    next_article = new_urls.pop(0)
    next_article = re.sub(r"://www\.", "://", next_article)
    known_urls.add(next_article)
    print("Processing page " + next_article)
    try:
        new_articles, article_is_good = get_article_recommendations(next_article)
    except AssertionError:
        print("***No recommendations in this article - moving on...***")
        continue

    if article_is_good:
        seen_articles.append(next_article)

    # Queue only recommendations not encountered before. Iterating over
    # new_articles directly (it may be empty) avoids re-queuing a stale
    # recommendation list left over from a previous iteration.
    for recommended_url in new_articles:
        if recommended_url not in known_urls:
            known_urls.add(recommended_url)
            new_urls.append(recommended_url)
print("Done.")

There are 105 unprocessed articles and 0 new articles that have been stored.
Processing page https://spectrum.ieee.org/tech-history/silicon-revolution/chip-hall-of-fame-nvidia-nv20
There are 106 unprocessed articles and 1 new articles that have been stored.
Processing page https://spectrum.ieee.org/tech-history/silicon-revolution/chip-hall-of-fame-intel-4004-microprocessor
There are 105 unprocessed articles and 2 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/robotics-hardware/video-friday-japanese-androids-rolls-royce-microrobots-robotic-racecar
There are 105 unprocessed articles and 3 new articles that have been stored.
Processing page https://spectrum.ieee.org/view-from-the-valley/biomedical/diagnostics/new-wearable-sensor-detects-stress-hormone-in-sweat
There are 107 unprocessed articles and 4 new articles that have been stored.
Processing page https://spectrum.ieee.org/the-human-os/biomedical/bionics/flying-a-drone-with-your-body
T

There are 117 unprocessed articles and 35 new articles that have been stored.
Processing page https://spectrum.ieee.org/view-from-the-valley/geek-life/profiles/ee-turned-artist-lights-up-salesforce-tower
There are 116 unprocessed articles and 36 new articles that have been stored.
Processing page https://spectrum.ieee.org/geek-life/hands-on/build-your-own-google-neural-synthesizer
There are 115 unprocessed articles and 37 new articles that have been stored.
Processing page https://spectrum.ieee.org/view-from-the-valley/geek-life/reviews/reliefbands-motion-sickness-wearable-take-2-much-prettier-but-not-perfect
There are 114 unprocessed articles and 38 new articles that have been stored.
Processing page https://spectrum.ieee.org/energywise/energy/nuclear/a-double-first-in-china-for-advanced-nuclear-reactors
There are 114 unprocessed articles and 39 new articles that have been stored.
Processing page https://spectrum.ieee.org/transportation/advanced-cars/protean-electrics-inwheel-motors-c

There are 128 unprocessed articles and 75 new articles that have been stored.
Processing page https://spectrum.ieee.org/tech-history/silicon-revolution/chip-hall-of-fame-philips-tda7000-fm-receiver
There are 127 unprocessed articles and 76 new articles that have been stored.
Processing page https://spectrum.ieee.org/semiconductors/processors/chip-hall-of-fame-rca-cdp-1802
There are 126 unprocessed articles and 77 new articles that have been stored.
Processing page https://spectrum.ieee.org/tech-history/silicon-revolution/chip-hall-of-fame-photobit-pb100
There are 127 unprocessed articles and 78 new articles that have been stored.
Processing page https://spectrum.ieee.org/tech-talk/telecom/security/how-a-california-banker-received-credit-for-his-unbreakable-cryptography-130-years-later
There are 126 unprocessed articles and 79 new articles that have been stored.
Processing page https://spectrum.ieee.org/the-human-os/biomedical/ethics/synthetic-biology-behemoth-aims-to-police-its-own-ind

There are 128 unprocessed articles and 113 new articles that have been stored.
Processing page https://spectrum.ieee.org/nanoclast/computing/hardware/intels-new-path-to-quantum-computing
There are 128 unprocessed articles and 114 new articles that have been stored.
Processing page https://spectrum.ieee.org/riskfactor/computing/it/australias-digital-transformation-stumbles-badly
There are 127 unprocessed articles and 115 new articles that have been stored.
Processing page https://spectrum.ieee.org/computing/networks/illinois-vs-dubai-two-experiments-bring-blockchains-to-government
There are 128 unprocessed articles and 116 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/drones/eth-zurich-omnicopter-plays-fetch
There are 129 unprocessed articles and 117 new articles that have been stored.
Processing page https://spectrum.ieee.org/at-work/tech-careers/grant-imahara-debunker-in-the-box
There are 129 unprocessed articles and 118 new articles 

There are 108 unprocessed articles and 152 new articles that have been stored.
Processing page https://spectrum.ieee.org/geek-life/profiles/ashok-gadgil-the-humanitarian-inventor
There are 109 unprocessed articles and 153 new articles that have been stored.
Processing page https://spectrum.ieee.org/cars-that-think/transportation/safety/an-autonomous-cowllision-alert-system-for-driving-in-india
There are 111 unprocessed articles and 154 new articles that have been stored.
Processing page https://spectrum.ieee.org/energy/renewables/a-skeptic-looks-at-alternative-energy
There are 111 unprocessed articles and 155 new articles that have been stored.
Processing page https://spectrum.ieee.org/tech-talk/green-tech/buildings/urban-organics-starts-up-aquaponics-system
There are 110 unprocessed articles and 156 new articles that have been stored.
Processing page https://spectrum.ieee.org/green-tech/buildings/the-indoor-aquaponics-farm
There are 109 unprocessed articles and 157 new articles that h

There are 94 unprocessed articles and 194 new articles that have been stored.
Processing page https://spectrum.ieee.org/energy/renewables/moores-curse
There are 94 unprocessed articles and 195 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/industrial-robots/irobot-spinoff-ava-robotics-introduces-autonomous-telepresence-robot
There are 95 unprocessed articles and 196 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/industrial-robots/ces-2018-suitable-tech-introduces-beampro-2-telepresence-platform
There are 97 unprocessed articles and 197 new articles that have been stored.
Processing page https://spectrum.ieee.org/transportation/self-driving/lidar-equipped-autonomous-wheelchairs-roll-out-in-singapore-and-japan
There are 97 unprocessed articles and 198 new articles that have been stored.
Processing page https://spectrum.ieee.org/consumer-electronics/portable-devices/do-police-body-cameras-r

There are 93 unprocessed articles and 233 new articles that have been stored.
Processing page https://spectrum.ieee.org/computing/embedded-systems/lowcost-sbcs-are-ideal-for-industrial-and-medical-applications
There are 92 unprocessed articles and 233 new articles that have been stored.
Processing page https://spectrum.ieee.org/energy/fossil-fuels/gas-and-power-grid-link-means-la-could-be-gigawatts-short-this-summer
There are 91 unprocessed articles and 234 new articles that have been stored.
Processing page https://spectrum.ieee.org/semiconductors/processors/lessons-from-the-1-billion-intel-tradesecret-theft
There are 90 unprocessed articles and 235 new articles that have been stored.
Processing page https://spectrum.ieee.org/semiconductors/materials/packages-go-vertical
There are 90 unprocessed articles and 236 new articles that have been stored.
Processing page https://spectrum.ieee.org/nanoclast/computing/hardware/qubits-quantum-computing-with-run-off-the-mill-cmos-transistors
Ther

There are 79 unprocessed articles and 271 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/home-robots/toyotas-new-human-support-robot-gives-disabled-humans-a-hand-and-an-arm
There are 78 unprocessed articles and 272 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/robotics-hardware/speedy-inchworm-robot-only-needs-one-motor
There are 77 unprocessed articles and 273 new articles that have been stored.
Processing page https://spectrum.ieee.org/tech-talk/computing/software/googles-deep-mind-boosts-memory-to-navigate-london-underground
There are 77 unprocessed articles and 274 new articles that have been stored.
Processing page https://spectrum.ieee.org/cars-that-think/transportation/human-factors/deep-learning-and-google-street-view-can-predict-neighborhood-politics-from-parked-cars
There are 77 unprocessed articles and 275 new articles that have been stored.
Processing page https://spectrum.i

There are 59 unprocessed articles and 308 new articles that have been stored.
Processing page https://spectrum.ieee.org/nanoclast/semiconductors/nanotechnology/nanosensor-detects-prostate-cancer-in-its-early-stages
There are 58 unprocessed articles and 309 new articles that have been stored.
Processing page https://spectrum.ieee.org/nanoclast/semiconductors/nanotechnology/graphene-nanosensor-tattooed-to--teeth-detects-bacteria
There are 57 unprocessed articles and 310 new articles that have been stored.
Processing page https://spectrum.ieee.org/consumer-electronics/portable-devices/learn-new-skills-with-superhuman-speed
There are 56 unprocessed articles and 311 new articles that have been stored.
Processing page https://spectrum.ieee.org/consumer-electronics/portable-devices/wearable-computers-will-transform-language
There are 55 unprocessed articles and 312 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/home-robots/ces-2014-parrot-unve

There are 38 unprocessed articles and 349 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/drones/quadrotor-with-tilting-propellers-can-twist-in-midair
There are 37 unprocessed articles and 350 new articles that have been stored.
Processing page https://spectrum.ieee.org/tech-talk/telecom/internet/citi-launches-blockchainbased-payments-service-with-nasdaq-for-private-equity
There are 38 unprocessed articles and 351 new articles that have been stored.
Processing page https://spectrum.ieee.org/computing/networks/a-blockchain-currency-that-beats-bitcoin-on-privacy
There are 39 unprocessed articles and 352 new articles that have been stored.
Processing page https://spectrum.ieee.org/computing/software/the-trading-test
There are 41 unprocessed articles and 353 new articles that have been stored.
Processing page https://spectrum.ieee.org/geek-life/hands-on/how-to-build-a-homebrew-radon-detector
There are 40 unprocessed articles and 354 new arti

There are 27 unprocessed articles and 389 new articles that have been stored.
Processing page https://spectrum.ieee.org/computing/software/simulation-apps-bring-stem-to-life
There are 26 unprocessed articles and 389 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/robotics-hardware/video-friday-mit-origami-robots-sphero-mini-headless-robotic-cat
There are 25 unprocessed articles and 390 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/robotics-hardware/video-friday-agility-robotics-pancake-robots-metallica-drone-show
There are 24 unprocessed articles and 391 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/home-robots/irobot-scooba-230-how-it-works
There are 23 unprocessed articles and 392 new articles that have been stored.
Processing page https://spectrum.ieee.org/automaton/robotics/home-robots/irobot-intros-new-scooba-390-ducky-not-included


Done.


In [18]:
# Record type for one scraped article; the fields mirror the columns of article_df.
ArticleTuple = namedtuple(
    "ArticleTuple",
    ["URL", "Category", "Article_Text"],
)

# Accumulator for the records produced by the scraping cells.
list_of_article_tuples = list()

In [None]:
# Download and parse every article collected by the crawl and record its
# URL, category and text.
# URLs that already have a record are skipped, so re-running this cell (for
# example after an interruption) neither re-downloads nor duplicates them.
already_scraped = {record.URL for record in list_of_article_tuples}

for article_url in seen_articles:
    if article_url in already_scraped:
        continue
    print(article_url)
    category = get_article_type(article_url)
    article = newspaper.Article(article_url)
    article.download()
    article.parse()
    article_tuple = ArticleTuple(URL = article_url, Category = category, Article_Text = article.text)
    list_of_article_tuples.append(article_tuple)
    already_scraped.add(article_url)

In [27]:
# Resumable variant of the scraping loop: each URL is removed from
# seen_articles once handled, so re-running the cell continues where it stopped.
# URLs that already have a record are dropped from the queue without being
# downloaded again.
already_scraped = {record.URL for record in list_of_article_tuples}

for _ in tqdm.trange(len(seen_articles)):
    article_url = seen_articles[0]
    if article_url not in already_scraped:
        # The category must be derived from *this* URL; reusing a value left
        # over from another cell would mislabel every article scraped here.
        category = get_article_type(article_url)
        article = newspaper.Article(article_url)
        article.download()
        article.parse()
        article_tuple = ArticleTuple(URL = article_url, Category = category, Article_Text = article.text)
        list_of_article_tuples.append(article_tuple)
        already_scraped.add(article_url)
    # Pop only after the URL has been handled, so a failure above leaves it queued.
    seen_articles.pop(0)


  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:01<00:04,  1.56s/it][A
 50%|█████     | 2/4 [00:02<00:02,  1.30s/it][A
 75%|███████▌  | 3/4 [00:04<00:01,  1.50s/it][A
100%|██████████| 4/4 [00:05<00:00,  1.57s/it][A
[A

In [32]:
# Turn the scraped records into a DataFrame, one row per record.
new_articles = pd.DataFrame(data=list_of_article_tuples)

In [37]:
# Append the newly scraped rows to the existing ones.
# This cell reassigns article_df from itself, so running it twice would append
# the same rows again; dropping repeated URLs (keeping the first occurrence)
# makes it safe to re-run.
article_df = (
    pd.concat([article_df, new_articles], axis = 0)
    .drop_duplicates(subset = "URL", keep = "first")
    .reset_index(drop = True)
)

In [38]:
# Show a bounded preview rather than the whole frame.
article_df.head(10)

Unnamed: 0,URL,Category,Article_Text
0,https://spectrum.ieee.org/geek-life/profiles/w...,geek-life,Photo: Gregg Segal\n\nAs I drive through the v...
1,https://spectrum.ieee.org/view-from-the-valley...,geek-life,Photo: Alfred Eisenstaedt/The LIFE Picture Col...
2,https://spectrum.ieee.org/view-from-the-valley...,geek-life,"Photo: Trimble\n\nNext month, Brad Parkinson r..."
3,https://spectrum.ieee.org/tech-talk/semiconduc...,semiconductors,Photo: David McNew/AFP/Getty Images Show-goers...
4,https://spectrum.ieee.org/aerospace/space-flig...,aerospace,Illustration: Equinox Graphics/Surrey Space Ce...
5,https://spectrum.ieee.org/view-from-the-valley...,at-work,Photo: iStockphoto\n\nColleges around the coun...
6,https://spectrum.ieee.org/automaton/robotics/d...,robotics,Photo: Evan Ackerman/IEEE Spectrum\n\nQuadroto...
7,https://spectrum.ieee.org/the-human-os/biomedi...,biomedical,Photo: Lillie Paquette/MIT School of Engineeri...
8,https://spectrum.ieee.org/nanoclast/semiconduc...,semiconductors,Illustration: Philip Krantz\n\nGraphene might ...
9,https://spectrum.ieee.org/view-from-the-valley...,computing,Illustrations: iStockphoto and Shutterstock\n\...


In [39]:
# Write the combined table back to the article file as tab-separated text,
# without the index column.
article_df.to_csv(
    IEEE_ARTICLE_FILE,
    sep="\t",
    index=False,
)