In [1]:
from bs4 import BeautifulSoup
import requests
import newspaper
import re

In [2]:
def get_page_html(url):
    return BeautifulSoup(requests.get(url).text, "lxml")

In [3]:
def get_article_recommendations(next_article):
    article_html = get_page_html(next_article)
    recommended_for_you = article_html.find("div",attrs={'id':'article-rec'})
    assert recommended_for_you is not None
    if len(recommended_for_you) > 0:
        articles = recommended_for_you.find_all("a")
        articles = ["https://www.spectrum.ieee.org" + a["href"] for a in articles]
    else:
        articles = []
    return articles

def is_article_excluded(url):
    is_webinar = re.search("/webinar/", url) is not None
    is_whitepaper = re.search("/whitepaper/", url) is not None
    return is_webinar or is_whitepaper

In [6]:
test_url = "https://www.spectrum.ieee.org/riskfactor/computing/software/new-software-system-snags-tsbs-online-and-mobile-banking-customers-in-uk"
article_html = get_page_html(test_url)

In [12]:
recommended_for_you = article_html.find("div",attrs={'id':'article-rec'})
articles = recommended_for_you.find_all("a")
articles = ["https://www.spectrum.ieee.org" + a["href"] for a in articles]
articles

['https://www.spectrum.ieee.org/riskfactor/computing/software/michigans-midas-unemployment-system-algorithm-alchemy-that-created-lead-not-gold',
 'https://www.spectrum.ieee.org/transportation/self-driving/the-big-problem-with-selfdriving-cars-is-people',
 'https://www.spectrum.ieee.org/computing/software/software-the-invisible-technology']

In [4]:
ieee_spectrum = newspaper.build("https://www.spectrum.ieee.org/", memoize_articles = False)

In [5]:
seen_articles = []
article_urls = [a.url for a in ieee_spectrum.articles]
article_urls = [a for a in article_urls]

In [6]:
while len(article_urls) > 0:
    print("There are {0} unprocessed articles and {1} that have been seen.".format(len(article_urls), len(seen_articles)))
    all_articles = set(seen_articles).union(set(article_urls))
    
    next_article = article_urls.pop(0)
    print("Processing page " + next_article)
    try:
        new_articles = get_article_recommendations(next_article)
    except AssertionError:
        print("***No recommendations in this article - moving on...***")
    else:
        seen_articles.append(next_article)
        if len(new_articles) > 0:
            new_articles = [na for na in new_articles if na not in all_articles]
        article_urls.extend(new_articles)

There are 275 unprocessed articles and 0 that have been seen.
Processing page https://www.spectrum.ieee.org/semiconductors/devices/exabytes-in-a-test-tube-the-case-for-dna-data-storage
There are 277 unprocessed articles and 1 that have been seen.
Processing page https://www.spectrum.ieee.org/aerospace/aviation/zunum-aeros-hybrid-electric-airplane-aims-to-rejuvenate-regional-travel
There are 279 unprocessed articles and 2 that have been seen.
Processing page https://www.spectrum.ieee.org/geek-life/profiles/why-your-gps-receiver-isnt-bigger-than-a-breadbox
There are 280 unprocessed articles and 3 that have been seen.
Processing page https://www.spectrum.ieee.org/energy/policy/rebuilding-puerto-ricos-power-grid-the-inside-story
There are 281 unprocessed articles and 4 that have been seen.
Processing page https://www.spectrum.ieee.org/computing/software/forging-voices-and-faces-the-dangers-of-audio-and-video-fabrication
There are 282 unprocessed articles and 5 that have been seen.
Processi

There are 320 unprocessed articles and 37 that have been seen.
Processing page https://www.spectrum.ieee.org/view-from-the-valley/telecom/internet/zuckerberg-turns-back-to-making-facebook-about-meeting-girls
There are 321 unprocessed articles and 38 that have been seen.
Processing page https://www.spectrum.ieee.org/view-from-the-valley/at-work/start-ups/been-whining-about-facebook-how-about-you-building-something-better
There are 322 unprocessed articles and 39 that have been seen.
Processing page https://conferences.ieee.org/conferences_events/conferences/search?q=*&subsequent_q=&date=all&from=2018-03-15&to=2018-08-31&region=all&country=all&pos=0&sortorder=asc&sponsor=&sponsor_type=all&state=all&field_of_interest=all&sortfield=dates
***No recommendations in this article - moving on...***
There are 321 unprocessed articles and 39 that have been seen.
Processing page https://conferences.ieee.org/conferences_events/conferences/search?q=*&subsequent_q=&date=all&from=2018-01-01&to=2018-12-

***No recommendations in this article - moving on...***
There are 337 unprocessed articles and 61 that have been seen.
Processing page https://www.spectrum.ieee.org/video/green-tech/conservation/hightech-eavesdropping-on-the-ganges-river-dolphin
***No recommendations in this article - moving on...***
There are 336 unprocessed articles and 61 that have been seen.
Processing page https://www.spectrum.ieee.org/video/green-tech/conservation/organic-transits-enclosed-tricycle-is-half-bike-half-car
***No recommendations in this article - moving on...***
There are 335 unprocessed articles and 61 that have been seen.
Processing page https://www.spectrum.ieee.org/video/green-tech/buildings/hacking-tomatoes-at-the-worlds-greenest-greenhouse
***No recommendations in this article - moving on...***
There are 334 unprocessed articles and 61 that have been seen.
Processing page https://www.spectrum.ieee.org/video/green-tech/wind/a-field-test-for-kitebased-wind-power
***No recommendations in this arti

There are 359 unprocessed articles and 90 that have been seen.
Processing page https://www.spectrum.ieee.org/automaton/robotics/home-robots/sony-partners-with-cmu-to-develop-food-prep-and-delivery-robots
There are 360 unprocessed articles and 91 that have been seen.
Processing page https://www.spectrum.ieee.org/automaton/robotics/robotics-hardware/video-friday-playground-robotics-westworld-simtwo-circus-robot
There are 362 unprocessed articles and 92 that have been seen.
Processing page https://www.spectrum.ieee.org/view-from-the-valley/at-work/start-ups/marissa-mayers-new-startup-gets-lucky-a-lucky-building-that-is
There are 363 unprocessed articles and 93 that have been seen.
Processing page https://www.spectrum.ieee.org/automaton/robotics/industrial-robots/robots-continue-attempting-to-master-ikea-furniture-assembly
There are 365 unprocessed articles and 94 that have been seen.
Processing page https://www.spectrum.ieee.org/tech-talk/robotics/artificial-intelligence/real-dog-behavior

***No recommendations in this article - moving on...***
There are 391 unprocessed articles and 118 that have been seen.
Processing page http://standards.ieee.org/news/2018/ieee_1547-2018_standard_revision.html
***No recommendations in this article - moving on...***
There are 390 unprocessed articles and 118 that have been seen.
Processing page http://standards.ieee.org/news/2018/ieee1800-2017_revision.html
***No recommendations in this article - moving on...***
There are 389 unprocessed articles and 118 that have been seen.
Processing page https://www.spectrum.ieee.org/energy/the-smarter-grid/building-a-stronger-safer-zinc-battery
There are 388 unprocessed articles and 119 that have been seen.
Processing page https://www.spectrum.ieee.org/cars-that-think/transportation/infrastructure/mystery-brand-ev-will-offer-witricitys-wireless-charging-this-year
There are 390 unprocessed articles and 120 that have been seen.
Processing page https://www.spectrum.ieee.org/energywise/energy/nuclear/fi

There are 405 unprocessed articles and 144 that have been seen.
Processing page https://www.spectrum.ieee.org/tech-talk/telecom/security/report-nextlevel-cyberattacks-demand-data-clearinghouse
There are 407 unprocessed articles and 145 that have been seen.
Processing page https://www.spectrum.ieee.org/tech-talk/telecom/wireless/the-national-science-foundation-announces-pawr-test-platform-cities
There are 407 unprocessed articles and 146 that have been seen.
Processing page https://www.spectrum.ieee.org/tech-talk/semiconductors/optoelectronics/selfpowered-image-sensor-could-watch-you-forever
There are 409 unprocessed articles and 147 that have been seen.
Processing page https://www.spectrum.ieee.org/computing/software/using-the-labview-communications-system-design-suite-to-increase-spectral-efficiency-for-wireless-communication
There are 411 unprocessed articles and 148 that have been seen.
Processing page https://www.spectrum.ieee.org/tech-talk/computing/networks/a-lownoise-diamond-mas

There are 440 unprocessed articles and 179 that have been seen.
Processing page https://www.spectrum.ieee.org/tech-talk/semiconductors/devices/5-questions-for-hemt-inventor-takashi-mimura
There are 442 unprocessed articles and 180 that have been seen.
Processing page https://www.spectrum.ieee.org/view-from-the-valley/geek-life/history/writing-a-musical-about-silicon-valley-start-with-robert-noyce-add-a-dash-of-marissa-mayer
There are 443 unprocessed articles and 181 that have been seen.
Processing page https://www.spectrum.ieee.org/geek-life/tools-toys/dirac-highfidelity-audio-for-your-smartphone
There are 445 unprocessed articles and 182 that have been seen.
Processing page https://www.spectrum.ieee.org/view-from-the-valley/geek-life/history/quicktimes-developers-reflect-on-doing-digital-video-in-software
There are 447 unprocessed articles and 183 that have been seen.
Processing page https://www.spectrum.ieee.org/geek-life/hands-on/build-a-clock-with-lixies-the-nixietube-lookalike
The

There are 445 unprocessed articles and 203 that have been seen.
Processing page https://www.spectrum.ieee.org/biomedical/devices/biocomputer-and-memory-built-inside-living-bacteria
There are 447 unprocessed articles and 204 that have been seen.
Processing page https://www.spectrum.ieee.org/transportation/advanced-cars/my-first-year-with-a-plugin-hybrid
There are 446 unprocessed articles and 205 that have been seen.
Processing page https://www.spectrum.ieee.org/tech-talk/aerospace/aviation/nasas-newest-xplane-will-fly-with-18-electric-motors
There are 445 unprocessed articles and 206 that have been seen.
Processing page https://www.spectrum.ieee.org/aerospace/aviation/cheaper-lighter-quieter-the-electrification-of-flight-is-at-hand
There are 445 unprocessed articles and 207 that have been seen.
Processing page https://www.spectrum.ieee.org/semiconductors/design/superaccurate-gps-coming-to-smartphones-in-2018
There are 446 unprocessed articles and 208 that have been seen.
Processing page

There are 453 unprocessed articles and 244 that have been seen.
Processing page https://www.spectrum.ieee.org/the-human-os/biomedical/bionics/e-skin-lets-you-manipulate-objects-in-real-and-virtual-worlds
There are 455 unprocessed articles and 245 that have been seen.
Processing page https://www.spectrum.ieee.org/cars-that-think/transportation/mass-transit/and-now-theres-even-a-hyperloop-think-tank
There are 457 unprocessed articles and 246 that have been seen.
Processing page https://www.spectrum.ieee.org/view-from-the-valley/tech-history/cyberspace/atari-alumni-talk-about-the-tall-tales-the-told
There are 459 unprocessed articles and 247 that have been seen.
Processing page https://www.spectrum.ieee.org/view-from-the-valley/computing/hardware/nolan-bushnell-says-his-new-virtual-reality-startup-has-the-keys-to-the-holodeckand-its-portable
There are 460 unprocessed articles and 248 that have been seen.
Processing page https://www.spectrum.ieee.org/automaton/robotics/industrial-robots/xp

There are 475 unprocessed articles and 285 that have been seen.
Processing page https://www.spectrum.ieee.org/view-from-the-valley/telecom/internet/facebooks-plan-to-connect-the-world-is-not-just-social-its-political
There are 475 unprocessed articles and 286 that have been seen.
Processing page https://www.spectrum.ieee.org/telecom/internet/a-2-million-contest-seeks-solutions-to-big-internet-challenges
There are 476 unprocessed articles and 287 that have been seen.
Processing page https://www.spectrum.ieee.org/podcast/computing/networks/facebook-knows-your-friendseven-if-theyre-not-on-facebook
***No recommendations in this article - moving on...***
There are 475 unprocessed articles and 287 that have been seen.
Processing page https://www.spectrum.ieee.org/tech-talk/consumer-electronics/gaming/ces-2018-tactical-haptics-redesigns-its-magical-force-feedback-controller
There are 475 unprocessed articles and 288 that have been seen.
Processing page https://www.spectrum.ieee.org/view-from-

There are 475 unprocessed articles and 327 that have been seen.
Processing page https://www.spectrum.ieee.org/telecom/wireless/when-disaster-strikes-flying-cell-towers-could-aid-search-and-rescue
There are 475 unprocessed articles and 328 that have been seen.
Processing page https://www.spectrum.ieee.org/automaton/robotics/drones/electronic-license-plates-for-drones
There are 476 unprocessed articles and 329 that have been seen.
Processing page https://www.spectrum.ieee.org/nanoclast/consumer-electronics/portable-devices/flexible-mobile-devices-get-a-flexible-battery-made-from-nanotubes
There are 475 unprocessed articles and 330 that have been seen.
Processing page https://www.spectrum.ieee.org/aerospace/satellites/photovoltaics-in-satellites
There are 474 unprocessed articles and 331 that have been seen.
Processing page https://www.spectrum.ieee.org/nanoclast/green-tech/solar/graphene-and-perovskite-are-a-winning-combination-for-photovoltaics
There are 473 unprocessed articles and 332

KeyboardInterrupt: 

In [22]:
is_article_excluded(next_article)

True