In [1]:
# some potentially interesting clusting info at https://nlp.stanford.edu/IR-book/html/htmledition/flat-clustering-1.html
# filter out "Photo/Illustration by" up to pair of \n
# split on longer dash character
import re
import newspaper
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def get_article_type(url):
    ieee_article_regex = "^https://www\.spectrum\.ieee\.org/(.*)/.*?$"
    article_type_string = re.match(ieee_article_regex, url)
    if article_type_string is None:
        return []
    else:
        article_types = article_type_string.group(1).split("/")
        article_categories = [atype for atype in article_types if atype in ARTICLE_CATEGORIES]
        return article_categories

In [3]:
ARTICLE_CATEGORIES = ["aerospace","at-work","biomedical","computing","energy","consumer-electronics",
                      "geek-life","green-tech","tech-history","robotics","semiconductors","telecom","transportation"]

In [2]:
ieee_spectrum = newspaper.build("https://www.spectrum.ieee.org/", memoize_articles = False)

In [6]:
article_urls = [a.url for a in ieee_spectrum.articles]
article_types = list(map(get_article_type, article_urls))
articles = [a for a,t in zip(article_urls, article_types) if len(t) > 0]
categories = [at[0] for at in article_types if len(at) > 0]

In [7]:
target_articles = [newspaper.Article(url) for url in articles]

In [8]:
for t in target_articles:
    print(t.url)
    t.download()
    t.parse()

https://www.spectrum.ieee.org/riskfactor/computing/software/how-much-trouble-is-the-new-us-defense-department-electronic-health-records-program-in
https://www.spectrum.ieee.org/nanoclast/semiconductors/memory/crossbar-pushes-reram-into-embedded-ai
https://www.spectrum.ieee.org/video/robotics/robotics-software/robots-learn-to-speak-body-language
https://www.spectrum.ieee.org/video/computing/networks/what-happens-in-your-brain-when-you-learn-a-song
https://www.spectrum.ieee.org/video/semiconductors/nanotechnology/how-will-we-go-beyond-moores-law-experts-weigh-in
https://www.spectrum.ieee.org/video/computing/software/how-to-detect-a-gps-spoof-on-a-superyacht
https://www.spectrum.ieee.org/video/geek-life/profiles/tech-meets-art-capturing-life-with-the-eyes-of-a-machine
https://www.spectrum.ieee.org/video/robotics/robotics-software/how-to-build-a-moral-robot
https://www.spectrum.ieee.org/computing/hardware/quantum-computers-strive-to-break-out-of-the-lab
https://www.spectrum.ieee.org/comput

https://www.spectrum.ieee.org/energywise/energy/renewables/usmexico-wall-wont-stop-crossborder-power-push
https://www.spectrum.ieee.org/green-tech/conservation/why-we-must-fight-for-the-right-to-repair-our-electronics
https://www.spectrum.ieee.org/energywise/energy/the-smarter-grid/should-a-devastated-caribbean-leap-forward-to-renewable-power-and-microgrids
https://www.spectrum.ieee.org/energywise/green-tech/wind/rechargeable-wind-power-over-the-open-ocean
https://www.spectrum.ieee.org/tech-talk/aerospace/satellites/satellite-radar-sees-invisible-changes-in-groundwater-levels
https://www.spectrum.ieee.org/view-from-the-valley/at-work/start-ups/how-chip-design-can-teach-us-to-build-better-hospitals
https://www.spectrum.ieee.org/the-human-os/biomedical/diagnostics/mouth-sensor-can-measure-the-salt-in-every-potato-chip-you-eat
https://www.spectrum.ieee.org/the-human-os/biomedical/diagnostics/5-million-prize-for-origin-of-genetic-code
https://www.spectrum.ieee.org/biomedical/devices/math-e

https://www.spectrum.ieee.org/tech-history/heroic-failures/the-european-union-at-60-it-should-be-happy-but-it-isnt
https://www.spectrum.ieee.org/view-from-the-valley/at-work/tech-careers/the-numbers-of-women-in-tech-rise-and-fall-but-sexual-harassment-is-ever-present
https://www.spectrum.ieee.org/view-from-the-valley/tech-history/silicon-revolution/behind-the-scenes-at-xerox-parcs-futures-day40-years-ago
https://www.spectrum.ieee.org/tech-history/dawn-of-electronics/the-shocking-truth-behind-arnold-nordsiecks-differential-analyzer
https://www.spectrum.ieee.org/at-work/innovation/the-language-we-invented-as-we-invented-the-future
https://www.spectrum.ieee.org/view-from-the-valley/tech-history/silicon-revolution/the-xerox-alto-struts-its-stuff-on-its-40th-birthday
https://www.spectrum.ieee.org/automaton/robotics/robotics-software/wizards-of-ros-willow-garage-and-the-making-of-the-robot-operating-system
https://www.spectrum.ieee.org/automaton/robotics/robotics-software/the-origin-story-of

https://www.spectrum.ieee.org/video/geek-life/hands-on/constructing-a-better-bike-light
https://www.spectrum.ieee.org/video/transportation/mass-transit/the-most-interesting-thing-about-stephen-colberts-monologues-is-the-wall-behind-him
https://www.spectrum.ieee.org/video/geek-life/profiles/extended-directors-cut-ted-nelson-on-what-modern-programmers-can-learn-from-the-past
https://www.spectrum.ieee.org/video/geek-life/profiles/ted-nelson-on-what-modern-programmers-can-learn-from-the-past
https://www.spectrum.ieee.org/video/geek-life/hands-on/build-a-cordless-soldering-iron
https://www.spectrum.ieee.org/video/geek-life/hands-on/testing-diy-digital-video-for-fpv-flying
https://www.spectrum.ieee.org/geek-life/history/a-man-in-a-hurry-claude-shannons-new-york-years
https://www.spectrum.ieee.org/tech-history/cyberspace/social-medias-dialup-ancestor-the-bulletin-board-system
https://www.spectrum.ieee.org/computing/software/linux-at-25-qa-with-linus-torvalds
https://www.spectrum.ieee.org/tran

ArticleException: 

In [11]:
article_texts = [t.text for t in target_articles]
article_urls = [t.url for t in target_articles]
article_text_df = pd.DataFrame({"URL":article_urls, "ArticleText":article_texts})
article_text_df.to_csv("article_texts.csv", sep = "\t")

In [35]:
article_text_df = pd.read_csv("article_texts.csv", sep = "\t")
article_texts = list(article_text_df["ArticleText"].dropna())
article_urls = list(article_text_df["URL"])[:234]
article_types = list(map(get_article_type, article_urls))
articles = [a for a,t in zip(article_urls, article_types) if len(t) > 0]
categories = [at[0] for at in article_types if len(at) > 0]

In [43]:
article_texts[13]

'Advertisement\n\nNow that 5G is getting closer to deployment, what are the killer apps going to be that push 5G forward into broad deployment? Will 5G be influential enough to push the technology to the elite status of a general purpose technology? This article aims to explore some of the hottest emerging 5G technologies and the potential impact of 5G.\n\nContents\n\nA Look at the Future of 5G\n\nEmerging 5G Applications: Virtual Reality and Augmented Reality\n\nAutonomous Vehicles\n\nThe Industrial Internet of Things\n\nDownload the white paper.'

In [23]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [6]:
def clean_article_text(article_text):
    # remove photo captions
    photo_caption_regex = "\n\n(Photo|Gif):.*?\n\n"
    photo_at_start_of_article_regex = "^Photo:.*?\n\n"
    article_text = re.sub(photo_caption_regex, "\n\n", article_text)
    article_text = re.sub(photo_at_start_of_article_regex, "", article_text)
    article_text = re.sub("\n\n"," ",article_text)
    article_text = article_text.lower()
    return article_text

stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    tokens = [t for t in tokens if re.search]
    stems = [stemmer.stem(t) for t in tokens]
    return stems

In [7]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=1500,
                                 min_df=0.1, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

In [24]:
article_texts = [clean_article_text(a) for a in article_texts]
tfidf_matrix = tfidf_vectorizer.fit_transform(article_texts)
tfidf_matrix.shape

(234, 972)

In [25]:
from sklearn.cluster import KMeans

In [26]:
km = KMeans(n_clusters=13)
km.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=13, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [38]:
clusters = {"URL":article_urls, "Cluster":km.labels_.tolist(), "Category":categories}
clusters = pd.DataFrame(clusters)
pd.crosstab(clusters.Category, clusters.Cluster)

Cluster,0,1,2,3,4,5,6,7,8,9,10,11,12
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
aerospace,0,2,10,1,1,1,0,0,0,6,0,2,0
at-work,0,2,0,6,0,0,1,1,4,7,0,1,0
biomedical,0,2,0,12,1,1,0,2,1,2,1,1,2
computing,1,0,1,5,0,0,1,2,6,1,2,1,1
consumer-electronics,0,1,0,1,0,0,0,0,0,0,0,1,0
energy,0,1,0,0,0,2,0,0,0,1,0,10,0
geek-life,0,1,3,7,0,0,1,0,4,3,1,0,1
green-tech,0,0,0,3,1,1,0,0,1,1,2,4,1
robotics,0,0,0,0,23,0,0,0,1,1,2,0,2
semiconductors,0,2,1,5,0,1,0,0,2,0,0,0,3
