In [17]:
# some potentially interesting clusting info at https://nlp.stanford.edu/IR-book/html/htmledition/flat-clustering-1.html
# filter out "Photo/Illustration by" up to pair of \n
# split on longer dash character
# http://brandonrose.org/clustering
import re
import numpy as np
import pandas as pd

In [12]:
def get_article_type(url):
    ieee_article_regex = "^https://www\.spectrum\.ieee\.org/(.*)/.*?$"
    article_type_string = re.match(ieee_article_regex, url)
    if article_type_string is None:
        return ""
    else:
        article_types = article_type_string.group(1).split("/")
        article_categories = [atype for atype in article_types if atype in ARTICLE_CATEGORIES]
        return article_categories[0]

ARTICLE_CATEGORIES = ["aerospace","at-work","biomedical","computing","energy","consumer-electronics",
                      "geek-life","green-tech","tech-history","robotics","semiconductors","telecom","transportation"]

In [24]:
article_df = pd.read_csv("article_df.csv", sep = "\t")

In [25]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [79]:
def clean_article_text(article_text):
    # remove photo captions
    photo_caption_regex = "\n\n(Photo|Gif):.*?\n\n"
    photo_at_start_of_article_regex = "^Photo:.*?\n\n"
    article_text = re.sub(photo_caption_regex, "\n\n", article_text)
    article_text = re.sub(photo_at_start_of_article_regex, "", article_text)
    article_text = re.sub("\n\n"," ",article_text)
    article_text = article_text.lower()
    return article_text

stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    splitting_characters = ["—","-"]
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    for ch in splitting_characters:
        tokens = [el for word in tokens for el in word.split(ch)]
    tokens = [re.sub("[^a-zA-Z']", "", t) for t in tokens]
    tokens = [t for t in tokens if re.search]
    stems = [stemmer.stem(t) for t in tokens]
    return stems

In [78]:
tokenize_and_stem(article_df.iloc[0,2])

['Photo',
 ':',
 'Gregg',
 'Segal',
 'As',
 'I',
 'drive',
 'through',
 'the',
 'vineyard',
 'covered',
 'hills',
 'of',
 'San',
 'Luis',
 'Obispo',
 ',',
 'Calif.',
 ',',
 'the',
 'tiny',
 'Global',
 'Positioning',
 'System',
 'receiver',
 'in',
 'my',
 'phone',
 'works',
 'with',
 'Google',
 'Maps',
 'to',
 'alert',
 'me',
 'to',
 'upcoming',
 'turns',
 '.',
 'The',
 'app',
 'reassures',
 'me',
 'that',
 'I’ll',
 'arrive',
 'at',
 'my',
 'destination',
 'on',
 'time',
 ',',
 'in',
 'spite',
 'of',
 'a',
 'short',
 'delay',
 'for',
 'construction',
 '.',
 'How',
 'different',
 'this',
 'trip',
 'would',
 'have',
 'been',
 'in',
 'the',
 'pre',
 'GPS',
 'era',
 ',',
 'when',
 'the',
 'obscured',
 'road',
 'sign',
 'at',
 'one',
 'intersection',
 'would',
 'likely',
 'have',
 'sent',
 'me',
 'off',
 'track',
 '.',
 'I',
 'have',
 'a',
 'weak',
 'sense',
 'of',
 'direction',
 ',',
 'and',
 'getting',
 'lost',
 'or',
 'worrying',
 'about',
 'getting',
 'lost',
 'was',
 'a',
 'stressful',


In [44]:
test_text = ["test","granted—com"]
[y for x in test_text for y in x.split("—")]

['test', 'granted', 'com']

In [90]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, max_features=1500,
                                 min_df=0.1, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

In [85]:
article_texts = [clean_article_text(a) for a in article_df["Article_Text"].tolist()]

In [91]:
tfidf_matrix = tfidf_vectorizer.fit_transform(article_texts)
tfidf_matrix.shape

(2137, 1070)

In [87]:
tfidf_vectorizer.vocabulary_

{'drive': 394,
 'cover': 341,
 'san': 866,
 'calif': 267,
 'tini': 994,
 'global': 493,
 'posit': 756,
 'receiv': 820,
 'phone': 742,
 'work': 1059,
 'googl': 497,
 'map': 645,
 'turn': 1008,
 'time': 992,
 'short': 900,
 'construct': 325,
 'differ': 380,
 'road': 851,
 'sign': 902,
 'like': 617,
 'sent': 889,
 'track': 1002,
 'sens': 886,
 'direct': 383,
 'life': 614,
 'long': 628,
 'person': 741,
 'technolog': 977,
 'tool': 1000,
 'come': 302,
 'togeth': 997,
 'ieee': 535,
 'lead': 606,
 'develop': 374,
 'push': 800,
 'earli': 400,
 'applic': 208,
 'just': 584,
 'dont': 392,
 'say': 869,
 'moment': 673,
 'meet': 660,
 'chief': 290,
 'fail': 456,
 'hard': 511,
 'vision': 1037,
 'navig': 685,
 'realiti': 816,
 'role': 854,
 'began': 244,
 'wall': 1040,
 'larg': 596,
 'water': 1044,
 'explor': 448,
 'easi': 402,
 'job': 582,
 'project': 788,
 'anoth': 201,
 'step': 952,
 'air': 191,
 'forc': 475,
 'took': 999,
 'sever': 896,
 'cours': 339,
 'opportun': 714,
 'led': 609,
 'year': 1066,
 

In [29]:
from sklearn.cluster import KMeans

In [92]:
km = KMeans(n_clusters=13)
km.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=13, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [89]:
clusters = {"URL":article_df["URL"], "Cluster":km.labels_.tolist(), "Category":article_df["Category"]}
clusters = pd.DataFrame(clusters)
pd.crosstab(clusters.Category, clusters.Cluster)

Cluster,0,1,2,3,4,5,6,7,8,9,10,11,12
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
aerospace,46,4,3,13,1,14,4,4,0,2,16,0,0
at-work,13,0,4,2,1,6,3,1,0,52,54,2,0
biomedical,66,6,20,29,7,12,0,1,1,2,16,73,1
computing,48,0,66,6,11,29,4,1,4,11,40,8,0
consumer-electronics,10,0,2,8,1,6,3,5,1,2,67,0,0
energy,23,0,1,2,0,19,97,9,2,5,8,0,0
geek-life,8,0,12,1,0,2,0,0,1,1,29,1,1
green-tech,1,0,0,9,0,4,23,5,0,2,3,0,0
robotics,6,108,29,3,0,39,1,2,5,0,26,0,190
semiconductors,7,0,8,108,37,10,4,4,0,0,14,2,0
