In [1]:
# some potentially interesting clusting info at https://nlp.stanford.edu/IR-book/html/htmledition/flat-clustering-1.html
# filter out "Photo/Illustration by" up to pair of \n
# split on longer dash character
# http://brandonrose.org/clustering
from collections import Counter
import re

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [65]:
def clean_article_text(article_text):
    """
    Removes a few obvious irregularities from the article
    text (e.g., captions for images)."""
    photo_caption_regex = "\n\n(Photo|Gif):.*?\n\n"
    photo_at_start_of_article_regex = "^Photo:.*?\n\n"
    article_text = re.sub(photo_caption_regex, "\n\n", article_text)
    article_text = re.sub(photo_at_start_of_article_regex, "", article_text)
    article_text = re.sub("\n\n"," ",article_text)
    article_text = article_text.lower()
    return article_text

def top_n_terms_row(tfidf_row, n = 30):
    """
    Gets the top n terms from a single row of a tf-idf matrix.
    """
    ordering = tfidf_row.sort_values(ascending = False).index
    return ordering[:n].tolist()

def top_n_terms_cluster(tfidf_df, clusters, cnumber, n_terms):
    """
    Approximation of what the most popular terms in a cluster are.
    Returns important terms from each row (article), and then sees
    which are the most common among those for all articles.
    """
    cluster_n = clusters[clusters["Cluster"] == cnumber]
    cluster_indices = cluster_n.index.tolist()
    cluster_tfidf = tfidf_df.iloc[cluster_indices,:]
    important_terms = cluster_tfidf.apply(top_n_terms_row, axis = 1)
    return Counter([x for y in important_terms.values for x in y]).most_common(n_terms)

In [3]:
# Merge and clean the articles
article_df = pd.read_csv("article_df.csv", sep = "\t")
article_texts = [clean_article_text(a) for a in article_df["Article_Text"].tolist()]

In [68]:
# Generate the tf-idf matrix
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, max_features=1500,
                                   min_df=0.1, stop_words='english',
                                   use_idf=True, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(article_texts)

In [71]:
# clustering attempt with 13 clusters (one for each category)
km = KMeans(n_clusters=13)
km.fit(tfidf_matrix)

clusters = {"URL":article_df["URL"], "Cluster":km.labels_.tolist(), "Category":article_df["Category"]}
clusters = pd.DataFrame(clusters)
pd.crosstab(clusters["Category"], clusters["Cluster"])#.to_csv("13 clusters.csv")

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=13, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [13]:
# clustering attempt with 26 clusters (one for each category)
km2 = KMeans(n_clusters=26)
km2.fit(tfidf_matrix)

clusters2 = {"URL":article_df["URL"], "Cluster":km2.labels_.tolist(), "Category":article_df["Category"]}
clusters2 = pd.DataFrame(clusters2)
pd.crosstab(clusters2["Category"], clusters2["Cluster"])#.to_csv("26 clusters.csv")

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=26, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [27]:
# get the top terms for each of the 13 clusters in the first case
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf_vectorizer.get_feature_names())
top_n_per_cluster = [top_n_terms_cluster(tfidf_df, clusters, x, n_terms = 10) for x in range(13)]
pd.DataFrame([[x[0] for x in y] for y in top_n_per_cluster])