## Imports

In [31]:
import re
import string

import nltk
import numpy as np
import pandas as pd

from collections import Counter 

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# word_tokenize (used when cleaning the text) needs the Punkt tokenizer models in
# addition to the stopword lists, so fetch every NLTK resource the notebook relies on.
# NOTE(review): newer NLTK releases look up "punkt_tab" instead of "punkt"; both are
# requested so the notebook runs on either — confirm against the pinned NLTK version.
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Read data

In [32]:
# Load the raw news articles; the path is relative to the notebook's working directory.
df_raw = pd.read_csv("data/news_data.csv")

In [33]:
# Peek at three random rows (no random_state is passed, so the rows differ between runs).
df_raw.sample(3)

Unnamed: 0.1,Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
6965,6965,business-insider,Business Insider,Monica Chin,The Google Pixel 3 is $350 off at Best Buy — a...,"Google's Pixel 3 is an excellent, affordable s...",https://www.businessinsider.com/pixel-3-google...,https://amp.businessinsider.com/images/5d839b4...,2019-09-19T16:22:00Z,It's official: The Google Pixel 4 is coming. G...,0.0,1.0,1.0,3.0,0.0
7239,7239,newsweek,Newsweek,Jeffery Martin,"Obama Jabs Trump for Watching TV, Reading Soci...","At a tech conference, former President Barack ...",https://www.newsweek.com/obama-jabs-trump-watc...,https://d.newsweek.com/en/full/1528639/trump-r...,2019-09-19T03:56:35Z,"Speaking at a tech conference Wednesday, forme...",0.0,177.0,21.0,18.0,0.0
8304,8304,bbc-news,BBC News,,Hong Kong protesters hit the streets as China ...,An activist involved in anti-government protes...,https://www.bbc.co.uk/programmes/w172wq4x78lr607,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,2019-10-01T14:31:00Z,An activist involved in anti-government protes...,0.0,0.0,0.0,0.0,0.0


## Clean data

### Define function to clean and tokenize

In [34]:
def clean_text(text, tokenizer, stopwords):
    """Normalize a raw string and split it into filtered tokens.

    Args:
        text: Text to tokenize. It is converted with ``str()`` first, so
            non-string values are accepted.
        tokenizer: Callable that takes the cleaned string and returns its tokens.
        stopwords: Container of tokens to discard (checked with ``in``).

    Returns:
        List of lowercase tokens with stopwords, purely numeric tokens and
        single-character tokens removed.
    """
    # (pattern, replacement) pairs, applied in this exact order
    substitutions = (
        (r"\[(.*?)\]", ""),  # Drop bracketed text such as "[+XYZ chars]"
        (r"\s+", " "),  # Collapse runs of whitespace
        (r"\w+…|…", ""),  # Drop ellipsis together with the truncated word before it
        (r"(?<=\w)-(?=\w)", " "),  # Split hyphenated words
        (f"[{re.escape(string.punctuation)}]", ""),  # Strip ASCII punctuation
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return [
        token
        for token in tokenizer(cleaned)
        if token not in stopwords and not token.isdigit() and len(token) > 1
    ]

### Apply function and remove duplicates

In [35]:
custom_stopwords = set(stopwords.words("english") + ["news", "new", "top"])
text_columns = ["title", "description", "content"]

df = df_raw.copy()

# Fill missing values *before* casting to str: astype(str) alone turns NaN into the
# literal string "nan", which would then survive cleaning as a bogus token.
for col in text_columns:
    df[col] = df[col].fillna("").astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Remove duplicates after preprocessing. np.unique returns the index of the first
# occurrence of each distinct token list, ordered by the sorted token lists.
idx = np.unique(df["tokens"], return_index=True)[1]
df = df.iloc[idx, :]

# Remove empty values
df = df.loc[df.tokens.map(lambda x: len(x) > 0), ["text", "tokens"]]

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (10437, 15)
Pre-processed dataframe: (9882, 2)


### Check vocabulary

In [36]:
docs = df["text"].values
tokenized_docs = df["tokens"].values
# Corpus-wide term frequencies, accumulated over the tokens of every document
vocab = Counter(term for doc_tokens in tokenized_docs for term in doc_tokens)

In [37]:
# Ten most frequent tokens across the whole corpus
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Generate vectors from documents

### Define function for creating a single vector from word embeddings

In [6]:
def vectorize(list_of_docs, model):
    """Generate one vector per document by averaging its word embeddings.

    Args:
        list_of_docs: Iterable of tokenized documents (each a sequence of tokens).
        model: Gensim's Word Embedding; must expose ``wv`` (token lookup that
            supports ``in`` and ``[]``) and ``vector_size``.

    Returns:
        List of document vectors, one per input document and in the same order.
        A document without any in-vocabulary token gets a zero vector of
        length ``model.vector_size``.
    """
    features = []

    for tokens in list_of_docs:
        # The membership test already filters out-of-vocabulary tokens, so the
        # lookup needs no extra KeyError handling.
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Only allocate the fallback vector when it is actually needed
            features.append(np.zeros(model.vector_size))
    return features

### Apply function to previously pre-processed text

In [38]:
# Train 100-dimensional Word2Vec embeddings on the cleaned tokens, with a fixed seed
# and a single worker.
# NOTE(review): gensim's documentation also mentions PYTHONHASHSEED for reproducibility
# across interpreter launches — confirm if exact repeatability matters.
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=42)

In [21]:
# Sanity-check the embeddings: tokens closest to "trump" in the learned vector space
model.wv.most_similar("trump")

[('trumps', 0.988541841506958),
 ('president', 0.9746493697166443),
 ('donald', 0.9274922013282776),
 ('ivanka', 0.9203903079032898),
 ('impeachment', 0.9195784330368042),
 ('pences', 0.9152231812477112),
 ('avlon', 0.9148306846618652),
 ('biden', 0.9146010279655457),
 ('breitbart', 0.9144087433815002),
 ('vice', 0.9067237973213196)]

In [28]:
# Build one averaged embedding per document; the displayed tuple is
# (number of documents, length of the first document vector).
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(9882, 100)

### Generate and analyze clusters

In [43]:
def mbkmeans_clusters(X, k, mb=500, print_silhouette_values=False, random_state=42):
    """Cluster X with mini-batch k-means and print quality metrics.

    Prints the number of clusters, the overall silhouette coefficient and the
    inertia; optionally prints per-cluster silhouette statistics sorted by
    average silhouette value (best first).

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches. Defaults to 500.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Seed forwarded to MiniBatchKMeans so repeated runs yield
            the same clusters. Pass None for a different result on every run.
            Defaults to 42.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)

    # The overall silhouette coefficient is the mean of the per-sample values, so
    # compute the expensive pairwise distances once and reuse them below.
    sample_silhouette_values = silhouette_samples(X, km.labels_)

    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {sample_silhouette_values.mean():0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            if cluster_silhouette_values.size == 0:
                # A cluster without members has no statistics to report, and
                # min()/max() would raise on the empty array.
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [44]:
# Fit 50 clusters on the document vectors and report per-cluster silhouette statistics
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=50,
    print_silhouette_values=True,
)

# Keep the raw text, the space-joined tokens and the assigned cluster side by side
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)

For n_clusters = 50
Silhouette coefficient: 0.11
Inertia:3561.4632984073837
Silhouette values:
    Cluster 48: Size:56 | Avg:0.34 | Min:-0.02 | Max: 0.54
    Cluster 42: Size:112 | Avg:0.33 | Min:-0.01 | Max: 0.54
    Cluster 4: Size:138 | Avg:0.32 | Min:-0.01 | Max: 0.52
    Cluster 21: Size:84 | Avg:0.31 | Min:-0.06 | Max: 0.53
    Cluster 9: Size:35 | Avg:0.28 | Min:0.03 | Max: 0.52
    Cluster 28: Size:132 | Avg:0.27 | Min:-0.09 | Max: 0.52
    Cluster 12: Size:253 | Avg:0.26 | Min:-0.00 | Max: 0.47
    Cluster 26: Size:60 | Avg:0.26 | Min:-0.04 | Max: 0.51
    Cluster 30: Size:122 | Avg:0.25 | Min:-0.06 | Max: 0.46
    Cluster 0: Size:122 | Avg:0.25 | Min:-0.02 | Max: 0.47
    Cluster 40: Size:107 | Avg:0.23 | Min:-0.07 | Max: 0.48
    Cluster 20: Size:140 | Avg:0.23 | Min:-0.08 | Max: 0.46
    Cluster 7: Size:182 | Avg:0.19 | Min:-0.03 | Max: 0.39
    Cluster 45: Size:57 | Avg:0.18 | Min:-0.03 | Max: 0.41
    Cluster 33: Size:181 | Avg:0.17 | Min:0.01 | Max: 0.38
    Cluster 22: 

In [45]:
print("Top terms per cluster (based on centroids):")
# Iterate over the fitted centroids rather than a hard-coded count, so this cell
# stays in sync with whatever number of clusters was used when fitting.
for i, centroid in enumerate(clustering.cluster_centers_):
    # The five vocabulary words whose embeddings are most similar to the centroid
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    tokens_per_cluster = "".join(f"{term} " for term, _similarity in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Top terms per cluster (based on centroids):
Cluster 0: serial trying shocked contained passenger 
Cluster 1: rare mosquito train borne commercial 
Cluster 2: arrest funeral illinois dissident founding 
Cluster 3: kiev departments spokeswoman repeal saikawa 
Cluster 4: pm proposals johnsons delay benjamin 
Cluster 5: december plunged analysts total spring 
Cluster 6: aides senate congressional pelosi request 
Cluster 7: speech referendum labour donohoe leo 
Cluster 8: lilinow path heavy survivors projected 
Cluster 9: doonbeg disagreed macron emmanuel administrations 
Cluster 10: stabbing amber neighbor botham guyger 
Cluster 11: winning takes injury fifth points 
Cluster 12: likes popularity ai tips access 
Cluster 13: represents truly shareholders laid planning 
Cluster 14: zelensky volodymyr whistleblowers ukrainian impeach 
Cluster 15: apartment murdering suspicion fatal girl 
Cluster 16: prize throughout tops raw pittsburgh 
Cluster 17: madrid coaches bulgaria zhengzhou anfield 
Cl

In [47]:
test_cluster = 48

# Rank every document by Euclidean distance to the chosen centroid (closest first)
centroid = clustering.cluster_centers_[test_cluster]
distances_to_centroid = np.linalg.norm(vectorized_docs - centroid, axis=1)
most_representative_docs = np.argsort(distances_to_centroid)

# Show the ten documents nearest to the centroid
for doc_index in most_representative_docs[:10]:
    print(docs[doc_index])
    print("-------------")

Dorian, Comey and Debra Messing: What Trump tweeted on Labor Day weekend | President Donald Trump axed his visit to Poland over the weekend to monitor Hurricane Dorian from Camp David with emergency management staff, but if the President's more than 120 tweets are any indication, he had more than just the storm on his mind. | Washington (CNN)President Donald Trump axed his visit to Poland over the weekend to monitor Hurricane Dorian from Camp David with emergency management staff, but if the President's more than 120 tweets are any indication, he had more than just the storm on hi… [+3027 chars]
-------------
Lead NOAA scientist vows to probe agency's defense of Trump | The Acting Head of NOAA, the government weather agency, goes to Alabama Tuesday to address a recent uproar over Hurricane Dorian. President Trump repeatedly and wrongly claimed the hurricane was a threat to Alabama. NOAA was criticized for saying one of its f… | 
-------------
The ÜS president in a weekend tweet had nam