## Import required libraries

In [1]:
import os
import re
import string
from collections import Counter
from string import punctuation
from time import time

import gensim
import nltk
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from tqdm.notebook import tqdm

from ds_utils.config import set_display_options
from ds_utils.data import NEWS_DATA
from ds_utils.functions import vectorize

# Fetch the NLTK data used further down: the stopword lists read by
# stopwords.words("english") and the "punkt" tokenizer data (presumably what
# word_tokenize relies on). When the data is already installed the calls just
# report "already up-to-date".
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead of
# "punkt" -- confirm against the installed version.
nltk.download("stopwords")
nltk.download("punkt")
# Project helper from ds_utils.config; presumably configures pandas display
# options -- confirm in ds_utils.
set_display_options()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read data

In [2]:
# Load the raw articles. NEWS_DATA comes from ds_utils.data and is handed
# straight to read_csv, so it is presumably a file path or URL.
df = pd.read_csv(NEWS_DATA)

In [3]:
# (rows, columns) of the raw frame.
df.shape

(10437, 15)

In [4]:
# Column names available in the raw frame.
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

In [5]:
# Inspect one random article, transposed so that each column is shown as a
# row. No random_state is passed, so a different article appears on each run.
df.sample(1).T

Unnamed: 0,5900
Unnamed: 0,5900
source_id,business-insider
source_name,Business Insider
author,Eliza Relman
title,Elizabeth Warren endorsed two left-wing primary challengers to House Democrats. It could be a risky play to win Alexandria Ocasio-Cortez's endorsement.
description,"Sen. Elizabeth Warren on Monday endorsed two progressive primary challengers to sitting House Democrats. The challengers — both women — are running against two of the most conservative Democrats in the House and are both endorsed by Justice Democrats, the gro…"
url,https://www.businessinsider.com/elizabeth-warren-endorses-candidates-alexandria-ocasio-cortez-mold-2019-9
url_to_image,https://amp.businessinsider.com/images/5d78eb602e22af24c02ed525-2732-1366.jpg
published_at,2019-09-14T13:12:00Z
content,Sen. Elizabeth Warren on Monday endorsed two progressive primary challengers to sitting House Democrats — an unusual and controversial move that could be a play to win the endorsement of star freshman Rep. Alexandria Ocasio-Cortez of New York. \r\n The two cand… [+4116 chars]


## Fill missing values

In [6]:
# Fraction of missing values per column (mean of the boolean NaN mask).
df.isna().mean()

Unnamed: 0                        0.00
source_id                         0.00
source_name                       0.00
author                            0.10
title                             0.00
description                       0.00
url                               0.00
url_to_image                      0.06
published_at                      0.00
content                           0.12
top_article                       0.00
engagement_reaction_count         0.01
engagement_comment_count          0.01
engagement_share_count            0.01
engagement_comment_plugin_count   0.01
dtype: float64

In [7]:
# Replace missing article bodies with empty strings so that "content" can be
# concatenated with the other text fields below. This overwrites the column
# on the raw frame in place.
df["content"] = df["content"].fillna("")

## Generate tokens

In [8]:
# Preview how one random article reads once title, description and content are
# joined with " | ".
# Only "content" has had its NaNs filled so far; the missing-value ratios shown
# earlier are rounded to two decimals, so a few missing titles/descriptions
# cannot be ruled out. Concatenating NaN (a float) with a str raises TypeError,
# hence the fillna("") before joining.
sample_fields = (
    df.sample(1)[["title", "description", "content"]]
    .fillna("")
    .astype(str)
    .iloc[0]
)
" | ".join(sample_fields)

"Mountain Lion Bites Head of 8-year-old Boy, Child Fights Off Animal With Stick: 'I Tried to Get It in the Eye' | Bailey boy Pike Carlson needed two surgeries and may need one more after horrific attack in his own backyard. | An 8-year-old boy has spoken about how he bravely tried to fend off a mountain lion using just a stick while he was being attacked outside his home in Colorado.\r\nPike Carlson was playing in the backyard with his older brother, Gage, in the mountainous town of… [+2506 chars]"

In [9]:
# NLTK's English stopwords plus three extra words ("news", "new", "top"),
# kept as a set for the membership tests done in generate_tokens.
stop_words = set(stopwords.words("english") + ["news", "new", "top"])

def generate_tokens(text, tokenizer=word_tokenize, stop_words=stop_words):
    """Turn a raw text into a list of cleaned, lowercased tokens.

    Args:
        text: Value to tokenize; it is converted with ``str()`` first.
        tokenizer: Callable mapping a string to a list of string tokens.
        stop_words: Container of tokens to discard.

    Returns:
        list[str]: Tokens that are not stopwords, not purely numeric and
        longer than one character, in their original order.
    """
    # Substitutions applied in this exact order to the lowercased text.
    substitutions = (
        (r"\[(.*?)\]", ""),  # Remove [+XYZ chars] in content
        (r"\s+", " "),  # Remove multiple spaces in content
        (r"\w+…|…", ""),  # Remove ellipsis (and last word)
        (r"(?<=\w)-(?=\w)", " "),  # Replace dash between words
        (f"[{re.escape(string.punctuation)}]", ""),  # Remove punctuation
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Keep a token only if it is not a stopword, not all digits, and has more
    # than one character.
    return [
        tok
        for tok in tokenizer(cleaned)
        if tok not in stop_words and not tok.isdigit() and len(tok) > 1
    ]

# Show the tokenizer output for five randomly drawn articles (title +
# description). Missing fields are replaced by "" before joining, because
# concatenating NaN with a str would raise TypeError.
for _ in range(5):
    sample_fields = df.sample(1)[["title", "description"]].fillna("").astype(str).iloc[0]
    sample_text = " | ".join(sample_fields)
    print(f"SAMPLE TEXT: {sample_text}")
    print(f"TOKENS: {generate_tokens(sample_text)}")
    print("------")

SAMPLE TEXT: Players’ view: What changes will Dublin and Kerry make and who will win? | Enda Smith, Brian Hurley, Mickey Quinn and Cian Mackey all give verdicts on the replay
TOKENS: ['players', 'view', 'changes', 'dublin', 'kerry', 'make', 'win', 'enda', 'smith', 'brian', 'hurley', 'mickey', 'quinn', 'cian', 'mackey', 'give', 'verdicts', 'replay']
------
SAMPLE TEXT: Pence Staying at Trump Property in Ireland at Trump’s ‘Suggestion,’ Aide Says | The vice president, who has meetings in Dublin, is staying in Doonbeg, which is on the other side of Ireland.
TOKENS: ['pence', 'staying', 'trump', 'property', 'ireland', 'trump', 'suggestion', 'aide', 'says', 'vice', 'president', 'meetings', 'dublin', 'staying', 'doonbeg', 'side', 'ireland']
------
SAMPLE TEXT: 11 traits you need to be an effective remote worker | Working from home or outside of a traditional office setting can be ideal for some people who want to make their own schedule. But working remotely is not for everyone. It requires 

In [10]:
text_columns = ["title", "description", "content"]

# Fill missing values *before* casting: astype(str) on its own turns NaN into
# the literal string "nan", which would then survive tokenization as a token.
for col in text_columns:
    df[col] = df[col].fillna("").astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(" | ".join, axis=1)
df["tokens"] = df["text"].map(generate_tokens)

# Remove duplicates after preprocessing: keep one row per distinct token list.
# np.unique returns positions ordered by the sorted token lists, so df_proc is
# reordered relative to df.
_, idx = np.unique(df["tokens"], return_index=True)
df_proc = df.iloc[idx, :]

# Remove documents left with no tokens at all
df_proc = df_proc.loc[df_proc["tokens"].map(len) > 0]

df.shape, df_proc.shape

((10437, 17), (9882, 17))

## Review vocabulary

In [11]:
# Raw texts and their token lists, as arrays aligned row by row.
docs = df_proc["text"].values
tokenized_docs = df_proc["tokens"].values
# Corpus-wide frequency of every token.
vocab = Counter(token for doc_tokens in tokenized_docs for token in doc_tokens)

In [12]:
# Number of distinct tokens in the corpus.
len(vocab)

32454

In [13]:
# Ten most frequent tokens with their counts.
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## BoW + SVD + Normalizer

In [53]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Bag-of-words over 1- to 3-grams -> 200-dimensional truncated SVD -> row
# normalisation.
bow = CountVectorizer(
    analyzer="word",
    # stop_words is a set; recent scikit-learn releases only accept
    # "english", a list or None here, so pass a (sorted, deterministic) list.
    stop_words=sorted(stop_words),
    ngram_range=(1, 3),
    max_df=.5,
    min_df=5,
)
# Fix the seed so the embedding, and every cluster derived from it, is
# reproducible across runs.
svd = TruncatedSVD(n_components=200, random_state=42)
normalizer = Normalizer(copy=False)
vectorizer = make_pipeline(bow, svd, normalizer)
vectorized_docs = vectorizer.fit_transform(docs)

In [37]:
# (documents, SVD components) of the vectorized corpus.
vectorized_docs.shape

(9882, 200)

## Choose number of clusters

In [38]:
def generate_clusters(X, k, mb=500, random_state=42):
    """Fit MiniBatchKMeans with ``k`` clusters on ``X`` and print diagnostics.

    Prints the average silhouette score, the size/min/mean/max silhouette of
    every cluster, and the model inertia.

    Side effect: when a module-level name ``distorsions`` exists, the inertia
    of this fit is appended to it; when it does not exist nothing is recorded.

    Args:
        X: Feature matrix passed to MiniBatchKMeans and silhouette_samples.
        k: Number of clusters.
        mb: Mini-batch size (``batch_size`` of MiniBatchKMeans).
        random_state: Seed forwarded to MiniBatchKMeans.

    Returns:
        tuple: ``(clustering, cluster_labels)`` -- the fitted estimator and
        the label assigned to each row of ``X``.
    """
    clustering = MiniBatchKMeans(n_clusters=k, batch_size=mb, random_state=random_state)
    cluster_labels = clustering.fit_predict(X)
    print(f"For n_clusters = {k}")

    # The average silhouette is the mean of the per-sample values, so compute
    # the (quadratic-cost) per-sample values once and reuse them.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    silhouette_avg = sample_silhouette_values.mean()
    print(f"The average Silhouette_score is: {silhouette_avg:.2f}")

    for i in range(k):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        if ith_cluster_silhouette_values.size == 0:
            # A cluster may end up without members; min()/max() of an empty
            # array would raise, so only report its size.
            print(f"    Silhoute values for cluster {i}: Size:0")
            continue
        print(f"    Silhoute values for cluster {i}: "
        f"Size:{ith_cluster_silhouette_values.shape[0]}"
        f"| Min:{ith_cluster_silhouette_values.min():.2f}"
        f"| Avg:{ith_cluster_silhouette_values.mean():.2f}"
        f"| Max: {ith_cluster_silhouette_values.max():.2f}")

    print(f"The Inertia is :{clustering.inertia_}")
    # Best-effort bookkeeping for the elbow sweep: record the inertia only if
    # the caller created a global `distorsions` collector. A missing collector
    # is tolerated explicitly instead of hiding every error behind a bare except.
    inertia_log = globals().get("distorsions")
    if inertia_log is not None:
        inertia_log.append(clustering.inertia_)
    return clustering, cluster_labels

In [39]:
# Fit one model per candidate cluster count (2 to 24). generate_clusters
# appends each fit's inertia to this module-level list.
distorsions = []
for n_clusters in tqdm(range(2, 25)):
    generate_clusters(vectorized_docs, n_clusters)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=23.0), HTML(value='')))

For n_clusters = 2
The average Silhouette_score is: 0.01
    Silhoute values for cluster 0: Size:7851| Min:-0.02| Avg:0.00| Max: 0.03
    Silhoute values for cluster 1: Size:2031| Min:0.01| Avg:0.05| Max: 0.14
The Inertia is :9264.918827407188
For n_clusters = 3
The average Silhouette_score is: 0.02
    Silhoute values for cluster 0: Size:6157| Min:-0.03| Avg:0.00| Max: 0.03
    Silhoute values for cluster 1: Size:1399| Min:0.00| Avg:0.09| Max: 0.20
    Silhoute values for cluster 2: Size:2326| Min:-0.03| Avg:0.03| Max: 0.08
The Inertia is :9091.297413244354
For n_clusters = 4
The average Silhouette_score is: 0.03
    Silhoute values for cluster 0: Size:5033| Min:-0.05| Avg:-0.00| Max: 0.03
    Silhoute values for cluster 1: Size:1185| Min:-0.02| Avg:0.09| Max: 0.23
    Silhoute values for cluster 2: Size:1013| Min:-0.02| Avg:0.11| Max: 0.23
    Silhoute values for cluster 3: Size:2651| Min:-0.03| Avg:0.02| Max: 0.08
The Inertia is :8967.868450283417
For n_clusters = 5
The average Silh

## Analyze generated clusters

In [55]:
# Fit the model analysed in the rest of the notebook, with 50 clusters.
# Note: generate_clusters also appends this fit's inertia to `distorsions`
# when that list exists.
clustering, cluster_labels = generate_clusters(vectorized_docs, 50)

For n_clusters = 50
The average Silhouette_score is: 0.09
    Silhoute values for cluster 0: Size:255| Min:-0.13| Avg:0.03| Max: 0.15
    Silhoute values for cluster 1: Size:254| Min:-0.03| Avg:0.06| Max: 0.20
    Silhoute values for cluster 2: Size:245| Min:-0.10| Avg:0.02| Max: 0.12
    Silhoute values for cluster 3: Size:217| Min:0.02| Avg:0.21| Max: 0.38
    Silhoute values for cluster 4: Size:252| Min:-0.10| Avg:0.01| Max: 0.14
    Silhoute values for cluster 5: Size:400| Min:0.11| Avg:0.43| Max: 0.60
    Silhoute values for cluster 6: Size:47| Min:-0.06| Avg:0.09| Max: 0.22
    Silhoute values for cluster 7: Size:127| Min:0.13| Avg:0.38| Max: 0.54
    Silhoute values for cluster 8: Size:317| Min:-0.05| Avg:0.06| Max: 0.27
    Silhoute values for cluster 9: Size:193| Min:-0.10| Avg:0.02| Max: 0.12
    Silhoute values for cluster 10: Size:264| Min:-0.07| Avg:0.06| Max: 0.19
    Silhoute values for cluster 11: Size:131| Min:0.03| Avg:0.23| Max: 0.36
    Silhoute values for cluster 1

In [110]:
# Map the cluster centres from the SVD space back to the bag-of-words feature
# space, then rank each centre's features from highest to lowest weight.
# Both arrays are computed here because this cell reads them before the
# "Top terms per cluster" cell further down assigns them; without this a
# fresh Restart & Run All fails with NameError.
original_space_centroids = svd.inverse_transform(clustering.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
original_space_centroids.shape, order_centroids.shape

((50, 15389), (50, 15389))

In [114]:
# Feature (column) indices of the five highest-weighted terms of cluster 0.
order_centroids[0, :5]

array([12086,  8130, 13522,  5897,  9458])

In [65]:
print("Top terms per cluster:")
# Project the centroids back to the bag-of-words space and sort every
# centroid's features by descending weight.
original_space_centroids = svd.inverse_transform(clustering.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
# get_feature_names() was removed from newer scikit-learn releases; prefer
# get_feature_names_out() and fall back only on versions that lack it.
if hasattr(bow, "get_feature_names_out"):
    terms = bow.get_feature_names_out()
else:
    terms = bow.get_feature_names()

# One row per fitted cluster, so the loop follows the model instead of a
# hard-coded cluster count.
for i, top_indices in enumerate(order_centroids[:, :5]):
    print("Cluster %d:" % i, end='')
    for ind in top_indices:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: season league team football nfl
Cluster 1: years ago years ago three life
Cluster 2: us trump president talks administration
Cluster 3: johnson boris brexit boris johnson prime
Cluster 4: like media get facebook social
Cluster 5: national video get online world
Cluster 6: border mexico wall says trump
Cluster 7: us facebook find world messenger
Cluster 8: one world year two day
Cluster 9: time democratic presidential 2020 week
Cluster 10: man police said officer shot
Cluster 11: world cup world cup rugby ireland
Cluster 12: thursday year reuters said oil
Cluster 13: people health least year says
Cluster 14: minister prime prime minister netanyahu election
Cluster 15: would said mr could says
Cluster 16: trump president donald donald trump president donald
Cluster 17: chief executive chief executive said officer
Cluster 18: two years women year friday
Cluster 19: mugabe robert robert mugabe zimbabwe leader
Cluster 20: first time first time year set
Clus

In [67]:
# One row per document: raw text, its tokens joined by single spaces, and the
# cluster label assigned to it.
joined_tokens = list(map(" ".join, tokenized_docs))
df_clusters = pd.DataFrame(
    dict(text=docs, tokens=joined_tokens, cluster=cluster_labels)
)

### Most frequent tokens

In [90]:
# Print the five most frequent tokens (with counts) of every cluster.
# The cluster count is read from the fitted model rather than hard-coded, so
# this cell stays correct when the model is refit with another k.
for i in range(clustering.n_clusters):
    cluster_tokens = df_clusters.loc[df_clusters["cluster"] == i, "tokens"]
    # Count tokens document by document; no need to build one giant string.
    token_counts = Counter(
        token for doc_tokens in cluster_tokens for token in doc_tokens.split()
    )
    summary = "".join(
        f"{token}({count}) " for token, count in token_counts.most_common(5)
    )
    print(f"Cluster {i}: {summary}")

Cluster 0: season(195) league(159) team(120) football(96) nfl(90) 
Cluster 1: years(394) ago(106) three(66) five(36) last(35) 
Cluster 2: us(500) trump(99) president(74) talks(51) said(46) 
Cluster 3: johnson(457) boris(377) brexit(347) minister(278) prime(274) 
Cluster 4: like(210) get(79) media(65) facebook(65) social(58) 
Cluster 5: national(418) get(409) video(409) world(407) online(406) 
Cluster 6: border(110) trump(35) wall(34) us(25) near(24) 
Cluster 7: us(138) world(129) chat(127) facebook(127) messenger(127) 
Cluster 8: one(498) year(58) day(39) two(37) like(34) 
Cluster 9: democratic(152) time(138) presidential(111) sanders(73) debate(70) 
Cluster 10: man(416) police(345) officer(84) said(83) found(79) 
Cluster 11: cup(265) world(260) rugby(109) ireland(94) win(68) 
Cluster 12: year(152) thursday(147) reuters(120) said(104) russia(99) 
Cluster 13: people(445) health(67) killed(60) least(59) said(58) 
Cluster 14: minister(230) prime(191) netanyahu(104) election(87) israeli(71

### Most representative documents

In [91]:
test_cluster = 44
# Euclidean distance from every vectorized document (not only the members of
# the cluster) to the chosen centroid; the smallest distances come first.
centroid = clustering.cluster_centers_[test_cluster]
distances_to_centroid = np.linalg.norm(vectorized_docs - centroid, axis=1)
most_representative_docs = np.argsort(distances_to_centroid)
for doc_idx in most_representative_docs[:10]:
    print(docs[doc_idx])
    print("-------------")

Odessa gunman "was on a long spiral of going down" before shooting, officials say | The gunman who went on a shooting rampage in West Texas called police before and during the massacre. Authorities say the attack happened after the gunman had been fired from his job, and that he "was on a long spiral of going down" before the shooting. Mirey… | 
-------------
The Latest: Lawyers: Deal reached on Vegas shooting lawsuits | Attorneys for victims of the deadliest mass shooting in modern U.S. history say they’ve reached a settlement to resolve lawsuits that’s expected to pay between $735 million and $800 million | The Latest on a legal settlement over the deadliest mass shooting in modern U.S. history (all times local):
8:20 a.m.
Attorneys for victims of the deadliest mass shooting in modern U.S. history say theyve reached a settlement to resolve lawsuits thats expec… [+1458 chars]
-------------
Lawyer for Las Vegas shooting victims to talk developments | An attorney who represents victims 

### Random sample of documents

In [72]:
# Print up to ten randomly chosen documents of the cluster under review.
# The sample size is capped at the cluster size because sample(10) raises
# ValueError when fewer than ten documents are available.
cluster_texts = df_clusters.loc[df_clusters["cluster"] == test_cluster, "text"]
for text in cluster_texts.sample(min(10, len(cluster_texts))):
    print(text)
    print("-------------")

Robert Pattinson and Willem Dafoe spiral into madness in the fantastic horror movie, 'The Lighthouse' | Robert Pattinson and Willem Dafoe give amazingly disturbing performances as two men stranded on an island in "The Lighthouse." The movie is by writer-director Robert Eggers, who made the acclaimed 2015 horror "The Witch." This movie is more twisted and distur… | Each want the light, but only one can be mad enough to get it. 
 Robert Eggers' new movie "The Lighthouse," which is the follow-up to his acclaimed 2015 debut feature, "The Witch," shows just how talented this writer-director is — and how twisted his mind wo… [+2139 chars]
-------------
Robert Pattinson was blanking 'furious' over Batman leak | While Robert Pattinson is thrilled to be cast as the new Batman, he was less than thrilled when the news got out early. | 
-------------
World leaders react to Robert Mugabe's death | The longtime Zimbabwean leader has received a glowing tribute particularly from African leaders. | Rob