## Import required libraries

In [1]:
import os
import re
from collections import Counter
from time import time

import gensim
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

# set_random_seed is called below but was never imported, so this cell raised
# NameError on a fresh kernel (Restart & Run All).
# NOTE(review): assumed to live in ds_utils.config next to set_display_options — confirm.
from ds_utils.config import set_display_options, set_random_seed
from ds_utils.clustering import Tokenizer, load_data, clean_news_data, vectorize, mbkmeans_clusters

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Notebook-wide configuration: apply the project's display options, then seed
# the random number generators before any sampling or model fitting happens.
set_display_options()
set_random_seed()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read data

In [2]:
# Load the "news" dataset through the project's load_data helper.
df = load_data("news")

In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(10437, 15)

In [4]:
# List the available columns before cleaning.
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

## Clean data

In [5]:
# Run the project's cleaning helper on the raw frame.
# NOTE: this rebinds `df`, so re-running this cell on its own feeds the
# already-cleaned frame back into clean_news_data; re-run from the load cell.
df = clean_news_data(df)

Original dataframe: (10437, 15)
Pre-processed dataframe: (9882, 2)


In [6]:
# Inspect one random cleaned row, transposed so each column is shown on its own line.
df.sample(1).T

Unnamed: 0,6909
text,"Trump's new national security adviser is a 'nice guy', but poorly equipped for one of the toughest jobs in the White House, sources say | Robert O'Brien, Donald Trump's new national security advisor, is well respected and has proven himself before. But former colleagues of the US' former chief hostage negotiator say he lacks experience, is policy lightweight, and has little knowledge of interna… | President Trump's new national security adviser, Robert O'Brien, impressed the president with his ability to help negotiate high-profile releases of US prisoners around the world, and drew some admiration from colleagues for his competence. \r\n However, accord… [+4576 chars]"
tokens,"[trumps, national, security, adviser, nice, guy, poorly, equipped, one, toughest, jobs, white, house, sources, say, robert, obrien, donald, trumps, national, security, advisor, well, respected, proven, former, colleagues, us, former, chief, hostage, negotiator, say, lacks, experience, policy, lightweight, little, knowledge, president, trumps, national, security, adviser, robert, obrien, impressed, president, ability, help, negotiate, high, profile, releases, us, prisoners, around, world, drew, admiration, colleagues, competence, however]"


## Review tokens and vocabulary

### Tokens

In [7]:
# Draw one random article and print its raw text next to the tokens stored for it.
sample_text = df.sample(1)
raw_text = sample_text["text"].iloc[0]
token_list = sample_text["tokens"].iloc[0]
print(f"SAMPLE TEXT: {raw_text}")
print("------")
print(f"TOKENS: {token_list}")

SAMPLE TEXT: Klopp savours quality of Liverpool goals after fightback against Newcastle | Sadio Mane grabs two as Liverpool continue perfect start to season at Anfield | Liverpool 3 Newcastle 1
Jürgen Klopp hailed the quality of the goals his Liverpool side scored as the Premier League leaders came from behind to beat Newcastle 3-1 at Anfield.
The Reds were stunned as Steve Bruces Newcastle claimed a shock early lead with a… [+2641 chars]
------
TOKENS: ['klopp', 'savours', 'quality', 'liverpool', 'goals', 'fightback', 'newcastle', 'sadio', 'mane', 'grabs', 'two', 'liverpool', 'continue', 'perfect', 'start', 'season', 'anfield', 'liverpool', 'newcastle', 'jürgen', 'klopp', 'hailed', 'quality', 'goals', 'liverpool', 'side', 'scored', 'premier', 'league', 'leaders', 'came', 'behind', 'beat', 'newcastle', 'anfield', 'reds', 'stunned', 'steve', 'bruces', 'newcastle', 'claimed', 'shock', 'early', 'lead']


### Vocabulary

In [8]:
# Raw article texts and their token lists, taken from the same frame so the
# two arrays stay aligned row by row.
docs = df["text"].values
tokenized_docs = df["tokens"].values

# Corpus-wide token frequencies: count every token of every document.
vocab = Counter(tok for doc_tokens in tokenized_docs for tok in doc_tokens)

In [9]:
# Number of distinct tokens in the vocabulary.
len(vocab)

32454

In [10]:
# Ten most frequent tokens with their counts.
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## BoW + SVD + Normalizer

In [11]:
# Document embedding pipeline, fitted on the raw article texts:
#   1. TF-IDF over the project's Tokenizer, ignoring terms that appear in
#      fewer than 5 documents or in more than half of them;
#   2. truncated SVD down to 200 components;
#   3. row-wise normalisation of the reduced vectors.
# `bow` and `svd` are kept as separate names because later cells use them.
analyzer = Tokenizer()
bow = TfidfVectorizer(analyzer=analyzer, min_df=5, max_df=0.5)
svd = TruncatedSVD(n_components=200)
normalizer = Normalizer(copy=False)
vectorizer = make_pipeline(bow, svd, normalizer)
vectorized_docs = vectorizer.fit_transform(docs)

In [12]:
# Sanity check on the embedding matrix; expected to be (n_documents, 200)
# given the 200 SVD components requested above.
vectorized_docs.shape

(9882, 200)

## Generate and analyze clusters

In [13]:
# Cluster the document vectors into 50 groups with the project's
# mbkmeans_clusters helper (presumably mini-batch k-means — confirm in
# ds_utils.clustering); it also prints per-cluster silhouette statistics.
clustering, cluster_labels = mbkmeans_clusters(vectorized_docs, 50, print_silhouette_values=True)

# One row per document: raw text, its tokens joined into a single
# space-separated string, and the cluster label it was assigned.
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)

For n_clusters = 50
Silhouette coefficient: 0.07
Inertia:7382.775819185531
Silhouette values:
    Cluster 37: Size:73 | Avg:0.33 | Min:0.13 | Max: 0.48
    Cluster 19: Size:136 | Avg:0.29 | Min:0.06 | Max: 0.43
    Cluster 25: Size:119 | Avg:0.26 | Min:-0.00 | Max: 0.44
    Cluster 32: Size:106 | Avg:0.24 | Min:0.06 | Max: 0.39
    Cluster 47: Size:73 | Avg:0.22 | Min:0.03 | Max: 0.37
    Cluster 2: Size:282 | Avg:0.20 | Min:0.05 | Max: 0.36
    Cluster 40: Size:185 | Avg:0.19 | Min:0.07 | Max: 0.34
    Cluster 26: Size:362 | Avg:0.19 | Min:0.00 | Max: 0.36
    Cluster 12: Size:109 | Avg:0.18 | Min:-0.03 | Max: 0.34
    Cluster 33: Size:311 | Avg:0.17 | Min:0.01 | Max: 0.30
    Cluster 10: Size:179 | Avg:0.17 | Min:-0.02 | Max: 0.35
    Cluster 36: Size:104 | Avg:0.14 | Min:0.01 | Max: 0.28
    Cluster 41: Size:143 | Avg:0.14 | Min:0.01 | Max: 0.29
    Cluster 16: Size:107 | Avg:0.14 | Min:-0.02 | Max: 0.30
    Cluster 42: Size:87 | Avg:0.14 | Min:-0.01 | Max: 0.28
    Cluster 45: Size

### Evaluate top terms per cluster (based on clusters' centroids)

In [14]:
# Project every centroid from the SVD space back into TF-IDF term space and
# print the five highest-weighted terms of each cluster.
print("Top terms per cluster (based on centroids):")
original_space_centroids = svd.inverse_transform(clustering.cluster_centers_)
# argsort is ascending, so reverse each row to get term indices by descending weight.
order_centroids = original_space_centroids.argsort()[:, ::-1]
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and only fall back on older releases.
if hasattr(bow, "get_feature_names_out"):
    terms = bow.get_feature_names_out()
else:
    terms = bow.get_feature_names()
# Iterate over the centroids that were actually fitted rather than a
# hard-coded 50, so this cell follows the cluster count chosen earlier.
for i, ranked_term_ids in enumerate(order_centroids):
    top_terms = "".join(" %s" % terms[ind] for ind in ranked_term_ids[:5])
    print("Cluster %d:" % i + top_terms)

Top terms per cluster (based on centroids):
Cluster 0: vaping said car strike us
Cluster 1: food meat fast plant based
Cluster 2: hurricane dorian bahamas storm carolina
Cluster 3: man arrested charged old murder
Cluster 4: government nuclear deal iran anti
Cluster 5: chief executive officer ceo said
Cluster 6: south africa african korea attacks
Cluster 7: minister prime trudeau kashmir justin
Cluster 8: year old woman boy died
Cluster 9: trump president donald trumps us
Cluster 10: saudi oil arabia attacks drone
Cluster 11: credit card make know people
Cluster 12: impeachment trump inquiry house democrats
Cluster 13: two years one men women
Cluster 14: million year pay jackpot nearly
Cluster 15: gun state states us united
Cluster 16: russia russian ukraine moscow putin
Cluster 17: climate school change students high
Cluster 18: found could health study may
Cluster 19: hong kong protests protesters police
Cluster 20: police officer officers shot paris
Cluster 21: hours murder air repor

### Evaluate top terms per cluster (based on word frequencies)

In [15]:
# For every cluster, count the tokens of all its documents and print the
# five most frequent ones as "token(count)".
print("Top terms per cluster (based on words frequencies):")
# Take the cluster count from the fitted model instead of a hard-coded 50 so
# this cell stays in sync with the clustering step.
for i in range(len(clustering.cluster_centers_)):
    # The "tokens" column holds space-joined strings, so join the cluster's
    # rows and split once to recover the individual tokens.
    cluster_tokens = " ".join(df_clusters.query(f"cluster == {i}")["tokens"]).split()
    most_frequent = Counter(cluster_tokens).most_common(5)
    # Each entry keeps its trailing space, matching the original output format.
    summary = "".join(f"{term}({count}) " for term, count in most_frequent)
    print(f"Cluster {i}: {summary}")

Top terms per cluster (based on words frequencies):
Cluster 0: said(176) us(146) vaping(143) strike(104) workers(103) 
Cluster 1: food(136) meat(73) fast(59) plant(43) said(42) 
Cluster 2: hurricane(616) dorian(546) bahamas(280) storm(238) carolina(99) 
Cluster 3: man(462) year(75) old(69) said(65) police(58) 
Cluster 4: government(279) nuclear(81) deal(76) iran(67) said(66) 
Cluster 5: chief(189) executive(119) said(56) officer(37) reuters(36) 
Cluster 6: south(207) africa(68) african(55) korea(52) north(39) 
Cluster 7: minister(166) prime(98) trudeau(74) kashmir(73) justin(68) 
Cluster 8: year(466) old(374) woman(174) boy(75) died(71) 
Cluster 9: trump(697) president(464) donald(335) us(166) trumps(99) 
Cluster 10: saudi(440) oil(322) arabia(129) attacks(112) us(106) 
Cluster 11: credit(94) get(93) money(87) time(83) card(81) 
Cluster 12: impeachment(214) trump(177) house(146) president(136) inquiry(111) 
Cluster 13: two(371) years(56) one(44) us(29) image(25) 
Cluster 14: million(49

### Retrieve most representative documents (based on clusters' centroids)

In [16]:
# Rank all documents by Euclidean distance to the centroid of the cluster
# under inspection and print the ten closest ones.
test_cluster = 37
centroid = clustering.cluster_centers_[test_cluster]
distances_to_centroid = np.linalg.norm(vectorized_docs - centroid, axis=1)
most_representative_docs = np.argsort(distances_to_centroid)
for doc_idx in most_representative_docs[:10]:
    print(docs[doc_idx])
    print("-------------")
    print("-------------")

Opinion: Zimbabwe’s Robert Mugabe Ruined a Once Prosperous Country | The former dictator of Zimbabwe, Robert Mugabe, died September 6, 2019, leaving behind a legacy of economic failure and mass oppression. Image: Meng Chenguang / Zuma Wire | 
-------------
Robert Mugabe's most famous quotes | A quick look at President Mugabe's colourful language throughout his 37-year reign as leader of Zimbabwe. | Zimbabwe's Former President Robert Mugabe has died aged 95. The death was announced by his succesor Emmerson Mnangagwa who mourned him as an "icon of liberation."
"It is with the utmost sadness that I announce the passing on of Zimbabwe's founding father and… [+4531 chars]
-------------
Robert Mugabe: World Reacts to Death of Ex-president Who Ruled Zimbabwe for 37 Years—'Even Dictators Finally Die' | The 95-year-old, who died in Singapore, ruled Zimbabwe with an iron fist until toppled by a military coup in 2017. | Robert Mugabe, the former leader of Zimbabwe, has died in a Singapore hospita

### Random sample of documents for a given cluster

In [17]:
# Print a random handful of documents assigned to the cluster under inspection.
cluster_docs = df_clusters.query(f"cluster == {test_cluster}")
# sample() raises ValueError when asked for more rows than exist (without
# replacement), so cap the request at the cluster size for small clusters.
sample_size = min(10, len(cluster_docs))
# Only the text is needed, so iterate the column directly instead of
# enumerate(...iterrows()) with an unused counter.
for text in cluster_docs.sample(sample_size)["text"]:
    print(text)
    print("-------------")

Timeline: Key dates in the life of Robert Mugabe | Zimbabwe's former leader died on Friday in Singapore, at the age of 95. | Zimbabwe's former President Robert Mugabe has died at the age of 95.
The rebel who led Zimbabwe to independence and ruled the country for 37 years died on Friday in Singapore, where he had often visited in recent years for medical treatment.
Below are the k… [+4172 chars]
-------------
Robert Mugabe's most famous quotes | A quick look at President Mugabe's colourful language throughout his 37-year reign as leader of Zimbabwe. | Zimbabwe's Former President Robert Mugabe has died aged 95. The death was announced by his succesor Emmerson Mnangagwa who mourned him as an "icon of liberation."
"It is with the utmost sadness that I announce the passing on of Zimbabwe's founding father and… [+4531 chars]
-------------
Robert Mugabe to be buried next week in his village: Family | The ex-Zimbabwean leader's family opposes government plan to bury him at the national monumen