## Import required libraries

In [62]:
import os
from collections import Counter
from time import time

import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

from ds_utils.config import set_display_options
from ds_utils.clustering import Tokenizer, load_data, clean_news_data, vectorize, mbkmeans_clusters

set_display_options()

## Read data

In [63]:
df = load_data("news")

In [64]:
df.shape

(10437, 15)

In [65]:
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

## Clean data

In [66]:
df = clean_news_data(df)

Original dataframe: (10437, 15)
Pre-processed dataframe: (9882, 2)


In [67]:
df.sample(1).T

Unnamed: 0,10248
text,Trump's conspiracies are reaching a fever pitch amid revelations that the whistleblower went to Congress before filing their complaint | Trumpworld flew into a frenzy after The New York Times published a story revealing that a CIA officer who filed a whistleblower complaint against President Donald Trump approached a House Intelligence Committee aide with their concerns before filing the compl… | Trumpworld seized on a New York Times story Wednesday that revealed a CIA officer who filed a whistleblower complaint against President Donald Trump approached a House Intelligence Committee aide with their concerns before filing the complaint.\r\nThe official … [+6873 chars]
tokens,"[trumps, conspiracies, reaching, fever, pitch, amid, revelations, whistleblower, went, congress, filing, complaint, trumpworld, flew, frenzy, york, times, published, story, revealing, cia, officer, filed, whistleblower, complaint, president, donald, trump, approached, house, intelligence, committee, aide, concerns, filing, trumpworld, seized, york, times, story, wednesday, revealed, cia, officer, filed, whistleblower, complaint, president, donald, trump, approached, house, intelligence, committee, aide, concerns, filing, complaint, official]"


## Review tokens and vocabulary

### Tokens

In [68]:
sample_text = df.sample(1)
print(f"SAMPLE TEXT: {sample_text['text'].values[0]}")
print(f"------")
print(f"TOKENS: {sample_text['tokens'].values[0]}")

SAMPLE TEXT: How to post on LinkedIn to share job news, articles, and more with your network, and edit or delete your posts | You can post on LinkedIn to share articles, anecdotes, or job listings with your network. You can also make posts public on LinkedIn, so anyone can find your post, or share your post on other connected social media networks. After you post on LinkedIn, it's e… | LinkedIn is the world's largest professional social network, where CEOs, designers, executives, artists, and experts of all kinds can share job listings, articles, and posts ( though, there are some things you should avoid sharing on LinkedIn). 
 Whether you… [+1912 chars]
------
TOKENS: ['post', 'linkedin', 'share', 'job', 'articles', 'network', 'edit', 'delete', 'posts', 'post', 'linkedin', 'share', 'articles', 'anecdotes', 'job', 'listings', 'network', 'also', 'make', 'posts', 'public', 'linkedin', 'anyone', 'find', 'post', 'share', 'post', 'connected', 'social', 'media', 'networks', 'post', 'linkedin

### Vocabulary

In [69]:
docs = df["text"].values
tokenized_docs = df["tokens"].values
vocab = Counter()
for token in tokenized_docs:
    vocab.update(token)

In [70]:
len(vocab)

32454

In [71]:
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Train Word2Vec model

In [72]:
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=4, seed=42)

In [73]:
model.wv.most_similar("trump")

[('trumps', 0.9820594191551208),
 ('president', 0.9721266627311707),
 ('breitbart', 0.9520003199577332),
 ('donald', 0.9352222681045532),
 ('ivanka', 0.9282146692276001),
 ('pences', 0.9265344142913818),
 ('impeachment', 0.9258443713188171),
 ('vice', 0.9133374094963074),
 ('biden', 0.909093976020813),
 ('inquiry', 0.9052674770355225)]

In [74]:
model.wv.most_similar("facebook")

[('chat', 0.9613429307937622),
 ('messenger', 0.9329640865325928),
 ('find', 0.9031698107719421),
 ('wjla', 0.8943736553192139),
 ('google', 0.8838839530944824),
 ('commissions', 0.8639299273490906),
 ('online', 0.863520622253418),
 ('gambling', 0.8563323616981506),
 ('zuckerberg', 0.8453306555747986),
 ('interviews', 0.837069034576416)]

## Generate vectors from documents

In [75]:
vectorized_docs = vectorize(tokenized_docs, model=model, strategy="average")
len(vectorized_docs), len(vectorized_docs[0])

(9882, 100)

## Generate and analyze clusters

In [77]:
clustering, cluster_labels = mbkmeans_clusters(vectorized_docs, 50, print_silhouette_values=True)
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})

For n_clusters = 50
Silhouette coefficient: 0.11
Inertia:3672.275531624991
Silhouette values:
    Cluster 31: Size:34 | Avg:0.46 | Min:0.22 | Max: 0.62
    Cluster 9: Size:99 | Avg:0.33 | Min:0.02 | Max: 0.53
    Cluster 34: Size:31 | Avg:0.32 | Min:0.01 | Max: 0.52
    Cluster 6: Size:134 | Avg:0.32 | Min:-0.01 | Max: 0.52
    Cluster 21: Size:107 | Avg:0.27 | Min:-0.03 | Max: 0.49
    Cluster 22: Size:86 | Avg:0.27 | Min:-0.05 | Max: 0.47
    Cluster 4: Size:112 | Avg:0.27 | Min:-0.08 | Max: 0.47
    Cluster 15: Size:133 | Avg:0.26 | Min:-0.00 | Max: 0.42
    Cluster 45: Size:54 | Avg:0.26 | Min:-0.04 | Max: 0.50
    Cluster 17: Size:150 | Avg:0.26 | Min:-0.05 | Max: 0.50
    Cluster 43: Size:86 | Avg:0.24 | Min:0.01 | Max: 0.43
    Cluster 13: Size:119 | Avg:0.23 | Min:-0.04 | Max: 0.43
    Cluster 33: Size:174 | Avg:0.18 | Min:-0.07 | Max: 0.41
    Cluster 12: Size:489 | Avg:0.17 | Min:-0.01 | Max: 0.37
    Cluster 41: Size:253 | Avg:0.16 | Min:0.01 | Max: 0.37
    Cluster 11: Size

### Evaluate top terms per cluster (based on clusters' centroids)

In [89]:
print("Top terms per cluster (based on centroids):")
for i in range(50):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=5)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Top terms per cluster (based on centroids):
Cluster 0: started produced roger holes given 
Cluster 1: legal cabinet commissioner intention dominic 
Cluster 2: aides envoy erdogan congressional defended 
Cluster 3: indiana attempted custody relatives soldier 
Cluster 4: trying detective wwii footage serial 
Cluster 5: imposing kiev continuing renewed crisis 
Cluster 6: taoiseach varadkar leo delay simon 
Cluster 7: single saw middlesex newcastle espana 
Cluster 8: steven situation lawyers confirms singapore 
Cluster 9: category ravaged landfall tropical abaco 
Cluster 10: boys pregnant philippines walking scene 
Cluster 11: zelensky suggestion whistleblowers cnnpresident emmanuel 
Cluster 12: finding honor chef mayo highly 
Cluster 13: qualifying warm argentina finals mens 
Cluster 14: convicted jury slaying pleaded guyger 
Cluster 15: bulletin popularity reviews creator interact 
Cluster 16: parties constructive agree welcomed breakthrough 
Cluster 17: islands coastal carolinas charles

### Evaluate top terms per cluster (based on words frequencies)

In [90]:
for i in range(50):
    tokens_per_cluster = ""
    most_frequent = Counter(" ".join(df_clusters.query(f"cluster == {i}")["tokens"]).split()).most_common(5)
    for t in most_frequent:
        tokens_per_cluster += f"{t[0]}({str(t[1])}) "
    print(f"Cluster {i}: {tokens_per_cluster}")

Cluster 0: one(151) first(99) year(95) two(91) time(87) 
Cluster 1: minister(122) said(68) government(65) prime(59) election(50) 
Cluster 2: us(172) trump(139) president(108) donald(75) said(71) 
Cluster 3: police(160) man(109) said(87) court(75) year(72) 
Cluster 4: world(111) find(111) video(106) online(106) get(105) 
Cluster 5: us(298) said(133) reuters(103) states(99) united(90) 
Cluster 6: johnson(294) brexit(273) minister(227) boris(226) prime(217) 
Cluster 7: world(103) first(93) ireland(91) time(70) united(64) 
Cluster 8: said(118) president(105) us(101) court(98) former(90) 
Cluster 9: hurricane(263) dorian(221) storm(130) bahamas(103) carolina(70) 
Cluster 10: year(132) man(125) old(124) years(121) two(101) 
Cluster 11: trump(367) president(269) donald(171) house(122) bolton(103) 
Cluster 12: like(68) says(59) one(49) could(40) also(38) 
Cluster 13: world(266) cup(245) rugby(97) ireland(77) win(62) 
Cluster 14: police(238) man(165) year(118) old(103) found(88) 
Cluster 15: vi

### Retrieve most representative documents (based on clusters' centroids)

In [91]:
test_cluster = 31
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
for d in most_representative_docs[:10]:
    print(docs[d])
    print("-------------")

Wilbur Ross 'Needs to Resign Now' If He Threatened to Fire Top NOAA Officials for Tweet Contradicting Trump on Hurricane Dorian, Democrat Says | As President Donald Trump claimed Hurricane Dorian could hit Alabama, the National Weather Service tweeted to correct the rumors. | Commerce Secretary Wilbur Ross is facing calls to resign over a report he threatened to fire top officials at NOAA for a tweet disputing President Donald Trump's claim that Hurricane Dorian would hit Alabama.
"If that story is true, and I don't know that it … [+3709 chars]
-------------
Dorian, Comey and Debra Messing: What Trump tweeted on Labor Day weekend | President Donald Trump axed his visit to Poland over the weekend to monitor Hurricane Dorian from Camp David with emergency management staff, but if the President's more than 120 tweets are any indication, he had more than just the storm on his mind. | Washington (CNN)President Donald Trump axed his visit to Poland over the weekend to monitor Hurricane Doria

### Retrieve a random sample of documents for a given cluster

In [28]:
for i,t in enumerate(df_clusters.query(f"cluster == {test_cluster}").sample(10).iterrows()):
    print(t[1]["text"])
    print("-------------")

Hong Kong Police Officer Shoots Protester | A protester was shot with a live round by Hong Kong police Tuesday during widespread demonstrations against China’s National Day Celebration. Photo:CAMPUS TV/HKUSU | 
-------------
Amnesty accuses Hong Kong police of arbitrary arrests, torture | Amnesty International accused Hong Kong police on Friday of torture and other abuses in their handling of more than three months of pro-democracy protests, but the police say they have shown restraint on the street in the face of increased violence. | HONG KONG (Reuters) - Amnesty International accused Hong Kong police on Friday of torture and other abuses in their handling of more than three months of pro-democracy protests, but the police say they have shown restraint on the street in the face of increas… [+3967 chars]
-------------
'We won't give up': Hong Kong protesters defiant after shooting | Shooting of 18-year-old took place during angry demonstrations on Tuesday, prompting mass vigils, more 