## Import required libraries

In [1]:
import sys
sys.path.append("../")

import os
from collections import Counter
from time import time

import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

from ds_utils.config import set_display_options, set_random_seed
from ds_utils.clustering import Tokenizer, load_data, clean_news_data, vectorize, mbkmeans_clusters

# Project helpers from ds_utils.config, called with their defaults.
# NOTE(review): presumably these configure pandas display settings and seed the
# global RNGs; later cells call `.sample()` without `random_state`, so their
# output is only reproducible if the seeding here covers NumPy — confirm in ds_utils.
set_display_options()
set_random_seed()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read data

In [2]:
# Load the "news" dataset through the project loader (ds_utils.clustering.load_data).
df = load_data("news")

In [3]:
# (rows, columns) of the raw frame, for comparison with the cleaned frame below.
df.shape

(10437, 15)

In [4]:
# List the raw column names to see which fields are available before cleaning.
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

## Clean data

In [5]:
# Replace the raw frame with its cleaned version (the helper prints before/after shapes).
# NOTE(review): `df` is rebound here, so re-running this cell feeds the already
# cleaned frame back into clean_news_data — restart the kernel (or reload `df`)
# before re-running; confirm whether the helper tolerates cleaned input.
df = clean_news_data(df)

Original dataframe: (10437, 15)
Pre-processed dataframe: (9882, 2)


In [6]:
# Show one random cleaned row; transposed so each column is displayed on its own line.
df.sample(1).T

Unnamed: 0,8103
text,Indian PM dodges mention at UN of disputed region of Kashmir | Indian Prime Minister Narendra Modi broadly denounced terrorism at the United Nations on Friday but avoided any mention of India's crackdown in the disputed Himalayan region of Kashmir | Indian Prime Minister Narendra Modi broadly denounced terrorism at the United Nations on Friday but avoided any direct mention of one of the world's most perilous standoffs: Pakistan's recent warning that India's crackdown in the disputed region of Kashmir ri… [+3209 chars]
tokens,"[indian, pm, dodges, mention, un, disputed, region, kashmir, indian, prime, minister, narendra, modi, broadly, denounced, terrorism, united, nations, friday, avoided, mention, indias, crackdown, disputed, himalayan, region, kashmir, indian, prime, minister, narendra, modi, broadly, denounced, terrorism, united, nations, friday, avoided, direct, mention, one, worlds, perilous, standoffs, pakistans, recent, warning, indias, crackdown, disputed, region, kashmir]"


## Review tokens and vocabulary

### Tokens

In [7]:
# Draw one random article and show its raw text next to the tokens derived from it.
sample_text = df.sample(1)
raw_text = sample_text["text"].iloc[0]
token_list = sample_text["tokens"].iloc[0]
print(f"SAMPLE TEXT: {raw_text}", "------", f"TOKENS: {token_list}", sep="\n")

SAMPLE TEXT: 3 SEAL Team 7 leaders fired for team's alleged misconduct | SEAL Team 7's three top leaders have been relieved of duty because of allegations of misconduct involving a unit under their command. | In a rare move, the three senior leaders of the U.S. Navy's elite SEAL Team 7 have been removed from their positions due to a "loss of confidence" after allegations of misconduct among one of the platoons under their command that was sent home from Iraq.
The… [+4085 chars]
------
TOKENS: ['seal', 'team', 'leaders', 'fired', 'teams', 'alleged', 'misconduct', 'seal', 'team', '7s', 'three', 'leaders', 'relieved', 'duty', 'allegations', 'misconduct', 'involving', 'unit', 'command', 'rare', 'move', 'three', 'senior', 'leaders', 'us', 'navys', 'elite', 'seal', 'team', 'removed', 'positions', 'due', 'loss', 'confidence', 'allegations', 'misconduct', 'among', 'one', 'platoons', 'command', 'sent', 'home', 'iraq']


### Vocabulary

In [8]:
# Raw texts and token lists as parallel arrays, both in the row order of `df`.
docs = df["text"].values
tokenized_docs = df["tokens"].values

# Corpus-wide term frequencies: count every token of every document.
vocab = Counter(token for doc_tokens in tokenized_docs for token in doc_tokens)

In [9]:
# Number of distinct tokens in the corpus.
len(vocab)

32454

In [10]:
# The ten most frequent tokens together with their counts.
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Train Word2Vec model

In [11]:
# Train 100-dimensional word embeddings on the tokenized articles.
# workers=1 together with a fixed seed is what gensim requires for a repeatable
# run (multi-threaded training introduces ordering jitter). Per the gensim docs,
# reproducibility across interpreter launches additionally needs PYTHONHASHSEED
# to be set in the environment — NOTE(review): confirm the kernel sets it.
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=42)

In [12]:
# Sanity-check the embeddings: words most similar to "trump", with similarity scores.
model.wv.most_similar("trump")

[('trumps', 0.988541841506958),
 ('president', 0.9746493697166443),
 ('donald', 0.9274922013282776),
 ('ivanka', 0.9203903079032898),
 ('impeachment', 0.9195784330368042),
 ('pences', 0.9152231812477112),
 ('avlon', 0.9148306846618652),
 ('biden', 0.9146010279655457),
 ('breitbart', 0.9144087433815002),
 ('vice', 0.9067237973213196)]

In [13]:
# Second sanity check on a different topic: words most similar to "facebook".
model.wv.most_similar("facebook")

[('chat', 0.9635254740715027),
 ('gambling', 0.9399046301841736),
 ('google', 0.9298744797706604),
 ('messenger', 0.9281919598579407),
 ('find', 0.9147608280181885),
 ('interviews', 0.8606226444244385),
 ('commissions', 0.8595864176750183),
 ('analysis', 0.8582143187522888),
 ('online', 0.8573527932167053),
 ('whats', 0.8550292253494263)]

## Generate vectors from documents

In [14]:
# Turn every tokenized document into a single vector using the trained model.
# NOTE(review): strategy="average" presumably averages the word vectors of each
# document's tokens — confirm in ds_utils.clustering.vectorize.
vectorized_docs = vectorize(tokenized_docs, model=model, strategy="average")
# Show (number of document vectors, length of the first vector) as the cell output.
len(vectorized_docs), len(vectorized_docs[0])

(9882, 100)

## Generate and analyze clusters

In [15]:
# Cluster the document vectors into 50 groups; the helper also prints the
# silhouette statistics shown in the cell output.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=50,
    print_silhouette_values=True,
)

# One row per document: raw text, space-joined tokens, and the assigned cluster.
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)

For n_clusters = 50
Silhouette coefficient: 0.11
Inertia:3568.342791047967
Silhouette values:
    Cluster 29: Size:50 | Avg:0.39 | Min:0.01 | Max: 0.59
    Cluster 35: Size:30 | Avg:0.34 | Min:0.05 | Max: 0.54
    Cluster 37: Size:58 | Avg:0.32 | Min:0.09 | Max: 0.51
    Cluster 39: Size:81 | Avg:0.31 | Min:-0.05 | Max: 0.52
    Cluster 27: Size:63 | Avg:0.28 | Min:0.02 | Max: 0.46
    Cluster 6: Size:101 | Avg:0.27 | Min:0.02 | Max: 0.46
    Cluster 24: Size:120 | Avg:0.26 | Min:-0.04 | Max: 0.46
    Cluster 49: Size:65 | Avg:0.26 | Min:-0.03 | Max: 0.47
    Cluster 47: Size:53 | Avg:0.23 | Min:0.01 | Max: 0.45
    Cluster 22: Size:78 | Avg:0.22 | Min:-0.01 | Max: 0.43
    Cluster 45: Size:38 | Avg:0.21 | Min:-0.07 | Max: 0.41
    Cluster 32: Size:148 | Avg:0.21 | Min:-0.04 | Max: 0.40
    Cluster 19: Size:562 | Avg:0.18 | Min:-0.00 | Max: 0.40
    Cluster 31: Size:612 | Avg:0.16 | Min:-0.01 | Max: 0.37
    Cluster 23: Size:96 | Avg:0.16 | Min:-0.04 | Max: 0.38
    Cluster 14: Size:21

### Evaluate top terms per cluster (based on clusters' centroids)

In [16]:
# Describe each cluster by the five vocabulary terms that model.wv.most_similar
# ranks highest for the cluster's centroid vector.
print("Top terms per cluster (based on centroids):")
# Iterate over the fitted centroids instead of a hard-coded range(50), so this
# cell stays correct if the number of clusters chosen when clustering changes.
for i, centroid in enumerate(clustering.cluster_centers_):
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    # most_similar yields (term, similarity) pairs; only the term is printed.
    tokens_per_cluster = "".join(f"{term} " for term, _ in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Top terms per cluster (based on centroids):
Cluster 0: suspend swinson block speech warned 
Cluster 1: obama tweet blower praised republicans 
Cluster 2: microsoft companys fintech headset revenue 
Cluster 3: founding burial attacking jr gang 
Cluster 4: flagship managers lenders institute program 
Cluster 5: charleston flooding ravaged islands ocracoke 
Cluster 6: johnsons proposals pm backstop benjamin 
Cluster 7: ukrainian volodymyr zelensky aides impeach 
Cluster 8: funded manhattan dropped freed hughes 
Cluster 9: delhi plc gains milan boeing 
Cluster 10: suffered born previous grew boston 
Cluster 11: exercise able pixel probably netflix 
Cluster 12: indictment afghan german singapore islamic 
Cluster 13: french alleged dissident al lawsuit 
Cluster 14: tips likes deals someone carmichael 
Cluster 15: tournament finished madrid sundays winning 
Cluster 16: agree avoid bloc landmark imran 
Cluster 17: murdering neighbor girl stabbing convicted 
Cluster 18: appearances mcavoy anoth

### Evaluate top terms per cluster (based on word frequencies)

In [17]:
# Describe each cluster by its five most frequent tokens (with their counts).
# The number of clusters is taken from the fitted centroids instead of a
# hard-coded 50, so this cell follows whatever k the clustering cell used.
for i in range(len(clustering.cluster_centers_)):
    # Space-joined token strings of the documents assigned to cluster i.
    cluster_tokens = df_clusters.loc[df_clusters["cluster"] == i, "tokens"]
    # Merge them into one string and split again to count individual tokens.
    most_frequent = Counter(" ".join(cluster_tokens).split()).most_common(5)
    tokens_per_cluster = "".join(f"{term}({count}) " for term, count in most_frequent)
    print(f"Cluster {i}: {tokens_per_cluster}")

Cluster 0: minister(195) brexit(148) prime(137) deal(98) election(94) 
Cluster 1: trump(242) president(188) donald(108) us(95) trumps(77) 
Cluster 2: business(168) insider(90) story(78) company(64) intelligence(57) 
Cluster 3: mugabe(88) robert(69) former(66) president(61) year(60) 
Cluster 4: us(156) company(89) world(85) business(72) market(70) 
Cluster 5: hurricane(194) dorian(183) bahamas(124) storm(74) carolina(39) 
Cluster 6: johnson(232) brexit(224) minister(178) boris(176) prime(171) 
Cluster 7: trump(412) president(356) donald(178) us(153) house(117) 
Cluster 8: us(116) said(113) people(91) two(69) york(50) 
Cluster 9: reuters(166) said(132) company(106) million(104) billion(97) 
Cluster 10: year(131) two(72) one(68) first(60) last(59) 
Cluster 11: video(73) like(68) find(55) people(54) app(52) 
Cluster 12: said(125) court(109) state(76) says(75) former(57) 
Cluster 13: said(93) us(67) police(55) killed(47) officials(40) 
Cluster 14: video(227) world(225) find(219) national(21

### Retrieve most representative documents (based on clusters' centroids)

In [18]:
# Cluster to inspect in this cell and the next one.
test_cluster = 29

# Euclidean distance from every document vector to the chosen centroid.
# NOTE: the ranking covers all documents, not only those labelled `test_cluster`.
centroid = clustering.cluster_centers_[test_cluster]
distances = np.linalg.norm(vectorized_docs - centroid, axis=1)

# Document indices ordered from nearest to farthest; print the ten nearest texts.
most_representative_docs = np.argsort(distances)
for doc_idx in most_representative_docs[:10]:
    print(docs[doc_idx], "-------------", sep="\n")

Dorian, Comey and Debra Messing: What Trump tweeted on Labor Day weekend | President Donald Trump axed his visit to Poland over the weekend to monitor Hurricane Dorian from Camp David with emergency management staff, but if the President's more than 120 tweets are any indication, he had more than just the storm on his mind. | Washington (CNN)President Donald Trump axed his visit to Poland over the weekend to monitor Hurricane Dorian from Camp David with emergency management staff, but if the President's more than 120 tweets are any indication, he had more than just the storm on hi… [+3027 chars]
-------------
Ross Must Resign If Report He Threatened NOAA Officials Is True: Democrat | As President Donald Trump claimed Hurricane Dorian could hit Alabama, the National Weather Service tweeted to correct the rumors. | Commerce Secretary Wilbur Ross is facing calls to resign over a report alleging that he threatened to fire top officials at NOAA for a tweet disputing President Donald Trump's

### Retrieve a random sample of documents for a given cluster

In [19]:
# Print up to ten randomly chosen documents assigned to `test_cluster`.
cluster_docs = df_clusters.query(f"cluster == {test_cluster}")
# DataFrame.sample raises ValueError when n exceeds the number of rows, so cap
# the sample size for clusters that hold fewer than ten documents.
for text in cluster_docs.sample(n=min(10, len(cluster_docs)))["text"]:
    print(text)
    print("-------------")

Trump tweeted 122 times with weather updates and attacks on the media from his Virginia golf course as Hurricane Dorian pummeled the Caribbean | President Donald Trump tweeted prolifically over Labor Day weekend as Hurricane Dorian pummeled the Bahamas, blasting out frequent updates on the weather to his 64 million followers. According to a report in The New York Times, Trump tweeted 122 times in tota… | President Donald Trump tweeted prolifically over Labor Day weekend as Hurricane Dorian pummeled the Bahamas, blasting out frequent updates on the weather mixed in with his usual attacks on the media. 
 Trump tweeted 122 times in total from his golf course in… [+3451 chars]
-------------
The Lessons Florida Has Learned From Past Hurricanes - The New York Times | The Lessons Florida Has Learned From Past Hurricanes The New York Times Trump claimed Dorian could hit Alabama -- even after weather service refuted it CNN White House declares state of emergency in Georgia ahead of Dorian | Th