## Import required libraries

In [1]:
import os
from collections import Counter
from time import time

import gensim
import numpy as np
import pandas as pd
import gensim.downloader as api
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

from ds_utils.config import set_display_options
from ds_utils.clustering import Tokenizer, load_data, clean_news_data, vectorize, mbkmeans_clusters

set_display_options()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read data

In [2]:
# Load the raw dataset through the project's load_data helper, called with the name "news".
df = load_data("news")

In [3]:
# (rows, columns) of the frame exactly as loaded.
df.shape

(10437, 15)

In [4]:
# Column names of the loaded frame, to see which fields are available.
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

## Clean data

In [5]:
# Run the project's cleaning helper on the loaded frame.
# NOTE(review): this rebinds `df`, so the frame that was passed in is no longer
# reachable under that name, and re-running this cell alone would pass the
# already-processed result back into clean_news_data.
df = clean_news_data(df)

Original dataframe: (10437, 15)
Pre-processed dataframe: (9882, 2)


In [6]:
# Show one randomly drawn row, transposed so each column is displayed as a row.
# No random_state is passed here, so the row shown can differ between runs.
df.sample(1).T

Unnamed: 0,9079
text,"To save themselves, Republicans might yet desert Trump | Upon dismissing his chief of staff in 1973, Richard Nixon said that he loved him “like my brother”. The line seems too easy until you remember that the US president had seen two of his own die young. Nixon’s bond with H R Haldeman and other colleagues fortifi… | Upon dismissing his chief of staff in 1973, Richard Nixon said that he loved him like my brother. The line seems too easy until you remember that the US president had seen two of his own die young. Nixons bond with H R Haldeman and other colleagues fortified … [+4374 chars]"
tokens,"[save, republicans, might, yet, desert, trump, upon, dismissing, chief, staff, richard, nixon, said, loved, like, brother, line, seems, easy, remember, us, president, seen, two, die, young, nixon, bond, haldeman, colleagues, upon, dismissing, chief, staff, richard, nixon, said, loved, like, brother, line, seems, easy, remember, us, president, seen, two, die, young, nixons, bond, haldeman, colleagues, fortified]"


## Review tokens and vocabulary

### Tokens

In [7]:
# Draw one random document and print its text next to its token list.
sample_text = df.sample(1)
sample_row = sample_text.iloc[0]
print(f"SAMPLE TEXT: {sample_row['text']}")
print("------")
print(f"TOKENS: {sample_row['tokens']}")

SAMPLE TEXT: Eye Opener at 8: NOAA’s lead scientist vows to investigate Dorian controversy | A look back at what we've been covering on "CBS This Morning." | 
------
TOKENS: ['eye', 'opener', 'noaa', 'lead', 'scientist', 'vows', 'investigate', 'dorian', 'controversy', 'look', 'back', 'weve', 'covering', 'cbs', 'morning']


### Vocabulary

In [8]:
# Keep the texts and their token lists as arrays, then count how often each
# token occurs across all documents.
docs = df["text"].values
tokenized_docs = df["tokens"].values
vocab = Counter(token for doc_tokens in tokenized_docs for token in doc_tokens)

In [9]:
# Number of distinct tokens counted across all documents.
len(vocab)

32454

In [10]:
# The ten most frequent tokens together with their counts.
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Load pretrained FastText

In [11]:
# Load the "fasttext-wiki-news-subwords-300" vectors through gensim's downloader API.
model = api.load("fasttext-wiki-news-subwords-300")

In [12]:
# Spot-check the pretrained vectors: words the model ranks as most similar to "trump".
model.most_similar("trump")

[('trumps', 0.8457011580467224),
 ('trumping', 0.7876768708229065),
 ('non-trump', 0.7490020394325256),
 ('trumped', 0.7124733328819275),
 ('notrump', 0.6544546484947205),
 ('supercede', 0.6326021552085876),
 ('overrule', 0.6288058161735535),
 ('no-trump', 0.627895176410675),
 ('override', 0.6258442401885986),
 ('supersede', 0.6115216016769409)]

In [13]:
# Spot-check the pretrained vectors: words the model ranks as most similar to "facebook".
model.most_similar("facebook")

[('facebook.', 0.8114862442016602),
 ('facebooks', 0.7959319949150085),
 ('Facebook', 0.7885890007019043),
 ('twitter', 0.7708379030227661),
 ('facebook.com', 0.7622057199478149),
 ('facebooking', 0.737972617149353),
 ('non-Facebook', 0.7352786064147949),
 ('instagram', 0.7332959771156311),
 ('Facebook.', 0.7232986688613892),
 ('myspace', 0.7073581218719482)]

## Generate vectors from documents

In [14]:
# Turn each tokenized document into one vector with the project's `vectorize`
# helper, using the pretrained model and the "min-max" strategy.
# NOTE(review): presumably "min-max" concatenates the element-wise minimum and
# maximum of a document's word vectors — confirm in ds_utils.clustering.
vectorized_docs = vectorize(tokenized_docs, model=model, strategy="min-max")
# Number of document vectors and the length of the first one.
len(vectorized_docs), len(vectorized_docs[0])

(9882, 600)

## Generate and analyze clusters

In [15]:
# Cluster the document vectors with the project's mbkmeans_clusters helper
# (second argument 50, silhouette values printed), then collect each document's
# text, space-joined tokens and assigned label in one frame.
clustering, cluster_labels = mbkmeans_clusters(
    vectorized_docs, 50, print_silhouette_values=True
)
joined_tokens = list(map(" ".join, tokenized_docs))
df_clusters = pd.DataFrame(
    {"text": docs, "tokens": joined_tokens, "cluster": cluster_labels}
)

For n_clusters = 50
Silhouette coefficient: 0.02
Inertia:6290.78210898176
Silhouette values:
    Cluster 3: Size:3 | Avg:0.45 | Min:0.43 | Max: 0.48
    Cluster 37: Size:9 | Avg:0.22 | Min:0.18 | Max: 0.26
    Cluster 26: Size:55 | Avg:0.19 | Min:0.04 | Max: 0.27
    Cluster 39: Size:71 | Avg:0.17 | Min:0.11 | Max: 0.22
    Cluster 43: Size:25 | Avg:0.17 | Min:0.10 | Max: 0.23
    Cluster 25: Size:31 | Avg:0.15 | Min:0.10 | Max: 0.20
    Cluster 24: Size:169 | Avg:0.14 | Min:-0.05 | Max: 0.24
    Cluster 40: Size:70 | Avg:0.13 | Min:-0.01 | Max: 0.21
    Cluster 4: Size:210 | Avg:0.13 | Min:0.05 | Max: 0.23
    Cluster 49: Size:23 | Avg:0.13 | Min:-0.03 | Max: 0.22
    Cluster 44: Size:128 | Avg:0.12 | Min:0.02 | Max: 0.20
    Cluster 12: Size:166 | Avg:0.12 | Min:-0.05 | Max: 0.16
    Cluster 10: Size:82 | Avg:0.12 | Min:0.05 | Max: 0.18
    Cluster 2: Size:89 | Avg:0.12 | Min:0.02 | Max: 0.18
    Cluster 8: Size:79 | Avg:0.10 | Min:-0.06 | Max: 0.14
    Cluster 11: Size:268 | Avg:0.1

### Evaluate top terms per cluster (based on word frequencies)

In [16]:
# Print one line per cluster listing its five most frequent tokens and their counts.
# The number of clusters is taken from the fitted centroids instead of repeating
# the literal passed when clustering, so this cell stays correct if that value changes.
for i in range(len(clustering.cluster_centers_)):
    # Space-joined token strings of the documents labelled with cluster i.
    cluster_token_strings = df_clusters.loc[df_clusters["cluster"] == i, "tokens"]
    most_frequent = Counter(
        token
        for doc_tokens in cluster_token_strings
        for token in doc_tokens.split()
    ).most_common(5)
    # Each entry is rendered as "token(count) ", including the trailing space.
    tokens_per_cluster = "".join(
        f"{token}({count}) " for token, count in most_frequent
    )
    print(f"Cluster {i}: {tokens_per_cluster}")

Cluster 0: joe(185) biden(117) president(88) democratic(57) former(56) 
Cluster 1: one(117) first(105) said(99) two(89) man(87) 
Cluster 2: us(99) world(91) chat(89) facebook(89) messenger(89) 
Cluster 3: us(23) delisting(9) chinese(9) source(9) possibility(8) 
Cluster 4: national(219) abc(214) video(213) world(210) online(210) 
Cluster 5: de(81) la(38) us(28) france(27) said(25) 
Cluster 6: england(4) ashes(4) australia(3) name(2) unchanged(2) 
Cluster 7: ago(248) years(175) year(68) two(65) first(43) 
Cluster 8: ex(98) former(65) mugabe(25) death(24) murder(23) 
Cluster 9: york(180) trump(91) president(66) said(61) donald(48) 
Cluster 10: cup(186) world(133) win(39) rugby(38) japan(33) 
Cluster 11: mr(304) said(133) trump(102) president(72) johnson(64) 
Cluster 12: uk(241) brexit(126) minister(85) prime(81) johnson(81) 
Cluster 13: said(227) reuters(131) president(110) says(106) minister(103) 
Cluster 14: president(38) trump(37) cbsn(25) says(24) first(23) 
Cluster 15: us(313) trade(

### Retrieve most representative documents (based on clusters' centroids)

In [17]:
# Rank documents by Euclidean distance to the centroid of `test_cluster` and
# print the ten closest ones.
# NOTE(review): the ranking covers every document vector, not only those
# labelled `test_cluster` — confirm that is intended.
test_cluster = 41
centroid = clustering.cluster_centers_[test_cluster]
distances_to_centroid = np.linalg.norm(vectorized_docs - centroid, axis=1)
most_representative_docs = np.argsort(distances_to_centroid)
for doc_idx in most_representative_docs[:10]:
    print(docs[doc_idx])
    print("-------------")
    print("-------------")

Finland School Stabbing Leaves 1 Dead and 10 Injured, Police Say | The attack happened at a vocational college in a shopping center in the city of Kuopio, the police said, identifying the suspect as a student. | A stabbing attack on Tuesday at a vocational college in a shopping center in Finland killed one person and injured 10 others, the police said. 
The suspect in the attack is a Finnish student at the college, Savo Vocational College, which is at the Herman sho… [+799 chars]
-------------
Three killed in renewed fighting in southern Tripoli witness | At least three fighters aligned with Libya's U.N.-backed Government of National Accord (GNA) were killed in an offensive on Saturday aimed at pushing back eastern forces led by commander Khalifa Haftar, a witness said. | TRIPOLI (Reuters) - At least three fighters aligned with Libya’s U.N.-backed Government of National Accord (GNA) were killed in an offensive on Saturday aimed at pushing back eastern forces led by commander Khalifa Ha

### Retrieve a random sample of documents for a given cluster

In [18]:
# Print up to 10 randomly chosen documents assigned to `test_cluster`.
cluster_docs = df_clusters.query(f"cluster == {test_cluster}")
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the cluster's size.
sample_size = min(10, len(cluster_docs))
for text in cluster_docs.sample(sample_size)["text"]:
    print(text)
    print("-------------")

Female cyclists complain of harassment on the roads by young men | An Taisce launches campaign to encourage girls to cycle to school every day | Fewer than one in 250 girls cycle to school every day, according to An Taisce, and those who do cycle say they are dealing with harassment on the roads from drivers and young men. 
The environmental charity has launched a campaign #andshecycles through its G… [+1240 chars]
-------------
NOAA defends Trump's claims about Hurricane Dorian and Alabama, one day after he reportedly personally directed a Coast Guard admiral to back him up | The National Oceanic and Atmospheric Administration (NOAA) defended President Donald Trump and its earlier assessment of Hurricane Dorian, and downplayed a contradicting statement from the National Weather Service's Alabama location. NOAA noted in its recent … | The National Oceanic and Atmospheric Administration (NOAA) defended President Donald Trump and its earlier assessment of Hurricane Dorian, and downplayed