## Import required libraries

In [1]:
import os
from collections import Counter
from time import time

import gensim
import numpy as np
import pandas as pd
import gensim.downloader as api
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

from ds_utils.config import set_display_options
from ds_utils.clustering import Tokenizer, load_data, clean_news_data, vectorize, mbkmeans_clusters

# Apply the project's display configuration before any output is rendered.
# NOTE(review): presumably sets pandas display options (column width, etc.);
# the helper lives in ds_utils.config — confirm there.
set_display_options()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read data

In [2]:
# Load the raw "news" dataset through the project helper; `df` is the
# unprocessed frame that the following cells inspect and then clean.
df = load_data("news")

In [3]:
# (rows, columns) of the raw frame — a quick size check before cleaning.
df.shape

(10437, 15)

In [4]:
# List the available fields of the raw frame.
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

## Clean data

In [5]:
# Clean the raw frame with the project helper; per its printed summary the
# result has fewer rows and only two columns (displayed below as `text` and
# `tokens`).
# NOTE(review): `df` is rebound from itself, so this cell is not idempotent —
# re-running it passes the already-cleaned frame back into the helper. Re-run
# the load cell first if this cell has to be executed again.
df = clean_news_data(df)

Original dataframe: (10437, 15)
Pre-processed dataframe: (9882, 2)


In [6]:
# Show one random cleaned row, transposed so the long fields are listed
# vertically. The row is drawn without a seed, so it changes on every run.
df.sample(1).T

Unnamed: 0,6557
text,"Expert: Serial murderers' victim selection ""often very specific"" | Forensic psychologist Kris Mohandie explains what drives serial killers like Michael Gargiulo | Kris Mohandie is a clinical, police and forensic psychologist with more than 30 years of experience in the psychology of violence. He has conducted extensive interviews of violent offenders, including stalkers, hostage takers, and serial murderers. He is a ""4… [+1537 chars]"
tokens,"[expert, serial, murderers, victim, selection, often, specific, forensic, psychologist, kris, mohandie, explains, drives, serial, killers, like, michael, gargiulo, kris, mohandie, clinical, police, forensic, psychologist, years, experience, psychology, violence, conducted, extensive, interviews, violent, offenders, including, stalkers, hostage, takers, serial, murderers]"


## Review tokens and vocabulary

### Tokens

In [7]:
# Pick one random article and print its raw text next to the tokens derived
# from it, separated by a dashed line.
sample_text = df.sample(1)
sample_row = sample_text.iloc[0]
print(
    f"SAMPLE TEXT: {sample_row['text']}",
    "------",
    f"TOKENS: {sample_row['tokens']}",
    sep="\n",
)

SAMPLE TEXT: Apple to reveal streaming service prices while iPhones in 'holding pattern' until 5G | Apple Inc kicked off Tuesday its presentation where it is expected to announce pricing for its forthcoming streaming TV service as well as updates to its iPhone lineup, as the tech giant reaches a turning point where it focuses as much on services as its hard… | CUPERTINO, Calif. (Reuters) - Apple Inc kicked off Tuesday its presentation where it is expected to announce pricing for its forthcoming streaming TV service as well as updates to its iPhone lineup, as the tech giant reaches a turning point where it focuses a… [+4301 chars]
------
TOKENS: ['apple', 'reveal', 'streaming', 'service', 'prices', 'iphones', 'holding', 'pattern', '5g', 'apple', 'inc', 'kicked', 'tuesday', 'presentation', 'expected', 'announce', 'pricing', 'forthcoming', 'streaming', 'tv', 'service', 'well', 'updates', 'iphone', 'lineup', 'tech', 'giant', 'reaches', 'turning', 'point', 'focuses', 'much', 'services', 'cu

### Vocabulary

In [8]:
# Raw article texts and their token lists, as parallel arrays.
docs, tokenized_docs = df["text"].values, df["tokens"].values

# Corpus-wide term frequencies: count every token of every document.
vocab = Counter(term for doc_tokens in tokenized_docs for term in doc_tokens)

In [9]:
# Number of distinct tokens in the corpus.
len(vocab)

32454

In [10]:
# The ten most frequent tokens with their corpus-wide counts.
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Load pretrained FastText

In [11]:
# Load the pretrained "fasttext-wiki-news-subwords-300" vectors through
# gensim's downloader API (`api` is `gensim.downloader`).
# NOTE(review): the downloader presumably fetches and caches the model on first
# use, so this cell may be slow and need network access — confirm.
model = api.load("fasttext-wiki-news-subwords-300")

In [13]:
# Sanity-check the embeddings with a frequent corpus term (lowercase, like the
# corpus tokens). The vectors were not trained on this corpus, so the nearest
# neighbours reflect the pretrained model's sense of the word.
model.most_similar("trump")

[('trumps', 0.8457011580467224),
 ('trumping', 0.7876768708229065),
 ('non-trump', 0.7490020394325256),
 ('trumped', 0.7124733328819275),
 ('notrump', 0.6544546484947205),
 ('supercede', 0.6326021552085876),
 ('overrule', 0.6288058161735535),
 ('no-trump', 0.627895176410675),
 ('override', 0.6258442401885986),
 ('supersede', 0.6115216016769409)]

In [14]:
# Second sanity check: nearest neighbours of another lowercase query term.
model.most_similar("facebook")

[('facebook.', 0.8114862442016602),
 ('facebooks', 0.7959319949150085),
 ('Facebook', 0.7885890007019043),
 ('twitter', 0.7708379030227661),
 ('facebook.com', 0.7622057199478149),
 ('facebooking', 0.737972617149353),
 ('non-Facebook', 0.7352786064147949),
 ('instagram', 0.7332959771156311),
 ('Facebook.', 0.7232986688613892),
 ('myspace', 0.7073581218719482)]

## Generate vectors from documents

In [16]:
# Turn each tokenized document into one fixed-length vector using the
# pretrained embeddings and the project's "min-max" strategy.
# NOTE(review): the recorded vector length is 600 for a 300-dimensional model,
# so "min-max" presumably concatenates element-wise min and max over a
# document's word vectors — confirm in ds_utils.clustering.vectorize.
vectorized_docs = vectorize(tokenized_docs, model=model, strategy="min-max")
# (number of documents, vector length)
len(vectorized_docs), len(vectorized_docs[0])

(9882, 600)

## Generate and analyze clusters

In [17]:
# Cluster the document vectors into 50 groups with the project helper, which
# returns the fitted model and one label per document and prints per-cluster
# silhouette statistics.
clustering, cluster_labels = mbkmeans_clusters(
    vectorized_docs,
    50,
    print_silhouette_values=True,
)

# One row per document: raw text, its tokens re-joined into a single
# space-separated string, and the assigned cluster label.
joined_tokens = list(map(" ".join, tokenized_docs))
df_clusters = pd.DataFrame(
    {"text": docs, "tokens": joined_tokens, "cluster": cluster_labels}
)

For n_clusters = 50
Silhouette coefficient: 0.03
Inertia:6280.486664220623
Silhouette values:
    Cluster 22: Size:54 | Avg:0.20 | Min:0.06 | Max: 0.28
    Cluster 5: Size:2 | Avg:0.19 | Min:0.17 | Max: 0.22
    Cluster 23: Size:74 | Avg:0.17 | Min:0.04 | Max: 0.22
    Cluster 47: Size:55 | Avg:0.16 | Min:0.07 | Max: 0.22
    Cluster 34: Size:48 | Avg:0.16 | Min:-0.00 | Max: 0.23
    Cluster 19: Size:172 | Avg:0.14 | Min:-0.05 | Max: 0.23
    Cluster 45: Size:27 | Avg:0.14 | Min:-0.01 | Max: 0.20
    Cluster 9: Size:94 | Avg:0.13 | Min:0.05 | Max: 0.20
    Cluster 42: Size:73 | Avg:0.12 | Min:-0.02 | Max: 0.17
    Cluster 24: Size:285 | Avg:0.12 | Min:0.03 | Max: 0.19
    Cluster 16: Size:71 | Avg:0.11 | Min:0.01 | Max: 0.15
    Cluster 8: Size:237 | Avg:0.11 | Min:0.00 | Max: 0.16
    Cluster 21: Size:167 | Avg:0.10 | Min:-0.06 | Max: 0.17
    Cluster 18: Size:131 | Avg:0.10 | Min:-0.04 | Max: 0.16
    Cluster 2: Size:135 | Avg:0.10 | Min:-0.03 | Max: 0.16
    Cluster 36: Size:208 | A

### Evaluate top terms per cluster (based on word frequencies)

In [19]:
# Print the five most frequent tokens of every cluster as "term(count)" pairs.
# The number of clusters is taken from the fitted model rather than hard-coded,
# so this cell stays in sync when the cluster count used for fitting changes.
n_clusters = len(clustering.cluster_centers_)
for i in range(n_clusters):
    cluster_tokens = df_clusters.loc[df_clusters["cluster"] == i, "tokens"]
    # "tokens" holds space-joined strings, so split each document back into
    # individual terms before counting.
    term_counts = Counter(term for doc in cluster_tokens for term in doc.split())
    # Each pair keeps its trailing space, matching the original output format.
    tokens_per_cluster = "".join(
        f"{term}({count}) " for term, count in term_counts.most_common(5)
    )
    print(f"Cluster {i}: {tokens_per_cluster}")

Cluster 0: us(69) et(26) trump(26) president(19) economy(18) 
Cluster 1: trump(44) president(39) cbsn(24) first(23) hurricane(23) 
Cluster 2: cup(286) world(231) rugby(85) japan(64) team(59) 
Cluster 3: eu(3) finance(3) ministers(3) fiscal(3) rules(3) 
Cluster 4: two(234) year(173) one(138) million(93) people(88) 
Cluster 5: million(4) capital(4) anadarko(4) petroleum(4) avianca(3) 
Cluster 6: said(66) police(63) first(57) one(56) year(53) 
Cluster 7: us(957) trump(233) president(181) said(176) reuters(139) 
Cluster 8: mr(307) said(114) trump(104) president(72) johnson(64) 
Cluster 9: al(132) bin(61) laden(48) us(47) said(44) 
Cluster 10: eu(219) brexit(150) european(113) johnson(83) deal(83) 
Cluster 11: tv(204) apple(35) watch(33) season(33) first(32) 
Cluster 12: could(179) would(142) said(122) trump(93) president(78) 
Cluster 13: said(84) year(80) two(62) iphone(60) one(56) 
Cluster 14: mp(50) ap(45) johnson(35) minister(32) says(30) 
Cluster 15: september(35) year(34) first(31) tr

### Retrieve most representative documents (based on clusters' centroids)

In [24]:
# Cluster to inspect in this cell and in the sampling cell that follows.
test_cluster = 22

# Euclidean distance from every document vector to the chosen centroid.
# All documents are ranked, not only those assigned to `test_cluster`.
centroid = clustering.cluster_centers_[test_cluster]
distances_to_centroid = np.linalg.norm(vectorized_docs - centroid, axis=1)
most_representative_docs = np.argsort(distances_to_centroid)

# Show the ten documents closest to the centroid.
for doc_idx in most_representative_docs[:10]:
    print(docs[doc_idx], "-------------", sep="\n")

El Paso shooting: Prosecutor plans to pursue death penalty after capital murder indictment | The man accused of opening fire in an El Paso, Texas, Walmart, killing 22 people and wounding several others, has been indicted on a capital murder charge, the El Paso County District Attorney's Office said following the grand jury's Thursday decision. | (CNN)The man accused of opening fire in an El Paso, Texas, Walmart, killing 22 people and wounding several others, has been indicted on a capital murder charge, the El Paso County District Attorney's Office said following the grand jury's Thursday decision. 
… [+1979 chars]
-------------
Man accused of killing 22 in El Paso indicted on murder charges | Texas grand jury indicts Patrick Crusius, who police say went on a shooting rampage targeting Mexicans in a Walmart. | A Texas grand jury on Thursday indicted a man accused of killing 22 people in an August shooting at Walmart in El Paso, Texas, who had told authorities he was targeting Mexicans,

### Retrieve a random sample of documents for a given cluster

In [25]:
# Print up to 10 randomly chosen documents assigned to `test_cluster`.
cluster_texts = df_clusters.query(f"cluster == {test_cluster}")["text"]
# Cap the sample size at the cluster size: sampling without replacement raises
# ValueError when more rows are requested than exist, and some clusters hold
# only a couple of documents.
sample_size = min(10, len(cluster_texts))
for text in cluster_texts.sample(sample_size):
    print(text)
    print("-------------")

Protests break out across Egypt demanding el-Sisi's resignation | Demonstrators take to the streets after Friday prayers as security forces ramp up checks in Cairo. | Protests have broken out in the south of Egypt with demonstrators calling for the departure of President Abdel Fattah el-Sisi amid a high security alert.
Following Friday prayers in the Warraq area in Giza governorate, demonstrators chanted slogans calling f… [+2097 chars]
-------------
Nearly 2000 arrested as Egypt braces for anti-Sisi protests | Egypt is bracing itself for a second weekend of protests on Friday, with authorities stepping up arrests and tightening security in major cities amid calls for a "million-man march" against President Abdel Fattah el-Sisi. Also in the programme: Brother of mur… | Egypt is bracing itself for a second weekend of protests on Friday, with authorities stepping up arrests and tightening security in major cities amid calls for a "million-man march" against President Abdel Fattah el-Sisi