## Import required libraries

In [1]:
import os
import re
import string
from collections import Counter
from string import punctuation
from time import time

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from tqdm.notebook import tqdm

from ds_utils.config import set_display_options
from ds_utils.data import NEWS_DATA
from ds_utils.functions import vectorize

nltk.download("stopwords")
nltk.download("punkt")
set_display_options()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read data

In [2]:
df = pd.read_csv(NEWS_DATA)

In [3]:
df.shape

(10437, 15)

In [4]:
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

In [5]:
df.sample(1).T

Unnamed: 0,163
Unnamed: 0,163
source_id,business-insider
source_name,Business Insider
author,Alicia Betz
title,This mineral-based sunscreen doesn't leave white residue behind — it protects my family's fair skin and is reef-friendly
description,"Mineral-based sunscreens that are reef-safe are becoming an increasingly popular sunscreen choice. Goddess Garden Organics offers four sunscreen formulations: daily, sport, kids, and baby. They also offer three delivery options: lotion, spray, and stick. The …"
url,https://www.businessinsider.com/goddess-garden-organics-sunscreen-review
url_to_image,https://amp.businessinsider.com/images/5d6941072e22af3d336e6615-960-480.jpg
published_at,2019-09-03T15:45:00Z
content,"Everyone should be wearing sunscreen on a daily basis to protect their skin, but I have pretty fair skin, so sunscreen is also a must for me if I don't want to look like a lobster after a day in the sun. My husband has even fairer skin, and it doesn't take mu… [+4705 chars]"


## Fill missing values

In [6]:
df.isna().mean()

Unnamed: 0                        0.00
source_id                         0.00
source_name                       0.00
author                            0.10
title                             0.00
description                       0.00
url                               0.00
url_to_image                      0.06
published_at                      0.00
content                           0.12
top_article                       0.00
engagement_reaction_count         0.01
engagement_comment_count          0.01
engagement_share_count            0.01
engagement_comment_plugin_count   0.01
dtype: float64

In [7]:
df["content"] = df["content"].fillna("")

## Generate tokens

In [8]:
df.sample(1).apply(lambda x: x["title"] + " | " + x["description"] + " | " + x["content"], axis=1).values[0]

'Curlew in danger of extinction in Ireland, says conservationist | Task force recommends series of measures after 96% decline in numbers since 1980s | Ireland should restrict afforestation, recreate peatlands and wetlands and safeguard bogs to protect the endangered curlew bird, a report has recommended.\r\nOn Friday, the curlew task force published a series of recommendations aimed at promoting key policies … [+2365 chars]'

In [9]:
stop_words = set(stopwords.words("english") + ["news", "new", "top"])

def generate_tokens(text, tokenizer=word_tokenize, stop_words=stop_words):
    text = str(text).lower() # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text) # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text) # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text) # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text) # Replace dash between words
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) # Remove punctuation
    
    tokens = tokenizer(text) # Get tokens from text
    tokens = [t for t in tokens if not t in stop_words] # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens] # Remove digits
    tokens = [t for t in tokens if len(t) > 1] # Remove short tokens
    return tokens

for _ in range(5):
    sample_text = df.sample(1).apply(lambda x: x["title"] + " | " + x["description"], axis=1).values[0]
    print(f"SAMPLE TEXT: {sample_text}")
    print(f"TOKENS: {generate_tokens(sample_text)}")
    print(f"------")

SAMPLE TEXT: Some Royal Caribbean, Disney, and Norwegian Cruise ship passengers are stuck at sea as Hurricane Dorian creeps dangerously close to Florida (RCL, CCL) | Royal Caribbean International, Disney Cruise Line, and Norwegian Cruise Line have extended the itineraries for some cruises that began in Florida as Hurricane Dorian nears the state. The cruise lines have canceled or changed the itineraries for a number of ot…
TOKENS: ['royal', 'caribbean', 'disney', 'norwegian', 'cruise', 'ship', 'passengers', 'stuck', 'sea', 'hurricane', 'dorian', 'creeps', 'dangerously', 'close', 'florida', 'rcl', 'ccl', 'royal', 'caribbean', 'international', 'disney', 'cruise', 'line', 'norwegian', 'cruise', 'line', 'extended', 'itineraries', 'cruises', 'began', 'florida', 'hurricane', 'dorian', 'nears', 'state', 'cruise', 'lines', 'canceled', 'changed', 'itineraries', 'number']
------
SAMPLE TEXT: Hottest September on record in Australia's Perth city | The driest September in 42 years has residents co

In [10]:
text_columns = ["title", "description", "content"]

for col in text_columns:
    df[col] = df[col].astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda x: ' | '.join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: generate_tokens(x))

# Remove duplicated after preprocessing
_, idx = np.unique(df["tokens"], return_index=True)
df_proc = df.iloc[idx, :]

# Remove empty values
df_proc = df_proc.loc[df_proc.tokens.map(lambda x: len(x) > 0)]

df.shape, df_proc.shape

((10437, 17), (9882, 17))

## Review vocabulary

In [11]:
docs = df_proc["text"].values
tokenized_docs = df_proc["tokens"].values
vocab = Counter()
for token in tokenized_docs:
    vocab.update(token)

In [12]:
len(vocab)

32454

In [13]:
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Train Word2Vec model

In [14]:
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=4, seed=42)

In [15]:
model.wv.most_similar("trump")

[('trumps', 0.9881476759910583),
 ('president', 0.9689247012138367),
 ('donald', 0.9595009088516235),
 ('breitbart', 0.9428410530090332),
 ('administration', 0.9353697299957275),
 ('turnberry', 0.9300839304924011),
 ('pencil', 0.92219477891922),
 ('impeachment', 0.9217286705970764),
 ('biden', 0.9121127128601074),
 ('administrations', 0.9102789163589478)]

In [16]:
model.wv.most_similar("facebook")

[('chat', 0.949539303779602),
 ('messenger', 0.926665723323822),
 ('gambling', 0.90227210521698),
 ('google', 0.893943727016449),
 ('find', 0.8897424936294556),
 ('wjla', 0.8891974687576294),
 ('marketplace', 0.8577138185501099),
 ('communicate', 0.8472960591316223),
 ('analysis', 0.8444861769676208),
 ('whats', 0.8422533869743347)]

## Generate vectors from documents

In [17]:
def vectorize(list_of_docs, model, strategy):
    features = []
    size_output = model.vector_size
    embedding_dict = model

    if strategy == "min-max":
        size_output *= 2

    if hasattr(model, "wv"):
        embedding_dict = model.wv

    for tokens in list_of_docs:
        zero_vector = np.zeros(size_output)
        vectors = []
        for token in tokens:
            if token in embedding_dict:
                try:
                    vectors.append(embedding_dict[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            if strategy == "min-max":
                min_vec = vectors.min(axis=0)
                max_vec = vectors.max(axis=0)
                features.append(np.concatenate((min_vec, max_vec)))
            elif strategy == "average":
                avg_vec = vectors.mean(axis=0)
                features.append(avg_vec)
            else:
                raise ValueError(f"Aggregation strategy {strategy} does not exist!")
        else:
            features.append(zero_vector)
    return features

In [18]:
vectorized_docs = vectorize(tokenized_docs, model=model, strategy="average")
len(vectorized_docs), len(vectorized_docs[0])

(9882, 100)

## Create and analyze clusters

In [58]:
def generate_clusters(X, k, mb=500, random_state=42, print_silhoutte_values=False):
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb, random_state=random_state).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{clustering.inertia_}")

    if print_silhoutte_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            silhoutte_value = sample_silhouette_values[km.labels_ == i]
            size = ith_cluster_silhouette_values.shape[0]
            min = ith_cluster_silhouette_values.min()
            silhoutte_values.append()
            print(f"    Cluster {i}: "
            f"Size:{}"
            f"| Min:{:.2f}"
            f"| Avg:{ith_cluster_silhouette_values.mean():.2f}"
            f"| Max: {ith_cluster_silhouette_values.max():.2f}")
    return clustering, cluster_labels

In [59]:
clustering, cluster_labels = generate_clusters(vectorized_docs, 50, print_silhoutte_values=True)

For n_clusters = 50
Silhouette coefficient: 0.11
Inertia:3685.83510376604
Silhouette values:
    Cluster 0: Size:584| Min:0.01| Avg:0.20| Max: 0.41
    Cluster 1: Size:223| Min:-0.15| Avg:0.00| Max: 0.19
    Cluster 2: Size:192| Min:-0.20| Avg:0.00| Max: 0.21
    Cluster 3: Size:31| Min:0.00| Avg:0.31| Max: 0.51
    Cluster 4: Size:140| Min:-0.01| Avg:0.21| Max: 0.42
    Cluster 5: Size:160| Min:-0.01| Avg:0.17| Max: 0.37
    Cluster 6: Size:52| Min:0.04| Avg:0.38| Max: 0.58
    Cluster 7: Size:234| Min:-0.12| Avg:0.09| Max: 0.30
    Cluster 8: Size:83| Min:-0.17| Avg:0.09| Max: 0.35
    Cluster 9: Size:96| Min:-0.11| Avg:0.28| Max: 0.48
    Cluster 10: Size:221| Min:-0.11| Avg:0.09| Max: 0.31
    Cluster 11: Size:534| Min:-0.13| Avg:0.08| Max: 0.27
    Cluster 12: Size:352| Min:-0.05| Avg:0.13| Max: 0.36
    Cluster 13: Size:147| Min:-0.02| Avg:0.22| Max: 0.45
    Cluster 14: Size:213| Min:-0.09| Avg:0.08| Max: 0.26
    Cluster 15: Size:483| Min:-0.10| Avg:0.08| Max: 0.28
    Cluster 

In [45]:
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})

### Most frequent tokens

In [46]:
for i in range(50):
    tokens_per_cluster = ""
    most_frequent = Counter(" ".join(df_clusters.query(f"cluster == {i}")["tokens"]).split()).most_common(5)
    for t in most_frequent:
        tokens_per_cluster += f"{t[0]}({str(t[1])}) "
    print(f"Cluster {i}: {tokens_per_cluster}")

Cluster 0: one(93) like(83) two(53) says(52) also(50) 
Cluster 1: year(221) old(95) million(90) years(88) last(70) 
Cluster 2: us(300) said(139) states(93) reuters(88) united(85) 
Cluster 3: hurricane(42) dorian(36) find(32) world(31) bahamas(29) 
Cluster 4: johnson(188) brexit(183) minister(174) boris(174) prime(158) 
Cluster 5: trump(376) president(283) donald(179) house(114) bolton(98) 
Cluster 6: trump(119) hurricane(103) dorian(100) president(83) alabama(78) 
Cluster 7: police(221) people(162) killed(132) said(131) man(92) 
Cluster 8: facebook(100) us(68) find(58) world(57) messenger(51) 
Cluster 9: world(95) find(95) video(90) online(90) get(89) 
Cluster 10: president(185) former(131) us(80) said(72) mugabe(70) 
Cluster 11: said(152) us(112) people(86) says(76) one(74) 
Cluster 12: world(149) like(96) get(87) one(81) best(70) 
Cluster 13: world(289) cup(242) rugby(96) ireland(80) team(74) 
Cluster 14: minister(110) said(104) deal(82) says(76) european(62) 
Cluster 15: like(91) us

### Most representative documents

In [47]:
test_cluster = 16
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
for d in most_representative_docs[:10]:
    print(docs[d])
    print("-------------")

World's Largest Oil Facility Set Ablaze in Drone Attacks Claimed by Tehran-Backed Rebels | Yemen's Iran-backed Houthi rebels claimed responsibility for drone attacks at the world's largest oil processing facility in Saudi Arabia, military and government spokespersons said Saturday. | Yemen's Tehran-backed Houthi rebels claimed responsibility for drone attacks and reports of gunfire at the world's largest oil processing facility in Saudi Arabia, military and government spokespersons said Saturday.
Videos emerged online Saturday morning sh… [+2376 chars]
-------------
Drones hit two Saudi Aramco oil facilities, cause fires | Blazes at major oil facilities in Abqaiq and Khurais brought under control, interior ministry says. | Drone attacks on two Saudi Aramco factories have caused fires, according to Saudi Arabia's state media.
Citing an interior ministry spokesperson, the SPA news agency said on Saturday the blazes at the facilities in Abqaiq and Khurais were under control.
The… [+1518 c

### Most representative tokens

In [27]:
model.wv.most_similar(positive=[clustering.cluster_centers_[test_cluster]], topn=10)

[('protesters', 0.9774875044822693),
 ('unrest', 0.9710173010826111),
 ('demonstrators', 0.9702542424201965),
 ('clashes', 0.9644534587860107),
 ('protests', 0.9640728235244751),
 ('defied', 0.9640650153160095),
 ('protester', 0.9624690413475037),
 ('arrests', 0.9554653763771057),
 ('anti', 0.9465689659118652),
 ('wielding', 0.9450966715812683)]

### Random sample of documents

In [28]:
for i,t in enumerate(df_clusters.query(f"cluster == {test_cluster}").sample(10).iterrows()):
    print(t[1]["text"])
    print("-------------")

Hong Kong Police Officer Shoots Protester | A protester was shot with a live round by Hong Kong police Tuesday during widespread demonstrations against China’s National Day Celebration. Photo:CAMPUS TV/HKUSU | 
-------------
Amnesty accuses Hong Kong police of arbitrary arrests, torture | Amnesty International accused Hong Kong police on Friday of torture and other abuses in their handling of more than three months of pro-democracy protests, but the police say they have shown restraint on the street in the face of increased violence. | HONG KONG (Reuters) - Amnesty International accused Hong Kong police on Friday of torture and other abuses in their handling of more than three months of pro-democracy protests, but the police say they have shown restraint on the street in the face of increas… [+3967 chars]
-------------
'We won't give up': Hong Kong protesters defiant after shooting | Shooting of 18-year-old took place during angry demonstrations on Tuesday, prompting mass vigils, more 