## Import required libraries

In [1]:
import os
import re
import string
from collections import Counter
from string import punctuation
from time import time

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from tqdm.notebook import tqdm

from ds_utils.config import set_display_options
from ds_utils.data import NEWS_DATA
from ds_utils.functions import vectorize

nltk.download("stopwords")
nltk.download("punkt")
set_display_options()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Data

In [2]:
df = pd.read_csv(NEWS_DATA)

In [3]:
df.shape

(10437, 15)

In [4]:
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

In [5]:
df.sample(1).T

Unnamed: 0,1480
Unnamed: 0,1480
source_id,al-jazeera-english
source_name,Al Jazeera English
author,Al Jazeera
title,Hurricane Dorian set to make landfall again
description,This slow moving and powerful storm is set to strike the US and hurricane warnings remain in place.
url,https://www.aljazeera.com/news/2019/09/hurricane-dorian-set-landfall-190905085536676.html
url_to_image,https://www.aljazeera.com/mritems/Images/2019/9/5/8ba8a27872624dcd9120e60a22a0358f_18.jpg
published_at,2019-09-05T11:17:38Z
content,"Hurricane Dorian continued to move north early on Thursday, running parallel to the coast of the southeast United States with a turn to the northeast expected later in the day.\r\nIt has increased in strength and is now a Category 3 storm on the Saffir Simpson … [+1713 chars]"


## Fill missing values

In [6]:
df.isna().mean()

Unnamed: 0                        0.00
source_id                         0.00
source_name                       0.00
author                            0.10
title                             0.00
description                       0.00
url                               0.00
url_to_image                      0.06
published_at                      0.00
content                           0.12
top_article                       0.00
engagement_reaction_count         0.01
engagement_comment_count          0.01
engagement_share_count            0.01
engagement_comment_plugin_count   0.01
dtype: float64

In [7]:
df["content"] = df["content"].fillna("")

## Generate tokens

In [8]:
df.sample(1).apply(lambda x: x["title"] + " | " + x["description"] + " | " + x["content"], axis=1).values[0]

'NRA Sues San Francisco Over Being Labeled \'Domestic Terrorist Organization\' | The NRA, which includes millions of Americans, says it was targeted and unlawfully "blacklisted" by San Francisco. | Less than a week after the city of San Francisco called the National Rifle Association a "domestic terrorist organization," the NRA has fired back with a lawsuit.\r\nThe NRA is suing both the city and county of San Francisco, and its Board of Supervisors, for r… [+2340 chars]'

In [9]:
stop_words = set(stopwords.words("english") + ["news", "new", "top"])

def generate_tokens(text, tokenizer=word_tokenize, stop_words=stop_words):
    text = str(text).lower() # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text) # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text) # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text) # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text) # Replace dash between words
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) # Remove punctuation
    
    tokens = tokenizer(text) # Get tokens from text
    tokens = [t for t in tokens if not t in stop_words] # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens] # Remove digits
    tokens = [t for t in tokens if len(t) > 1] # Remove short tokens
    return tokens

for _ in range(5):
    sample_text = df.sample(1).apply(lambda x: x["title"] + " | " + x["description"], axis=1).values[0]
    print(f"SAMPLE TEXT: {sample_text}")
    print(f"TOKENS: {generate_tokens(sample_text)}")
    print(f"------")

SAMPLE TEXT: Facebook Faces Antitrust Investigation by State Attorneys General | Regulators in eight states and the District of Columbia are moving forward with an antitrust investigation into Facebook, New York’s attorney general said.
TOKENS: ['facebook', 'faces', 'antitrust', 'investigation', 'state', 'attorneys', 'general', 'regulators', 'eight', 'states', 'district', 'columbia', 'moving', 'forward', 'antitrust', 'investigation', 'facebook', 'york', 'attorney', 'general', 'said']
------
SAMPLE TEXT: Opioid Defendants Seek to Disqualify Judge Overseeing 2,300 Cases | A Hail Mary pass by drug distributors and pharmacy chains on the eve of a landmark federal trial argues that the judge already believes that defendants should pay.
TOKENS: ['opioid', 'defendants', 'seek', 'disqualify', 'judge', 'overseeing', 'cases', 'hail', 'mary', 'pass', 'drug', 'distributors', 'pharmacy', 'chains', 'eve', 'landmark', 'federal', 'trial', 'argues', 'judge', 'already', 'believes', 'defendants', 'pay']


In [10]:
text_columns = ["title", "description", "content"]

for col in text_columns:
    df[col] = df[col].astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda x: ' | '.join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: generate_tokens(x))

# Remove duplicated after preprocessing
_, idx = np.unique(df["tokens"], return_index=True)
df_proc = df.iloc[idx, :]

# Remove empty values
df_proc = df_proc.loc[df_proc.tokens.map(lambda x: len(x) > 0)]

df.shape, df_proc.shape

((10437, 17), (9882, 17))

## Review vocabulary

In [11]:
docs = df_proc["text"].values
tokenized_docs = df_proc["tokens"].values
vocab = Counter()
for token in tokenized_docs:
    vocab.update(token)

In [12]:
len(vocab)

32454

In [13]:
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Train Word2Vec model

In [14]:
word2vec_size = 100
model = Word2Vec(sentences=tokenized_docs, vector_size=word2vec_size, workers=4, seed=42)

In [70]:
model.wv.most_similar("trump")

[('trumps', 0.9858574271202087),
 ('president', 0.9813684821128845),
 ('donald', 0.9601849317550659),
 ('breitbart', 0.9439398050308228),
 ('administration', 0.9402869939804077),
 ('impeachment', 0.9304977655410767),
 ('avlon', 0.9246295690536499),
 ('inquiry', 0.9182679653167725),
 ('whistleblower', 0.9173984527587891),
 ('pences', 0.9152600765228271)]

In [73]:
model.wv.most_similar("facebook")

[('chat', 0.9477117657661438),
 ('messenger', 0.9265488982200623),
 ('google', 0.9085774421691895),
 ('find', 0.9003002643585205),
 ('gambling', 0.8879637122154236),
 ('online', 0.8667547702789307),
 ('communicate', 0.8389983177185059),
 ('interviews', 0.8367419242858887),
 ('commissions', 0.8337017893791199),
 ('whats', 0.8254891037940979)]

## Generate vectors from documents

In [67]:
vectorized_docs = vectorize(tokenized_docs, model=model, strategy="average")
len(vectorized_docs), len(vectorized_docs[0])

(9882, 100)

## Choose number of clusters

In [68]:
def generate_clusters(X, k, mb=500, random_state=42):
    clustering = MiniBatchKMeans(n_clusters=k, batch_size=mb, random_state=random_state)
    cluster_labels = clustering.fit_predict(X)
    print(f"For n_clusters = {k}")
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(f"The average Silhouette_score is: {silhouette_avg:.2f}")
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    for i in range(k):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        print(f"    Silhoute values for cluster {i}: "
        f"Size:{ith_cluster_silhouette_values.shape[0]}"
        f"| Min:{ith_cluster_silhouette_values.min():.2f}"
        f"| Avg:{ith_cluster_silhouette_values.mean():.2f}"
        f"| Max: {ith_cluster_silhouette_values.max():.2f}")
    try:
        print(f"The Inertia is :{clustering.inertia_}")
        distorsions.append(clustering.inertia_)
    except:
        pass
    return clustering, cluster_labels

In [69]:
distorsions = []
for k in tqdm(range(2, 25)):
    generate_clusters(vectorized_docs, k)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=23.0), HTML(value='')))

For n_clusters = 2
The average Silhouette_score is: 0.25
    Silhoute values for cluster 0: Size:3080| Min:-0.16| Avg:0.11| Max: 0.33
    Silhoute values for cluster 1: Size:6802| Min:0.03| Avg:0.31| Max: 0.48
The Inertia is :15523.247198073519
For n_clusters = 3
The average Silhouette_score is: 0.21
    Silhoute values for cluster 0: Size:2614| Min:-0.21| Avg:0.09| Max: 0.33
    Silhoute values for cluster 1: Size:2376| Min:-0.19| Avg:0.09| Max: 0.33
    Silhoute values for cluster 2: Size:4892| Min:0.02| Avg:0.33| Max: 0.49
The Inertia is :13534.303469138087
For n_clusters = 4
The average Silhouette_score is: 0.24
    Silhoute values for cluster 0: Size:1651| Min:0.01| Avg:0.22| Max: 0.44
    Silhoute values for cluster 1: Size:5593| Min:-0.04| Avg:0.26| Max: 0.46
    Silhoute values for cluster 2: Size:2142| Min:-0.10| Avg:0.17| Max: 0.41
    Silhoute values for cluster 3: Size:496| Min:0.01| Avg:0.39| Max: 0.59
The Inertia is :11808.402813733965
For n_clusters = 5
The average Silho

## Analyze generated clusters

In [81]:
clustering, cluster_labels = generate_clusters(vectorized_docs, 7)

For n_clusters = 7
The average Silhouette_score is: 0.22
    Silhoute values for cluster 0: Size:3054| Min:0.00| Avg:0.28| Max: 0.46
    Silhoute values for cluster 1: Size:477| Min:-0.06| Avg:0.36| Max: 0.58
    Silhoute values for cluster 2: Size:302| Min:-0.07| Avg:0.40| Max: 0.62
    Silhoute values for cluster 3: Size:3329| Min:-0.16| Avg:0.13| Max: 0.38
    Silhoute values for cluster 4: Size:1240| Min:-0.15| Avg:0.14| Max: 0.39
    Silhoute values for cluster 5: Size:949| Min:-0.10| Avg:0.24| Max: 0.49
    Silhoute values for cluster 6: Size:531| Min:-0.12| Avg:0.29| Max: 0.48
The Inertia is :7752.106607012318


In [82]:
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})

### Most frequent tokens

In [115]:
test_cluster = 2
Counter(" ".join(df_clusters.query(f"cluster == {test_cluster}")["tokens"]).split()).most_common(50)

[('hurricane', 638),
 ('dorian', 569),
 ('bahamas', 292),
 ('storm', 251),
 ('carolina', 103),
 ('north', 100),
 ('us', 94),
 ('trump', 90),
 ('coast', 87),
 ('category', 75),
 ('people', 75),
 ('winds', 73),
 ('tropical', 71),
 ('alabama', 67),
 ('hit', 67),
 ('national', 67),
 ('florida', 64),
 ('president', 64),
 ('island', 59),
 ('islands', 55),
 ('dorians', 53),
 ('expected', 51),
 ('could', 48),
 ('south', 48),
 ('said', 47),
 ('landfall', 46),
 ('humberto', 45),
 ('donald', 45),
 ('weather', 43),
 ('rain', 41),
 ('one', 39),
 ('reuters', 37),
 ('heavy', 36),
 ('parts', 36),
 ('homes', 35),
 ('least', 35),
 ('officials', 35),
 ('cbs', 34),
 ('death', 34),
 ('playing', 34),
 ('days', 32),
 ('thousands', 32),
 ('eye', 32),
 ('still', 31),
 ('toll', 31),
 ('path', 31),
 ('carolinas', 31),
 ('world', 30),
 ('thursday', 30),
 ('damage', 28)]

### Most representative documents

In [117]:
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
for d in most_representative_docs[:10]:
    print(docs[d])
    print("-------------")

Live updates: Hurricane Dorian heads for the US - CNN | Hurricane Dorian is tracking its way north, near Florida's east coast. Follow here for the latest forecasts and news updates. | Floridians are boarding up their homes and businesses and leaving behind messages for Hurricane Dorian.
"Dorian Olive U 2 go back 2 sea," reads the board on Olive U Mediterranean Grill in Palm Beach Gardens, Florida.
The owner, Abdul, told CNN that staff we… [+380 chars]
-------------
North Carolina's outer banks prepare for a potential direct hit from Hurricane Dorian | North Carolina's outer banks are dangerously exposed to Hurricane Dorian's wind and the potential for a storm surge, and residents are bracing for a possible direct hit. The storm has already caused tornados, including one caught on a doorbell camera. Omar Vi… | 
-------------
Hurricane Dorian takes aim at the Carolinas as death toll rises in the Bahamas | Water levels were rising along South Carolina's coast early Thursday morning as Hur

### Most representative tokens

In [118]:
model.wv.most_similar(positive=[clustering.cluster_centers_[test_cluster]], topn=10)

[('carolinas', 0.9925810694694519),
 ('decimated', 0.9917576313018799),
 ('ravaged', 0.9912359714508057),
 ('islands', 0.9912341833114624),
 ('charleston', 0.9891331195831299),
 ('category', 0.9891209602355957),
 ('tornadoes', 0.9884063601493835),
 ('trajectory', 0.9877771139144897),
 ('floridas', 0.987080454826355),
 ('outer', 0.9845644235610962)]

### Random sample of documents

In [119]:
for i,t in enumerate(df_clusters.query(f"cluster == {test_cluster}").sample(10).iterrows()):
    print(t[1]["text"])
    print("-------------")

Hurricane Dorian Path Map Update: Category 3 Storm's Slow Movement Will Make It More Destructive Say Experts | Experts say calm conditions above the storm are keeping it stationary, meaning that it is wreaking more damage on the Bahamas, as it bears down on the U.S. east coast. | Hurricane Dorian is wreaking more havoc on the Bahamas because the calm atmosphere above it is keeping it stationary, meteorologists have warned.
In a counterintuitive analysis of the most powerful Atlantic hurricanes on record, where winds have reached up t… [+2947 chars]
-------------
Trump said he skipped a diplomatic trip to monitor Hurricane Dorian. Then he spent the weekend golfing. | President Donald Trump told reporters last week that he was canceling a highly anticipated diplomatic trip to Poland so he could monitor Hurricane Dorian from Camp David. Trump spent hours playing golf at the Trump National Golf Club in Sterling, Virginia thr… | President Donald Trump announced last week that he was canceli