## Import required libraries

In [16]:
import os
import re
import string
from collections import Counter
from string import punctuation
from time import time

import gensim
import nltk
import numpy as np
import pandas as pd
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from tqdm.notebook import tqdm

from ds_utils.config import set_display_options
from ds_utils.data import NEWS_DATA
from ds_utils.functions import vectorize

# Fetch the NLTK resources used later through stopwords.words() and word_tokenize()
nltk.download("stopwords")
nltk.download("punkt")
# Project helper imported from ds_utils.config; presumably tweaks pandas display settings — verify
set_display_options()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read data

In [17]:
# Load the raw news dataset from the location given by NEWS_DATA (imported from ds_utils.data)
df = pd.read_csv(NEWS_DATA)

In [18]:
# (rows, columns) of the raw frame
df.shape

(10437, 15)

In [19]:
# Column names available in the raw frame
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

In [20]:
# Show one randomly drawn row, transposed for readability (no random_state is passed)
df.sample(1).T

Unnamed: 0,2683
Unnamed: 0,2683
source_id,the-irish-times
source_name,The Irish Times
author,Tanya Sweeney
title,Make a Move to . . . Ringsend
description,"Despite seismic demographic changes, Ringsend retains its community spirit"
url,https://www.irishtimes.com/\t\t\t\t\t\t\t/life-and-style/homes-and-property/make-a-move-to-ringsend-1.4005265\t
url_to_image,https://www.irishtimes.com/image-creator/?id=1.4005258&origw=1440
published_at,2019-09-07T05:00:00Z
content,"Whats so good about it?\r\nHas any Dublin neighbourhood undergone such a dramatic shift in the last couple of decades? Ringsends history is fantastically rich it was famously the point where Cromwell arrived in 1649. Several centuries later, it was the turn of … [+5720 chars]"


## Fill missing values

In [21]:
# Fraction of missing values per column
df.isna().mean()

Unnamed: 0                        0.00
source_id                         0.00
source_name                       0.00
author                            0.10
title                             0.00
description                       0.00
url                               0.00
url_to_image                      0.06
published_at                      0.00
content                           0.12
top_article                       0.00
engagement_reaction_count         0.01
engagement_comment_count          0.01
engagement_share_count            0.01
engagement_comment_plugin_count   0.01
dtype: float64

In [22]:
# Replace missing article bodies with an empty string so "content" can be concatenated as text.
# Only "content" is filled here; the other columns are left untouched.
df["content"] = df["content"].fillna("")

## Generate tokens

In [23]:
# Preview one random row as a single "title | description | content" string
df.sample(1).apply(lambda x: x["title"] + " | " + x["description"] + " | " + x["content"], axis=1).values[0]

'Environment being changed at unparalled rate, warns Higgins | President urges people to mow grass less and let weeds flower to protect biodiversity | Nature, biodiversity and the contributions they made to human existence are in trouble, President Michael D Higgins has said.\r\nSpeaking at the opening of the inaugural Phoenix Park Biodiversity Festival and Honey Show in Dublin, Mr Higgins said the number of … [+2888 chars]'

In [24]:
# NLTK English stop words plus three extra words ("news", "new", "top"), stored as a set
stop_words = set(stopwords.words("english") + ["news", "new", "top"])

def generate_tokens(text, tokenizer=word_tokenize, stop_words=stop_words):
    """Normalise raw text and split it into a filtered list of tokens.

    The text is lowercased, cleaned with a fixed sequence of regex
    substitutions, tokenised with ``tokenizer`` and then stripped of stop
    words, all-digit tokens and tokens of one character or less.

    Args:
        text: Value to tokenise; it is coerced with ``str()`` first.
        tokenizer: Callable that maps a string to an iterable of tokens.
        stop_words: Container of tokens to discard.

    Returns:
        list: The tokens that survive filtering, in their original order.
    """
    # Order matters: every substitution runs on the output of the previous one.
    cleanup_steps = (
        (r"\[(.*?)\]", ""),  # drop bracketed spans such as "[+XYZ chars]"
        (r"\s+", " "),  # collapse whitespace runs into a single space
        (r"\w+…|…", ""),  # drop the ellipsis and the cut-off word before it
        (r"(?<=\w)-(?=\w)", " "),  # split words joined by a dash
        (f"[{re.escape(string.punctuation)}]", ""),  # strip string.punctuation characters
    )

    cleaned = str(text).lower()
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    # One pass: keep tokens that are not stop words, not purely digits and longer than one character
    return [
        tok
        for tok in tokenizer(cleaned)
        if tok not in stop_words and not tok.isdigit() and len(tok) > 1
    ]

# Eyeball the tokenizer on five randomly drawn "title | description" strings
for _ in range(5):
    sampled_row = df.sample(1).iloc[0]
    sample_text = sampled_row["title"] + " | " + sampled_row["description"]
    sample_tokens = generate_tokens(sample_text)
    print(f"SAMPLE TEXT: {sample_text}")
    print(f"TOKENS: {sample_tokens}")
    print("------")

SAMPLE TEXT: Michelin-starred chef: 'I'm lucky I found what I love young' | Curtis Stone's 24-seat Beverly Hills, California, restaurant, Maude, recently received a Michelin star. Find out what's behind his success.
TOKENS: ['michelin', 'starred', 'chef', 'im', 'lucky', 'found', 'love', 'young', 'curtis', 'stones', 'seat', 'beverly', 'hills', 'california', 'restaurant', 'maude', 'recently', 'received', 'michelin', 'star', 'find', 'whats', 'behind', 'success']
------
SAMPLE TEXT: EU official: Brexit negotiations will take place ‘over coming days’ following UK government proposal | Get breaking national and world news, broadcast video coverage, and exclusive interviews. Find the top news online at ABC news.
TOKENS: ['eu', 'official', 'brexit', 'negotiations', 'take', 'place', 'coming', 'days', 'following', 'uk', 'government', 'proposal', 'get', 'breaking', 'national', 'world', 'broadcast', 'video', 'coverage', 'exclusive', 'interviews', 'find', 'online', 'abc']
------
SAMPLE TEXT: NOAA a

In [25]:
text_columns = ["title", "description", "content"]

# Blank out missing values BEFORE casting: astype(str) alone would turn NaN into the
# literal string "nan", which would then survive as a token.
for col in text_columns:
    df[col] = df[col].fillna("").astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda row: " | ".join(row), axis=1)
df["tokens"] = df["text"].map(generate_tokens)

# Remove duplicates after preprocessing: keep the first row for every distinct token list.
# np.unique returns the indices in sorted-token order, so df_proc is reordered accordingly.
_, idx = np.unique(df["tokens"], return_index=True)
df_proc = df.iloc[idx, :]

# Remove rows whose token list is empty
df_proc = df_proc.loc[df_proc["tokens"].map(len) > 0]

df.shape, df_proc.shape

((10437, 17), (9882, 17))

## Review vocabulary

In [26]:
# Raw texts and their token lists, aligned row by row
docs = df_proc["text"].values
tokenized_docs = df_proc["tokens"].values
# Corpus-wide token frequencies
vocab = Counter(tok for doc_tokens in tokenized_docs for tok in doc_tokens)

In [27]:
# Number of distinct tokens in the corpus
len(vocab)

32454

In [28]:
# Ten most frequent tokens with their counts
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Load pre-trained FastText model

In [29]:
# Load the pre-trained "fasttext-wiki-news-subwords-300" vectors through gensim's downloader API
model = api.load("fasttext-wiki-news-subwords-300")

In [35]:
# Sanity-check the embeddings: nearest neighbours of a lowercase query word
model.most_similar("trump")

[('trumps', 0.8457011580467224),
 ('trumping', 0.7876768708229065),
 ('non-trump', 0.7490020394325256),
 ('trumped', 0.7124733328819275),
 ('notrump', 0.6544546484947205),
 ('supercede', 0.6326021552085876),
 ('overrule', 0.6288058161735535),
 ('no-trump', 0.627895176410675),
 ('override', 0.6258442401885986),
 ('supersede', 0.6115216016769409)]

In [36]:
# Same check for a second query word
model.most_similar("facebook")

[('facebook.', 0.8114862442016602),
 ('facebooks', 0.7959319949150085),
 ('Facebook', 0.7885890007019043),
 ('twitter', 0.7708379030227661),
 ('facebook.com', 0.7622057199478149),
 ('facebooking', 0.737972617149353),
 ('non-Facebook', 0.7352786064147949),
 ('instagram', 0.7332959771156311),
 ('Facebook.', 0.7232986688613892),
 ('myspace', 0.7073581218719482)]

## Generate vectors from documents

In [37]:
def vectorize(list_of_docs, model, strategy):
    """Turn tokenised documents into fixed-size vectors by pooling word embeddings.

    Args:
        list_of_docs: Iterable of documents, each an iterable of string tokens.
        model: Embedding lookup exposing ``vector_size`` and supporting
            ``token in model`` / ``model[token]``. If it has a ``wv``
            attribute, lookups go through ``model.wv`` instead.
        strategy: ``"average"`` for the element-wise mean of the token vectors,
            or ``"min-max"`` for the element-wise minimum concatenated with the
            element-wise maximum (twice ``vector_size`` values).

    Returns:
        list: One 1-D array per document. A document with no known token gets
        a zero vector of the output size.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names. This is
            checked up front, so an invalid strategy can no longer slip through
            when every document is empty or out of vocabulary.
    """
    if strategy not in ("average", "min-max"):
        raise ValueError(f"Aggregation strategy {strategy} does not exist!")

    # Models that wrap their vectors expose them under ``.wv``; otherwise use the model itself
    embedding_dict = model.wv if hasattr(model, "wv") else model
    size_output = model.vector_size * 2 if strategy == "min-max" else model.vector_size

    features = []
    for tokens in list_of_docs:
        vectors = []
        for token in tokens:
            # Keep the membership test: only tokens reported as known are looked up
            if token in embedding_dict:
                try:
                    vectors.append(embedding_dict[token])
                except KeyError:
                    continue

        if not vectors:
            # Fresh array per document so callers never share one zero vector
            features.append(np.zeros(size_output))
            continue

        stacked = np.asarray(vectors)
        if strategy == "min-max":
            features.append(np.concatenate((stacked.min(axis=0), stacked.max(axis=0))))
        else:
            features.append(stacked.mean(axis=0))
    return features

In [38]:
# Calls the vectorize defined in this notebook, which shadows the one imported from ds_utils.functions.
# With "min-max" each document vector is the per-dimension minimum concatenated with the maximum.
vectorized_docs = vectorize(tokenized_docs, model=model, strategy="min-max")
# Number of documents and length of one document vector
len(vectorized_docs), len(vectorized_docs[0])

(9882, 600)

## Choose number of clusters

In [39]:
def generate_clusters(X, k, mb=500, random_state=42):
    """Fit MiniBatchKMeans with ``k`` clusters on ``X`` and print quality diagnostics.

    Prints the average silhouette, then the size and silhouette min/avg/max of
    each cluster, then the inertia. If a global list named ``distorsions``
    exists, the inertia is also appended to it.

    Args:
        X: Feature vectors, one per sample, in a form scikit-learn accepts.
        k: Number of clusters.
        mb: Batch size passed to ``MiniBatchKMeans``.
        random_state: Seed passed to ``MiniBatchKMeans``.

    Returns:
        tuple: ``(clustering, cluster_labels)`` — the fitted estimator and the
        label assigned to each sample.
    """
    clustering = MiniBatchKMeans(n_clusters=k, batch_size=mb, random_state=random_state)
    cluster_labels = clustering.fit_predict(X)
    print(f"For n_clusters = {k}")

    # The average silhouette is the mean of the per-sample values, so compute the
    # (expensive, pairwise-distance based) samples once and reuse them.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    silhouette_avg = sample_silhouette_values.mean()
    print(f"The average Silhouette_score is: {silhouette_avg:.2f}")

    for i in range(k):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        if ith_cluster_silhouette_values.shape[0] == 0:
            # A centre may end up with no samples; min()/max() would raise on an empty array
            print(f"    Silhoute values for cluster {i}: Size:0")
            continue
        print(
            f"    Silhoute values for cluster {i}: "
            f"Size:{ith_cluster_silhouette_values.shape[0]}"
            f"| Min:{ith_cluster_silhouette_values.min():.2f}"
            f"| Avg:{ith_cluster_silhouette_values.mean():.2f}"
            f"| Max: {ith_cluster_silhouette_values.max():.2f}"
        )

    print(f"The Inertia is :{clustering.inertia_}")
    try:
        # Best-effort bookkeeping for elbow plots: relies on a global list created by the caller
        distorsions.append(clustering.inertia_)
    except NameError:
        # No accumulator defined in this session; nothing to record
        pass
    return clustering, cluster_labels

In [40]:
# Global accumulator that generate_clusters appends each run's inertia to
distorsions = []
# Fit and report diagnostics for k = 2..9; the return values are discarded here
for k in tqdm(range(2, 10)):
    generate_clusters(vectorized_docs, k)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))

For n_clusters = 2
The average Silhouette_score is: 0.08
    Silhoute values for cluster 0: Size:6113| Min:0.05| Avg:0.13| Max: 0.22
    Silhoute values for cluster 1: Size:3769| Min:-0.08| Avg:-0.01| Max: 0.11
The Inertia is :8074.997826727096
For n_clusters = 3
The average Silhouette_score is: 0.07
    Silhoute values for cluster 0: Size:4477| Min:0.04| Avg:0.12| Max: 0.22
    Silhoute values for cluster 1: Size:3846| Min:-0.09| Avg:-0.01| Max: 0.10
    Silhoute values for cluster 2: Size:1559| Min:0.01| Avg:0.11| Max: 0.15
The Inertia is :7698.559388467913
For n_clusters = 4
The average Silhouette_score is: 0.06
    Silhoute values for cluster 0: Size:725| Min:-0.10| Avg:-0.02| Max: 0.08
    Silhoute values for cluster 1: Size:1469| Min:0.03| Avg:0.12| Max: 0.17
    Silhoute values for cluster 2: Size:4519| Min:0.02| Avg:0.10| Max: 0.20
    Silhoute values for cluster 3: Size:3169| Min:-0.05| Avg:0.00| Max: 0.08
The Inertia is :7576.4976268001055
For n_clusters = 5
The average Silho

## Analyze generated clusters

In [41]:
# Refit with k=5 and keep the estimator and labels for the analysis below.
# Note: this call also appends its inertia to the global `distorsions` list.
clustering, cluster_labels = generate_clusters(vectorized_docs, 5)

For n_clusters = 5
The average Silhouette_score is: 0.04
    Silhoute values for cluster 0: Size:1913| Min:-0.09| Avg:-0.02| Max: 0.10
    Silhoute values for cluster 1: Size:220| Min:0.07| Avg:0.14| Max: 0.18
    Silhoute values for cluster 2: Size:1475| Min:0.01| Avg:0.11| Max: 0.16
    Silhoute values for cluster 3: Size:3553| Min:-0.03| Avg:0.02| Max: 0.10
    Silhoute values for cluster 4: Size:2721| Min:0.00| Avg:0.07| Max: 0.18
The Inertia is :7433.064355084163


In [42]:
# One row per document: raw text, space-joined tokens and assigned cluster label
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)

### Most frequent tokens

In [43]:
test_cluster = 2
# Token strings of the documents assigned to the inspected cluster
cluster_token_strings = df_clusters.query(f"cluster == {test_cluster}")["tokens"]
# Merge them into one string, split back into tokens and show the 50 most frequent
Counter(" ".join(cluster_token_strings).split()).most_common(50)

[('us', 2623),
 ('trump', 629),
 ('president', 498),
 ('said', 482),
 ('world', 331),
 ('reuters', 328),
 ('donald', 298),
 ('trade', 271),
 ('says', 268),
 ('states', 264),
 ('united', 259),
 ('year', 242),
 ('thursday', 217),
 ('china', 210),
 ('find', 209),
 ('house', 204),
 ('national', 204),
 ('tuesday', 189),
 ('facebook', 186),
 ('one', 178),
 ('hurricane', 177),
 ('state', 173),
 ('washington', 164),
 ('officials', 163),
 ('two', 162),
 ('former', 154),
 ('talks', 153),
 ('oil', 151),
 ('war', 148),
 ('trumps', 145),
 ('could', 145),
 ('first', 144),
 ('dorian', 144),
 ('video', 142),
 ('friday', 139),
 ('week', 130),
 ('according', 129),
 ('people', 128),
 ('happening', 126),
 ('since', 125),
 ('get', 125),
 ('whats', 125),
 ('last', 125),
 ('would', 125),
 ('million', 124),
 ('chat', 123),
 ('york', 122),
 ('messenger', 122),
 ('unfolds', 122),
 ('federal', 122)]

### Most representative documents

In [44]:
# Rank documents by Euclidean distance to the centroid of `test_cluster`, closest first.
# NOTE(review): distances are computed over every row of vectorized_docs, not only the rows
# labelled `test_cluster` — confirm that is intended.
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
# Print the ten closest documents
for d in most_representative_docs[:10]:
    print(docs[d])
    print("-------------")

U.S. wants 'near term' results from new China trade talks: Kudlow | The Trump administration wants to see "near term results" from U.S.-China trade talks in September and October, White House economic adviser Larry Kudlow said on Friday, but he declined to predict any outcomes or say if U.S. tariff delays were possible. | WASHINGTON (Reuters) - The Trump administration wants to see “near term results” from U.S.-China trade talks in September and October, White House economic adviser Larry Kudlow said on Friday, but he declined to predict any outcomes or say if U.S. tariff dela… [+2318 chars]
-------------
China Wants to Change the Conversation Around Trade War to End Standoff with U.S. | "I think there's a couple more chapters yet to be written in the trade war," James McCormack, the global head of sovereign ratings at Fitch, told CNBC on Thursday. | China is seeking to shift the breadth of its trade talks with the United States in hopes of catalyzing an agreement to end the protracted

### Random sample of documents

In [45]:
# Print the text of ten randomly drawn documents from the inspected cluster
sampled_texts = df_clusters.query(f"cluster == {test_cluster}").sample(10)["text"]
for doc_text in sampled_texts:
    print(doc_text)
    print("-------------")

Jay Inslee Won the 2020 Presidential Climate Change Town Hall and Four More Takeaways | Perhaps Washington Governor and 2020 dropout Jay Inslee has Washington D.C. in his future after all. Climate czar has a nice ring to it. | On Wednesday, the 10 top 2020 Democratic candidates took to New York City to participate in a seven hour series of CNN town halls on just one topic: climate change.
As Hurricane Dorian threatened the east coast of the U.S., the candidates answered a number o… [+5939 chars]
-------------
US impeachment inquiry: Trump calls investigation a 'coup' | Trump has repeatedly tried to discredit the whistle-blower, a US intelligence officer who remains anonymous. | US President Donald Trump has denounced an impeachment investigation against him as a coup.
A war of words between House Democrats and a key member of his cabinet has erupted in Washington, DC, as more witnesses are being called to testify in an impeachment … [+68 chars]
-------------
Trump wins biggest-ever WTO