## Import required libraries

In [1]:
import os
import re
import string
from collections import Counter
from string import punctuation
from time import time

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from tqdm.notebook import tqdm

from ds_utils.config import set_display_options
from ds_utils.data import NEWS_DATA
from ds_utils.functions import vectorize

# Fetch the NLTK packages used below (the "stopwords" list and the "punkt" tokenizer data).
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)

# Apply the display settings defined in ds_utils.config.
set_display_options()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Data

In [2]:
# Load the raw news dataset. NEWS_DATA is imported from ds_utils.data
# (presumably the path or URL of the CSV file — confirm against that module).
df = pd.read_csv(NEWS_DATA)

In [3]:
# (rows, columns) of the frame that was just loaded.
df.shape

(10437, 15)

In [4]:
# List the available column names.
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

In [5]:
# Show one randomly drawn row, transposed so every column is listed on its own
# line. No random_state is passed, so a different row is shown on each run.
df.sample(1).T

Unnamed: 0,1846
Unnamed: 0,1846
source_id,abc-news
source_name,ABC News
author,The Associated Press
title,New sentencing date set for man involved in grisly killing
description,A new trial date has been set for a man convicted to help cover up the death of a North Dakota woman whose baby was cut from her womb
url,https://abcnews.go.com/US/wireStory/sentencing-date-set-man-involved-grisly-killing-65435099
url_to_image,
published_at,2019-09-06T17:06:26Z
content,A new sentencing hearing has been set for a man convicted of helping to cover up the death of a North Dakota woman whose baby was cut from her womb.\r\nThe North Dakota Supreme Court ruled last month that William Hoehn should not have been sentenced to life in … [+495 chars]


## Fill missing values

In [6]:
# Fraction of missing values per column (mean of the boolean NaN mask).
df.isna().mean()

Unnamed: 0                        0.00
source_id                         0.00
source_name                       0.00
author                            0.10
title                             0.00
description                       0.00
url                               0.00
url_to_image                      0.06
published_at                      0.00
content                           0.12
top_article                       0.00
engagement_reaction_count         0.01
engagement_comment_count          0.01
engagement_share_count            0.01
engagement_comment_plugin_count   0.01
dtype: float64

In [7]:
# Replace missing article bodies with empty strings so "content" can be
# concatenated with the other text columns further down.
df["content"] = df["content"].fillna("")

## Generate tokens

In [8]:
# Preview one randomly chosen article with title, description and content joined.
sample_row = df.sample(1).iloc[0]
sample_row["title"] + " | " + sample_row["description"] + " | " + sample_row["content"]

'Reports say Boeing insider filed safety complaint about Max | Reports: Boeing engineer filed internal complaint that backup safety system on the 737 Max was rejected because of cost concerns | Boeings reputation is taking another hit, as published reports say a company engineer filed an internal complaint alleging that company managers rejected a backup system that might have alerted pilots to problems linked to two deadly crashes involving the 737… [+540 chars]'

In [9]:
# NLTK's English stop words plus three corpus-specific filler words.
stop_words = set(stopwords.words("english")) | {"news", "new", "top"}

def generate_tokens(text, tokenizer=word_tokenize, stop_words=stop_words):
    """Clean ``text`` and return its list of informative tokens.

    The text is lower-cased and passed through a fixed sequence of regex
    substitutions, then tokenized and filtered.

    Args:
        text: Value to tokenize; it is converted with ``str()`` first.
        tokenizer: Callable that maps a string to a sequence of tokens.
        stop_words: Collection of tokens to discard.

    Returns:
        A list of tokens that are not stop words, are not made up only of
        digits, and are longer than one character.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    cleanup_steps = (
        (r"\[(.*?)\]", ""),  # bracketed spans such as "[+495 chars]"
        (r"\s+", " "),  # runs of whitespace -> single space
        (r"\w+…|…", ""),  # ellipsis character and the word attached to it
        (r"(?<=\w)-(?=\w)", " "),  # dash between two word characters -> space
        (f"[{re.escape(string.punctuation)}]", ""),  # ASCII punctuation
    )

    cleaned = str(text).lower()
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    return [
        token
        for token in tokenizer(cleaned)
        if token not in stop_words and not token.isdigit() and len(token) > 1
    ]

# Run the tokenizer on five randomly drawn title/description pairs and print
# the input next to the resulting tokens.
for _ in range(5):
    sample_row = df.sample(1).iloc[0]
    sample_text = sample_row["title"] + " | " + sample_row["description"]
    print(f"SAMPLE TEXT: {sample_text}")
    print(f"TOKENS: {generate_tokens(sample_text)}")
    print("------")

SAMPLE TEXT: Polls close in first round of Tunisian presidential election | Polling booths closed at 6 pm (1700 GMT) in Tunisia on Sunday after voting in the first round of a presidential election in the young democracy.
TOKENS: ['polls', 'close', 'first', 'round', 'tunisian', 'presidential', 'election', 'polling', 'booths', 'closed', 'pm', 'gmt', 'tunisia', 'sunday', 'voting', 'first', 'round', 'presidential', 'election', 'young', 'democracy']
------
SAMPLE TEXT: "The Hollywood Ripper:" How "48 Hours" helped crack the case | Four young women attacked, only one survivor — inside the investigation and trial of the serial, sexual thrill killer
TOKENS: ['hollywood', 'ripper', 'hours', 'helped', 'crack', 'case', 'four', 'young', 'women', 'attacked', 'one', 'survivor', 'inside', 'investigation', 'trial', 'serial', 'sexual', 'thrill', 'killer']
------
SAMPLE TEXT: Xeris Pharma's low blood sugar treatment wins FDA approval | The U.S. Food and Drug Administration has approved Xeris Pharmaceuti

In [10]:
text_columns = ["title", "description", "content"]

# Missing values must become empty strings *before* the string cast:
# astype(str) on its own turns NaN into the literal text "nan", which would
# then pass through tokenization as a bogus "nan" token.
for col in text_columns:
    df[col] = df[col].fillna("").astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df["tokens"] = df["text"].map(generate_tokens)

# Remove documents that are duplicates after preprocessing. np.unique returns,
# for each distinct token list, the position of its first occurrence; the
# positions come back ordered by the sorted token lists, not by original row.
_, idx = np.unique(df["tokens"], return_index=True)
df_proc = df.iloc[idx, :]

# Remove documents that ended up with no tokens at all
df_proc = df_proc.loc[df_proc.tokens.map(lambda x: len(x) > 0)]

# Shapes before and after deduplication / empty-document removal
df.shape, df_proc.shape

((10437, 17), (9882, 17))

## Review vocabulary

In [11]:
# Raw texts and their token lists as parallel arrays (same row order).
docs, tokenized_docs = df_proc["text"].values, df_proc["tokens"].values

# Corpus-wide frequency of every token.
vocab = Counter(token for doc_tokens in tokenized_docs for token in doc_tokens)

In [12]:
# Number of distinct tokens in the corpus.
len(vocab)

32454

In [13]:
# The ten most frequent tokens with their counts.
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Train Word2Vec model

In [14]:
# Train 100-dimensional word embeddings on the tokenized documents.
# A fixed seed alone does not make training repeatable: gensim documents that a
# fully deterministic run also needs a single worker thread (workers=1) and,
# across interpreter launches, a fixed PYTHONHASHSEED environment variable.
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=42)

In [15]:
# Sanity check of the embeddings: words whose vectors are most similar to "trump".
model.wv.most_similar("trump")

[('trumps', 0.9852380752563477),
 ('president', 0.9776293635368347),
 ('donald', 0.956746518611908),
 ('breitbart', 0.9487488269805908),
 ('ivanka', 0.939983606338501),
 ('administration', 0.9337033033370972),
 ('impeachment', 0.9273420572280884),
 ('turnberry', 0.9235938191413879),
 ('inquiry', 0.9205366373062134),
 ('biden', 0.9179183840751648)]

In [16]:
# Same sanity check for a second probe word.
model.wv.most_similar("facebook")

[('chat', 0.9380522966384888),
 ('messenger', 0.9257147908210754),
 ('find', 0.908592939376831),
 ('gambling', 0.9063940048217773),
 ('google', 0.9059684872627258),
 ('online', 0.8690320253372192),
 ('wjla', 0.8537392020225525),
 ('interviews', 0.8390310406684875),
 ('whats', 0.8332228064537048),
 ('analysis', 0.8328741192817688)]

## Generate vectors from documents

In [17]:
# Turn every tokenized document into a single vector. `vectorize` is imported
# from ds_utils.functions and is not shown here; with strategy="average" it
# presumably averages the word vectors of each document — TODO confirm.
vectorized_docs = vectorize(tokenized_docs, model=model, strategy="average")
# Number of document vectors and the length of the first one.
len(vectorized_docs), len(vectorized_docs[0])

(9882, 100)

## Choose number of clusters

In [18]:
def generate_clusters(X, k, mb=500, random_state=42):
    """Cluster ``X`` with mini-batch k-means and print quality diagnostics.

    Args:
        X: Feature vectors to cluster, one entry per sample.
        k: Number of clusters to fit.
        mb: Batch size passed to ``MiniBatchKMeans``.
        random_state: Seed passed to ``MiniBatchKMeans``.

    Returns:
        A tuple ``(clustering, cluster_labels)``: the fitted estimator and the
        cluster index assigned to every sample.

    Side effects:
        Prints the average silhouette score, per-cluster silhouette statistics
        and the inertia. If a module-level list named ``distorsions`` exists,
        the inertia of this run is appended to it.
    """
    clustering = MiniBatchKMeans(n_clusters=k, batch_size=mb, random_state=random_state)
    cluster_labels = clustering.fit_predict(X)
    print(f"For n_clusters = {k}")

    # The average silhouette score is the mean of the per-sample values, so
    # run the expensive pairwise computation once and derive the average from
    # it instead of calling silhouette_score and silhouette_samples separately.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    silhouette_avg = sample_silhouette_values.mean()
    print(f"The average Silhouette_score is: {silhouette_avg:.2f}")

    for i in range(k):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        if ith_cluster_silhouette_values.size == 0:
            # No sample was assigned to this cluster; min/mean/max are undefined
            # and would raise on an empty array.
            print(f"    Silhoute values for cluster {i}: Size:0")
            continue
        print(
            f"    Silhoute values for cluster {i}: "
            f"Size:{ith_cluster_silhouette_values.shape[0]}"
            f"| Min:{ith_cluster_silhouette_values.min():.2f}"
            f"| Avg:{ith_cluster_silhouette_values.mean():.2f}"
            f"| Max: {ith_cluster_silhouette_values.max():.2f}"
        )

    print(f"The Inertia is :{clustering.inertia_}")
    # Recording the inertia is best-effort: skip it only when the caller has
    # not created the `distorsions` list. Any other error is a real problem and
    # is allowed to propagate instead of being silently swallowed.
    try:
        distorsions.append(clustering.inertia_)
    except NameError:
        pass
    return clustering, cluster_labels

In [19]:
# generate_clusters appends the inertia of each run to this module-level list.
distorsions = []

# Fit and report one clustering for every candidate number of clusters.
candidate_ks = range(2, 25)
for k in tqdm(candidate_ks):
    generate_clusters(vectorized_docs, k)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=23.0), HTML(value='')))

For n_clusters = 2
The average Silhouette_score is: 0.27
    Silhoute values for cluster 0: Size:7334| Min:0.01| Avg:0.33| Max: 0.50
    Silhoute values for cluster 1: Size:2548| Min:-0.14| Avg:0.11| Max: 0.33
The Inertia is :15430.023632544493
For n_clusters = 3
The average Silhouette_score is: 0.22
    Silhoute values for cluster 0: Size:5322| Min:-0.00| Avg:0.29| Max: 0.47
    Silhoute values for cluster 1: Size:1189| Min:-0.08| Avg:0.24| Max: 0.47
    Silhoute values for cluster 2: Size:3371| Min:-0.15| Avg:0.09| Max: 0.30
The Inertia is :13490.466629284378
For n_clusters = 4
The average Silhouette_score is: 0.24
    Silhoute values for cluster 0: Size:5106| Min:0.04| Avg:0.34| Max: 0.51
    Silhoute values for cluster 1: Size:3315| Min:-0.20| Avg:0.05| Max: 0.28
    Silhoute values for cluster 2: Size:315| Min:-0.02| Avg:0.39| Max: 0.60
    Silhoute values for cluster 3: Size:1146| Min:-0.07| Avg:0.28| Max: 0.50
The Inertia is :11770.854998793282
For n_clusters = 5
The average Sil

## Analyze generated clusters

In [20]:
# Fit the clustering used for the analysis below (k=7) and keep both the fitted
# estimator and the per-document labels.
# NOTE: generate_clusters also appends this run's inertia to `distorsions`
# when that list exists, so the list gains one extra entry here.
clustering, cluster_labels = generate_clusters(vectorized_docs, 7)

For n_clusters = 7
The average Silhouette_score is: 0.26
    Silhoute values for cluster 0: Size:4025| Min:0.04| Avg:0.34| Max: 0.54
    Silhoute values for cluster 1: Size:673| Min:-0.01| Avg:0.31| Max: 0.52
    Silhoute values for cluster 2: Size:1498| Min:-0.17| Avg:0.16| Max: 0.43
    Silhoute values for cluster 3: Size:300| Min:-0.07| Avg:0.39| Max: 0.61
    Silhoute values for cluster 4: Size:2320| Min:-0.14| Avg:0.12| Max: 0.37
    Silhoute values for cluster 5: Size:647| Min:-0.19| Avg:0.21| Max: 0.44
    Silhoute values for cluster 6: Size:419| Min:-0.02| Avg:0.39| Max: 0.60
The Inertia is :7473.58840014711


In [21]:
# One row per document: raw text, space-joined tokens and assigned cluster.
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)

### Most frequent tokens

In [22]:
# Cluster inspected throughout the rest of this section.
test_cluster = 2

# Pool the tokens of every document in that cluster and count them.
in_test_cluster = df_clusters["cluster"] == test_cluster
cluster_token_counts = Counter(" ".join(df_clusters.loc[in_test_cluster, "tokens"]).split())
cluster_token_counts.most_common(50)

[('police', 746),
 ('year', 674),
 ('man', 567),
 ('said', 508),
 ('old', 468),
 ('people', 389),
 ('two', 372),
 ('killed', 359),
 ('years', 278),
 ('say', 278),
 ('found', 276),
 ('died', 272),
 ('image', 267),
 ('death', 257),
 ('woman', 255),
 ('one', 243),
 ('three', 237),
 ('court', 233),
 ('caption', 224),
 ('last', 217),
 ('city', 202),
 ('fire', 201),
 ('murder', 197),
 ('former', 191),
 ('thursday', 189),
 ('shot', 188),
 ('says', 188),
 ('least', 186),
 ('hong', 185),
 ('authorities', 178),
 ('shooting', 175),
 ('us', 173),
 ('arrested', 170),
 ('kong', 168),
 ('school', 167),
 ('tuesday', 164),
 ('accused', 160),
 ('dead', 159),
 ('officials', 154),
 ('killing', 152),
 ('million', 151),
 ('charged', 150),
 ('officer', 148),
 ('california', 142),
 ('state', 141),
 ('four', 133),
 ('home', 130),
 ('saudi', 130),
 ('attack', 126),
 ('case', 125)]

### Most representative documents

In [23]:
# Rank every document by Euclidean distance to the centroid of the selected
# cluster (closest first) and print the ten nearest ones.
centroid = clustering.cluster_centers_[test_cluster]
distances_to_centroid = np.linalg.norm(vectorized_docs - centroid, axis=1)
most_representative_docs = np.argsort(distances_to_centroid)
for doc_index in most_representative_docs[:10]:
    print(docs[doc_index], "-------------", sep="\n")

Drugs worth €107,000 found after man searched in Cork train station | Cocaine and heroin seized as part of ongoing operation | A man in his 30s was arrested and drugs worth more than 100,000 were seized in Kent Railway Station in Cork on Wednesday evening. 
Gardaí said the seizure was part of an ongoing operation into the sale and supply of drugs in the Mayfield area of the city. 
… [+394 chars]
-------------
New Zealand mosque shooting suspect drops bid to move trial | Survivors and grieving families of victims welcome news that the trial will be in Christchurch when it starts in June. | The man accused of murdering 51 Muslim worshippers in March's New Zealand mosque attacks has dropped a bid to move his trial from the city where the mass shooting took place.
The High Court held a pre-trial hearing in Christchurch on Thursday to consider an … [+1240 chars]
-------------
Official: Feds search home for link to gun in Texas shooting | A federal law enforcement official says ATF agents ha

### Most representative tokens

In [24]:
# Vocabulary words whose vectors are most similar to the centroid of the
# selected cluster (the centroid is passed in as a raw vector).
model.wv.most_similar(positive=[clustering.cluster_centers_[test_cluster]], topn=10)

[('happened', 0.9976266026496887),
 ('chicago', 0.9946582317352295),
 ('misdemeanor', 0.994258463382721),
 ('identified', 0.994140088558197),
 ('courtroom', 0.9936190843582153),
 ('minnesota', 0.993247389793396),
 ('austin', 0.9930338263511658),
 ('synagogue', 0.9928783774375916),
 ('accident', 0.9927681684494019),
 ('assaulting', 0.9923295378684998)]

### Random sample of documents

In [25]:
# Print ten randomly drawn documents from the selected cluster.
cluster_sample = df_clusters.query(f"cluster == {test_cluster}").sample(10)
for sampled_text in cluster_sample["text"]:
    print(sampled_text)
    print("-------------")

Greenland Calls On Denmark to Help Fight Child Sexual Abuse | A former mayor of the community of Tasiilaq said, “I believe we are in a human, social and cultural death spiral if we don’t manage to stop the sexual abuse.” | Of the 191 cases reported, 152 have been prosecuted (some cases are still pending, and others have been dropped). Thirty-three percent of the prosecutions have led to convictions.
But perpetrators usually are not ostracized after convictions for molesting a … [+1788 chars]
-------------
Coast Guard officer accused of mass killing plot pleads guilty | A Coast Guard lieutenant accused of plotting a domestic terror attack pleaded guilty Thursday to four counts of weapons and drug charges. | 
-------------
Kevin Lunney was slashed, beaten and had bleach poured on him by kidnappers | QIH chief operations officer believed to have been held in a horsebox, investigators believe | Kevin Lunney, the Quinn Industrial Holdings chief operations officer kidnapped and imprisoned on