## Import required libraries

In [1]:
import os
import re
from collections import Counter
from time import time

import gensim
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from ds_utils.config import set_display_options
from ds_utils.clustering import Tokenizer, load_data, clean_news_data, vectorize, mbkmeans_clusters

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Apply the project's shared display settings (helper from ds_utils.config;
# the exact options it sets are not visible in this notebook).
set_display_options()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylancastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read data

In [2]:
# Load the "news" dataset through the ds_utils helper; the result is used as a
# DataFrame in the cells below (df.shape, df.columns).
df = load_data("news")

In [3]:
# Size check on the raw data: (rows, columns).
df.shape

(10437, 15)

In [4]:
# List the raw column names to see which fields are available.
df.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

## Clean data

In [None]:
# Run the project's cleaning helper on the raw frame.
# NOTE: this rebinds `df`, so the raw frame is no longer reachable afterwards;
# if the helper is not idempotent, re-run the load cell before re-running this one.
# Later cells read df["text"] and df["tokens"], which are not among the raw
# columns, so they are presumably added here — confirm in ds_utils.clustering.
df = clean_news_data(df)

In [None]:
# Show one random row of the cleaned frame, transposed so each field is on its own line.
df.sample(1).T

## Review tokens and vocabulary

### Tokens

In [None]:
# Pick one random document and show its text next to the tokens stored for it.
sample_text = df.sample(1)
raw_text = sample_text["text"].iloc[0]
token_list = sample_text["tokens"].iloc[0]
print(
    f"SAMPLE TEXT: {raw_text}",
    "------",
    f"TOKENS: {token_list}",
    sep="\n",
)

### Vocabulary

In [None]:
# Keep the raw texts and their token lists as arrays for the later cells,
# then count how often each token occurs across the whole corpus.
docs = df["text"].values
tokenized_docs = df["tokens"].values
vocab = Counter(
    token for doc_tokens in tokenized_docs for token in doc_tokens
)

In [None]:
# Vocabulary size: number of distinct tokens counted across all documents.
len(vocab)

In [None]:
# The ten most frequent tokens together with their counts.
vocab.most_common(10)

## BoW + SVD + Normalizer

In [None]:
# Turn the raw texts into dense, unit-length document vectors:
#   1. TF-IDF over the project Tokenizer's output, dropping terms that appear in
#      more than half of the documents (max_df=.5) or in fewer than 5 (min_df=5);
#   2. TruncatedSVD down to 200 components;
#   3. row-wise normalisation (Normalizer defaults to the L2 norm).
analyzer = Tokenizer()
bow = TfidfVectorizer(analyzer=analyzer, max_df=.5, min_df=5)
# Fix the seed: TruncatedSVD's default solver is randomized, so without it the
# vectors (and every cluster built on them) change from run to run.
svd = TruncatedSVD(200, random_state=42)
normalizer = Normalizer(copy=False)  # copy=False normalises the SVD output in place
vectorizer = make_pipeline(bow, svd, normalizer)
vectorized_docs = vectorizer.fit_transform(docs)

In [None]:
# Expect one row per document and 200 columns (the number of SVD components requested).
vectorized_docs.shape

## Generate clusters

In [None]:
# Cluster the document vectors into 50 groups with the project helper, which
# returns the fitted clustering object and the cluster label of every document.
clustering, cluster_labels = mbkmeans_clusters(
    vectorized_docs,
    50,
    print_silhouette_values=True,
)

# One row per document: raw text, its tokens joined into a single
# space-separated string, and the cluster it was assigned to.
joined_tokens = [" ".join(doc_tokens) for doc_tokens in tokenized_docs]
df_clusters = pd.DataFrame(
    {"text": docs, "tokens": joined_tokens, "cluster": cluster_labels}
)

### Evaluate top terms per cluster (based on clusters' centroids)

In [None]:
print("Top terms per cluster (based on centroids):")
# The centroids live in the reduced space the clustering was fitted on; map them
# back to TF-IDF term space so that every coordinate corresponds to a vocabulary term.
original_space_centroids = svd.inverse_transform(clustering.cluster_centers_)
# Term indices per centroid, sorted from highest to lowest weight.
order_centroids = original_space_centroids.argsort()[:, ::-1]
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
terms = bow.get_feature_names_out()
# Iterate over the centroids that actually exist instead of a hard-coded count.
for i, ranked_term_ids in enumerate(order_centroids):
    top_terms = "".join(f" {terms[ind]}" for ind in ranked_term_ids[:5])
    print(f"Cluster {i}:{top_terms}")

### Evaluate top terms per cluster (based on word frequencies)

In [None]:
print("Top terms per cluster (based on words frequencies):")
# Take the number of clusters from the fitted model instead of repeating the
# literal 50, so this cell stays correct if the cluster count is changed.
for i in range(len(clustering.cluster_centers_)):
    # Space-joined token strings of every document assigned to cluster i.
    cluster_tokens = df_clusters.loc[df_clusters["cluster"] == i, "tokens"]
    # Count tokens document by document rather than building one giant string first.
    most_frequent = Counter(
        token for doc_tokens in cluster_tokens for token in doc_tokens.split()
    ).most_common(5)
    # Format as "term(count) term(count) ..." (trailing space kept as before).
    summary = "".join(f"{term}({count}) " for term, count in most_frequent)
    print(f"Cluster {i}: {summary}")

### Most representative documents (based on clusters' centroids)

In [None]:
# Cluster to inspect; also used by the random-sample cell below.
test_cluster = 25

# Euclidean distance from every document vector to the chosen centroid.
# Note: the ranking covers ALL documents, not only those labelled `test_cluster`.
centroid = clustering.cluster_centers_[test_cluster]
distances_to_centroid = np.linalg.norm(vectorized_docs - centroid, axis=1)
most_representative_docs = np.argsort(distances_to_centroid)

# Print the ten documents closest to the centroid, each followed by a separator.
for d in most_representative_docs[:10]:
    print(docs[d], "-------------", sep="\n")

### Random sample of documents

In [None]:
# Print up to 10 randomly chosen documents that were assigned to `test_cluster`.
cluster_texts = df_clusters.loc[df_clusters["cluster"] == test_cluster, "text"]
# sample() raises ValueError when asked for more rows than exist (replace=False),
# so cap the sample size for clusters holding fewer than 10 documents.
for text in cluster_texts.sample(min(10, len(cluster_texts))):
    print(text)
    print("-------------")