<a href="https://colab.research.google.com/github/flaviorv/ml_clustering/blob/main/pb_clustering_at_part4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Downloads and imports

In [None]:
try:
  import spacy
except:
  !pip install -U spacy
  import spacy

try:
  spacy.load('en_core_web_sm')
except:
  !python -m spacy download en_core_web_sm

from collections import Counter
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

### 20 News Groups Dataset Preprocessing

In [None]:
# Fetch the 20 Newsgroups corpus with the loader's default arguments and
# unpack it into the raw documents (x) and their target labels (y).
# NOTE(review): with the defaults, headers, footers and quoted replies stay in
# the text; confirm that this is intended for the clustering comparison.
data = fetch_20newsgroups()
x, y = data.data, data.target

# Lemmatization, stopwords and punctuation removal
# clean() only reads token-level attributes (lemma_, is_stop, is_punct,
# is_alpha), so the dependency parser and the named-entity recognizer are
# switched off to avoid running them on every document.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
def clean(text):
    """Return the lower-cased lemmas of `text` joined by single spaces.

    The text is run through the module-level `nlp` pipeline. A token is kept
    only when it is alphabetic and is neither a stop word nor punctuation.
    """
    lemmas = [
        tok.lemma_.lower()
        for tok in nlp(text)
        if tok.is_alpha and not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

cleaned_docs = []
total_docs = len(x)

# Clean the documents in order. Progress is reported after every 100th
# document, plus a final message once the last document is done.
for position, raw_doc in enumerate(x, start=1):
  cleaned_docs.append(clean(raw_doc))
  if position % 100 == 0:
    print(f'Processing document {position}/{total_docs}...')
  if position == total_docs:
    print(f'All {total_docs} docs have been processed!')

# TF-IDF features. Float thresholds are document proportions: terms found in
# fewer than 0.5% or in more than 70% of the documents are dropped.
tfidf = TfidfVectorizer(
    min_df=0.005,
    max_df=0.70,
)
x_vec = tfidf.fit_transform(cleaned_docs)
# Vocabulary terms, in the same order as the columns of x_vec.
terms = tfidf.get_feature_names_out()

Processing document 100/11314...
Processing document 200/11314...
Processing document 300/11314...
Processing document 400/11314...
Processing document 500/11314...
Processing document 600/11314...
Processing document 700/11314...
Processing document 800/11314...
Processing document 900/11314...
Processing document 1000/11314...
Processing document 1100/11314...
Processing document 1200/11314...
Processing document 1300/11314...
Processing document 1400/11314...
Processing document 1500/11314...
Processing document 1600/11314...
Processing document 1700/11314...
Processing document 1800/11314...
Processing document 1900/11314...
Processing document 2000/11314...
Processing document 2100/11314...
Processing document 2200/11314...
Processing document 2300/11314...
Processing document 2400/11314...
Processing document 2500/11314...
Processing document 2600/11314...
Processing document 2700/11314...
Processing document 2800/11314...
Processing document 2900/11314...
Processing document 300

### K-Means

In [None]:
# Clustering with K-Means
n_clusters = 30
# K-Means depends on a random centroid initialisation; fixing the seed makes
# the cluster assignments, and every number derived from them, repeatable
# across runs.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(x_vec)

# Show K-Means metrics: agreement between the newsgroup labels and the clusters
print('K-Means')
kmeans_nmi = normalized_mutual_info_score(y, clusters)
print(f'NMI {kmeans_nmi:.2f}')
kmeans_ari = adjusted_rand_score(y, clusters)
print(f'ARI {kmeans_ari:.2f}', end='\n\n')

# Show clusters words: the 10 highest-weighted terms of each centroid,
# strongest first
for c in range(n_clusters):
  centroid = kmeans.cluster_centers_[c]
  ranked = centroid.argsort()[::-1][:10]
  top_words = [terms[idx] for idx in ranked]
  print(f'Cluster {c}: {top_words}')

K-Means
NMI 0.48
ARI 0.16

Cluster 0: ['morality', 'keith', 'objective', 'moral', 'schneider', 'allan', 'atheists', 'value', 'jon', 'system']
Cluster 1: ['sale', 'offer', 'sell', 'condition', 'price', 'shipping', 'include', 'distribution', 'new', 'university']
Cluster 2: ['scsi', 'ide', 'controller', 'drive', 'bus', 'isa', 'device', 'mb', 'quadra', 'mac']
Cluster 3: ['israel', 'israeli', 'jews', 'arab', 'arabs', 'policy', 'jewish', 'israelis', 'kill', 'lebanon']
Cluster 4: ['monitor', 'mac', 'modem', 'port', 'apple', 'mouse', 'color', 'pc', 'serial', 'problem']
Cluster 5: ['gun', 'firearm', 'weapon', 'handgun', 'criminal', 'control', 'people', 'law', 'crime', 'police']
Cluster 6: ['god', 'atheist', 'believe', 'hell', 'bible', 'people', 'jesus', 'faith', 'atheism', 'exist']
Cluster 7: ['file', 'format', 'windows', 'program', 'directory', 'convert', 'image', 'ftp', 'help', 'use']
Cluster 8: ['card', 'driver', 'video', 'vga', 'diamond', 'ati', 'vesa', 'bus', 'color', 'mode']
Cluster 9: ['

### LDA

In [None]:
# Using LDA in the dataset
# LDA fitting starts from a random initialisation; a fixed seed keeps the
# topics and the metrics computed from them reproducible between runs.
# NOTE(review): LDA is a model of term counts, while x_vec holds TF-IDF
# weights; consider fitting it on CountVectorizer output instead.
lda = LatentDirichletAllocation(n_components=25, random_state=42)
lda_matrix = lda.fit_transform(x_vec)

# Getting the dominant topic of each document (column with the largest weight)
topics = lda_matrix.argmax(axis=1)

# LDA metrics: agreement between the newsgroup labels and the dominant topics
print('LDA')
lda_nmi = normalized_mutual_info_score(y, topics)
print(f'NMI {lda_nmi:.2f}')
lda_ari = adjusted_rand_score(y, topics)
print(f'ARI {lda_ari:.2f}', end='\n\n')

# Words by topics: the 10 highest-weighted terms of each topic, strongest first
# NOTE(review): topic_words is initialised here but never filled by this cell.
topic_words = []
for topic_number, weights in enumerate(lda.components_, start=1):
  ranked = weights.argsort()[::-1][:10]
  top_words = [terms[idx] for idx in ranked]
  print(f'Topic {topic_number}: {top_words}')

LDA
NMI 0.41
ARI 0.17

Topic 1: ['key', 'clipper', 'encryption', 'chip', 'government', 'escrow', 'nsa', 'crypto', 'system', 'security']
Topic 2: ['card', 'driver', 'monitor', 'video', 'vga', 'diamond', 'vesa', 'color', 'mouse', 'ati']
Topic 3: ['ring', 'testing', 'german', 'fri', 'andrew', 'corp', 'ed', 'packet', 'rush', 'university']
Topic 4: ['orbit', 'lunar', 'gm', 'moon', 'temporary', 'solar', 'hopkins', 'josh', 'earth', 'conference']
Topic 5: ['mellon', 'carnegie', 'pa', 'pittsburgh', 'gateway', 'engineering', 'electrical', 'host', 'nntp', 'posting']
Topic 6: ['jewish', 'linux', 'baseball', 'space', 'ticket', 'shuttle', 'vs', 'instruction', 'sender', 'digest']
Topic 7: ['sun', 'shaft', 'handling', 'max', 'com', 'eng', 'microsystems', 'george', 'matt', 'drive']
Topic 8: ['fbi', 'homosexual', 'batf', 'gay', 'atf', 'clayton', 'fire', 'cramer', 'waco', 'compound']
Topic 9: ['people', 'write', 'article', 'think', 'say', 'right', 'know', 'israel', 'law', 'state']
Topic 10: ['georgia', '

### NMF

In [None]:
# Using NMF on dataset
# A fixed seed makes the factorisation, and therefore the topics and metrics,
# reproducible between runs.
nmf = NMF(n_components=25, random_state=42)
nmf_matrix = nmf.fit_transform(x_vec)

# Getting the dominant topic of each doc (column with the largest weight)
topics = nmf_matrix.argmax(axis=1)

# NMF metrics: agreement between the newsgroup labels and the dominant topics
print('NMF')
nmf_nmi = normalized_mutual_info_score(y, topics)
print(f'NMI {nmf_nmi:.2f}')
nmf_ari = adjusted_rand_score(y, topics)
print(f'ARI {nmf_ari:.2f}', end='\n\n')

# Getting top words by topic: the 10 highest-weighted terms, strongest first
# NOTE(review): topic_words is initialised here but never filled by this cell.
topic_words = []
for topic_number, weights in enumerate(nmf.components_, start=1):
  ranked = weights.argsort()[::-1][:10]
  top_words = [terms[idx] for idx in ranked]
  print(f'Topic {topic_number}: {top_words}')

NMF
NMI 0.45
ARI 0.28

Topic 1: ['people', 'think', 'go', 'write', 'like', 'know', 'thing', 'say', 'time', 'right']
Topic 2: ['file', 'windows', 'dos', 'program', 'format', 'directory', 'ms', 'run', 'ftp', 'disk']
Topic 3: ['team', 'player', 'play', 'year', 'nhl', 'hockey', 'league', 'season', 'good', 'win']
Topic 4: ['key', 'chip', 'clipper', 'encryption', 'escrow', 'government', 'algorithm', 'crypto', 'phone', 'secret']
Topic 5: ['drive', 'scsi', 'disk', 'ide', 'hard', 'controller', 'floppy', 'mb', 'boot', 'hd']
Topic 6: ['mac', 'use', 'apple', 'modem', 'simm', 'port', 'speed', 'problem', 'memory', 'work']
Topic 7: ['host', 'posting', 'nntp', 'distribution', 'university', 'world', 'lines', 'article', 'reply', 'usa']
Topic 8: ['israel', 'israeli', 'jews', 'arab', 'jewish', 'arabs', 'peace', 'policy', 'israelis', 'lebanon']
Topic 9: ['banks', 'gordon', 'chastity', 'shameful', 'skepticism', 'intellect', 'surrender', 'pittsburgh', 'science', 'univ']
Topic 10: ['card', 'driver', 'video', 