# Chapter 3 Clustering – Finding Related Posts

In [72]:
import os
import numpy as np

## Preprocessing – similarity measured as a similar number of common words

### Converting raw text into a bag of words

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
vectorizer

CountVectorizer()

In [10]:
content = ["How to format my hard disk", " Hard disk format problems "]
content

['How to format my hard disk', ' Hard disk format problems ']

In [11]:
X  = vectorizer.fit_transform(content)
vectorizer.get_feature_names()

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']

In [17]:
X.toarray().transpose()

array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0]])

In [12]:
DIR = "./data/toy"
sorted(os.listdir(DIR))

['01.txt', '02.txt', '03.txt', '04.txt', '05.txt']

In [13]:
posts = [open(os.path.join(DIR, f)).read() for f in sorted(os.listdir(DIR))]
posts

['This is a toy post about machine learning. Actually, it contains not much interesting stuff.\n',
 'Imaging databases provide storage capabilities.\n',
 'Most imaging databases save images permanently.\n',
 'Imaging databases store data.\n',
 'Imaging databases store data. Imaging databases store data. Imaging databases store data.\n']

In [56]:
vectorizer = CountVectorizer(min_df=1)
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print(f"#samples: {num_samples}, #features: {num_features}")

#samples: 5, #features: 25


In [57]:
print(vectorizer.get_feature_names())

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'save', 'storage', 'store', 'stuff', 'this', 'toy']


In [59]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

  (0, 5)	1
  (0, 7)	1


In [35]:
new_post_vec.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]])

In [32]:
def dist_raw(v1, v2):
    delta = v1 - v2
    return np.linalg.norm(delta.toarray())

In [60]:
import sys
best_doc = None
best_dist = sys.maxsize
best_i = None

for i, post in enumerate(posts):
    if post == new_post:
        continue
        
    post_vec = X_train.getrow(i)
    
    d = dist_raw(post_vec, new_post_vec)
    
    print(f"=== Post {i} with dist = {d:.2f} : {post}")
    if d < best_dist:
        best_dist = d
        best_i = i 
        
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 4.00 : This is a toy post about machine learning. Actually, it contains not much interesting stuff.

=== Post 1 with dist = 1.73 : Imaging databases provide storage capabilities.

=== Post 2 with dist = 2.00 : Most imaging databases save images permanently.

=== Post 3 with dist = 1.41 : Imaging databases store data.

=== Post 4 with dist = 5.10 : Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 3 with dist = 1.41


In [61]:
print(X_train.getrow(3).toarray())
print(X_train.getrow(4).toarray())

[[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
[[0 0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]


In [43]:
def dist_norm(v1, v2):
    v1_normalized = v1/np.linalg.norm(v1.toarray())
    v2_normalized = v2/np.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized 
    return np.linalg.norm(delta.toarray())

In [44]:
best_doc = None
best_dist = sys.maxsize
best_i = None

for i, post in enumerate(posts):
    if post == new_post:
        continue
        
    post_vec = X_train.getrow(i)
    
    d = dist_norm(post_vec, new_post_vec)
    
    print(f"=== Post {i} with dist = {d:.2f} : {post}")
    if d < best_dist:
        best_dist = d
        best_i = i 
        
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 1.41 : This is a toy post about machine learning. Actually, it contains not much interesting stuff.

=== Post 1 with dist = 0.86 : Imaging databases provide storage capabilities.

=== Post 2 with dist = 0.92 : Most imaging databases save images permanently.

=== Post 3 with dist = 0.77 : Imaging databases store data.

=== Post 4 with dist = 0.77 : Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 3 with dist = 0.77


In [62]:
vectorizer = CountVectorizer(min_df=1, stop_words='english')
print(sorted(vectorizer.get_stop_words())[0:20])

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst']


In [63]:
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print(f"#samples: {num_samples}, #features: {num_features}")

#samples: 5, #features: 18


In [65]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

  (0, 4)	1
  (0, 6)	1


In [66]:
best_doc = None
best_dist = sys.maxsize
best_i = None

for i, post in enumerate(posts):
    if post == new_post:
        continue
        
    post_vec = X_train.getrow(i)
    
    d = dist_norm(post_vec, new_post_vec)
    
    print(f"=== Post {i} with dist = {d:.2f} : {post}")
    if d < best_dist:
        best_dist = d
        best_i = i 
        
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 1.41 : This is a toy post about machine learning. Actually, it contains not much interesting stuff.

=== Post 1 with dist = 0.86 : Imaging databases provide storage capabilities.

=== Post 2 with dist = 0.86 : Most imaging databases save images permanently.

=== Post 3 with dist = 0.77 : Imaging databases store data.

=== Post 4 with dist = 0.77 : Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 3 with dist = 0.77


In [23]:
import nltk



In [70]:
s = nltk.stem.SnowballStemmer('english')
s.stem("graphics")

'graphic'

In [71]:
s.stem("imaging")

'imag'

In [72]:
s.stem("image")

'imag'

In [74]:
s.stem("imagination")

'imagin'

In [75]:
s.stem("imagine")

'imagin'

In [76]:
s.stem("buys")

'buy'

In [77]:
s.stem("buying")

'buy'

In [78]:
s.stem("bought")

'bought'

In [24]:
english_stemmer = nltk.stem.SnowballStemmer('english')

In [85]:
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [86]:
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

In [87]:
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print(f"#samples: {num_samples}, #features: {num_features}")

#samples: 5, #features: 17


In [89]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

  (0, 4)	1
  (0, 5)	1


In [90]:
best_doc = None
best_dist = sys.maxsize
best_i = None

for i, post in enumerate(posts):
    if post == new_post:
        continue
        
    post_vec = X_train.getrow(i)
    
    d = dist_norm(post_vec, new_post_vec)
    
    print(f"=== Post {i} with dist = {d:.2f} : {post}")
    if d < best_dist:
        best_dist = d
        best_i = i 
        
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 1.41 : This is a toy post about machine learning. Actually, it contains not much interesting stuff.

=== Post 1 with dist = 0.86 : Imaging databases provide storage capabilities.

=== Post 2 with dist = 0.63 : Most imaging databases save images permanently.

=== Post 3 with dist = 0.77 : Imaging databases store data.

=== Post 4 with dist = 0.77 : Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 2 with dist = 0.63


In [91]:
def tfidf(term, doc, corpus):
    tf = doc.count(term)/len(doc)
    num_docs_with_term = len([d for d in corpus if term in d])
    idf = np.log(len(corpus)/ num_docs_with_term)
    return tf * idf

In [92]:
a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
D = [a, abb, abc]

In [93]:
tfidf("a", a, D)

0.0

In [94]:
tfidf("a", abb, D)

0.0

In [95]:
tfidf("a", abc, D)

0.0

In [97]:
tfidf("b", abb, D)

0.27031007207210955

In [98]:
tfidf("b", abc, D)

0.13515503603605478

In [99]:
tfidf("c", abc, D)

0.3662040962227032

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc : (
            english_stemmer.stem(w) for w in analyzer(doc)
        )

In [107]:
vectorizer = StemmedTfidfVectorizer(min_df=1, 
                                    stop_words='english',
                                    decode_error='ignore')

In [108]:
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print(f"#samples: {num_samples}, #features: {num_features}")

#samples: 5, #features: 17


In [109]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

  (0, 5)	0.7071067811865476
  (0, 4)	0.7071067811865476


In [110]:
best_doc = None
best_dist = sys.maxsize
best_i = None

for i, post in enumerate(posts):
    if post == new_post:
        continue
        
    post_vec = X_train.getrow(i)
    
    d = dist_norm(post_vec, new_post_vec)
    
    print(f"=== Post {i} with dist = {d:.2f} : {post}")
    if d < best_dist:
        best_dist = d
        best_i = i 
        
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 1.41 : This is a toy post about machine learning. Actually, it contains not much interesting stuff.

=== Post 1 with dist = 1.08 : Imaging databases provide storage capabilities.

=== Post 2 with dist = 0.86 : Most imaging databases save images permanently.

=== Post 3 with dist = 0.92 : Imaging databases store data.

=== Post 4 with dist = 0.92 : Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 2 with dist = 0.86


## Clustering

### Getting test data to evaluate our ideas on

In [1]:
import sklearn.datasets
all_data = sklearn.datasets.fetch_20newsgroups(subset='all')
len(all_data.filenames)



18846

In [3]:
print(all_data.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [11]:
train_data = sklearn.datasets.fetch_20newsgroups(subset='train')
len(train_data.filenames)

11314

In [13]:
test_data = sklearn.datasets.fetch_20newsgroups(subset='test')
len(test_data.filenames)

7532

In [14]:
groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']

In [15]:
train_data = sklearn.datasets.fetch_20newsgroups(subset='train', categories=groups)
len(train_data.filenames)

3529

In [16]:
test_data = sklearn.datasets.fetch_20newsgroups(subset='test', categories=groups)
len(test_data.filenames)

2349

### Clustering Posts

In [25]:
vectorizer = StemmedTfidfVectorizer(min_df=10, 
                                    max_df=0.5,
                                    stop_words='english', 
                                    decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)

In [26]:
num_samples, num_features = vectorized.shape
print(f"#samples: {num_samples}, #features: {num_features}")

#samples: 3529, #features: 4712


In [27]:
from sklearn.cluster import KMeans

In [28]:
num_clusters = 50

km = KMeans(n_clusters=num_clusters, 
            init='random', 
            n_init=1,
           verbose=1,
           random_state=3)

km.fit(vectorized)

Initialization complete
Iteration 0, inertia 5925.977358309683
Iteration 1, inertia 3215.2590433087403
Iteration 2, inertia 3179.199640805428
Iteration 3, inertia 3157.765345523002
Iteration 4, inertia 3144.332597910992
Iteration 5, inertia 3135.2342147507306
Iteration 6, inertia 3128.664551177874
Iteration 7, inertia 3124.756954533032
Iteration 8, inertia 3121.5927166252527
Iteration 9, inertia 3118.7882911899205
Iteration 10, inertia 3116.091638786711
Iteration 11, inertia 3114.0968233171616
Iteration 12, inertia 3113.016740705259
Iteration 13, inertia 3112.1027671605866
Iteration 14, inertia 3111.131954036844
Iteration 15, inertia 3110.593491559183
Iteration 16, inertia 3110.3333672920667
Iteration 17, inertia 3110.2593476388543
Iteration 18, inertia 3110.232825401951
Iteration 19, inertia 3110.184785024758
Converged at iteration 19: strict convergence.


KMeans(init='random', n_clusters=50, n_init=1, random_state=3, verbose=1)

In [30]:
print(km.labels_)

[26 12 25 ... 42  9  8]


In [31]:
print(km.labels_.shape)

(3529,)


## Solving our initial challenge

In [33]:
new_post = "Disk drive problems. Hi, I have a problem with my hard \
disk. After 1 year it is working only sporadically now.\
I tried to format it, but now it doesn't boot any more.\
Any ideas? Thanks."
new_post 

"Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now.I tried to format it, but now it doesn't boot any more.Any ideas? Thanks."

In [34]:
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]
new_post_label

38

In [38]:
similar_indices = (km.labels_==new_post_label).nonzero()[0]
len(similar_indices)

113

In [45]:
similar = []

for i in similar_indices:
    dist = np.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, train_data.data[i]))
                   
similar = sorted(similar)
len(similar)

113

In [47]:
show_at_1 = similar[0]
show_at_2 = similar[int(len(similar)/10)]
show_at_3 = similar[int(len(similar)/2)]


In [51]:
for i in [show_at_1, show_at_2, show_at_2]:
    print(f"{i[0]} \t {i[1]}")
    print("-------------------")
    break

1.0378441731334072 	 From: Thomas Dachsel <GERTHD@mvs.sas.com>
Subject: BOOT PROBLEM with IDE controller
Nntp-Posting-Host: sdcmvs.mvs.sas.com
Organization: SAS Institute Inc.
Lines: 25

Hi,
I've got a Multi I/O card (IDE controller + serial/parallel
interface) and two floppy drives (5 1/4, 3 1/2) and a
Quantum ProDrive 80AT connected to it.
I was able to format the hard disk, but I could not boot from
it. I can boot from drive A: (which disk drive does not matter)
but if I remove the disk from drive A and press the reset switch,
the LED of drive A: continues to glow, and the hard disk is
not accessed at all.
I guess this must be a problem of either the Multi I/o card
or floppy disk drive settings (jumper configuration?)
Does someone have any hint what could be the reason for it.
Please reply by email to GERTHD@MVS.SAS.COM
Thanks,
Thomas
+-------------------------------------------------------------------+
| Thomas Dachsel                                                    |
| Internet

#### Another Look at Noise

In [55]:
post_group = zip(train_data.data, train_data.target)
all = [(len(post[0]), post[0], train_data.target_names[post[1]]) 
       for post in post_group]

graphics = sorted([post for post in all if post[2]=='comp.graphics'])
graphics[5]

(245,
 'comp.graphics')

In [57]:
noise_post = graphics[5][1]
analyzer = vectorizer.build_analyzer()
print(list(analyzer(noise_post)))

['situnaya', 'ibm3090', 'bham', 'ac', 'uk', 'subject', 'test', 'sorri', 'organ', 'univers', 'birmingham', 'unit', 'kingdom', 'line', 'nntp', 'post', 'host', 'ibm3090', 'bham', 'ac', 'uk']


In [58]:
useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names())
print(sorted(useful))

['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers']


In [71]:
for term in sorted(useful):
    print(f"IDF({term}) =\
 {vectorizer._tfidf.idf_[vectorizer.vocabulary_[term]]:.2f}\
    ")


IDF(ac) = 3.51    
IDF(birmingham) = 6.77    
IDF(host) = 1.74    
IDF(kingdom) = 6.68    
IDF(nntp) = 1.77    
IDF(sorri) = 4.14    
IDF(test) = 3.83    
IDF(uk) = 3.70    
IDF(unit) = 4.42    
IDF(univers) = 1.91    
