# Chapter 3: Clustering - Finding related posts 

- Name: Gyanas Luitel
- Group: Computer Science
- Roll No: 27

## Measuring the relatedness of posts

In [1]:
import os
import numpy as np

## Preprocessing - similarity measured as a similar number of common words

### Converting raw text into a bag of words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df = 1)

In [3]:
print(vectorizer)

CountVectorizer()


In [4]:
content = ["How to format my hard disk", "Hard disk format problems"]
content

['How to format my hard disk', 'Hard disk format problems']

In [5]:
# Learn the vocabulary of the two example sentences and build their count matrix.
x = vectorizer.fit_transform(content)
# CountVectorizer numbers its columns in sorted term order, so the sorted
# vocabulary keys are the feature names in column order. Reading them from
# `vocabulary_` works on every scikit-learn release, whereas
# `get_feature_names()` was removed in scikit-learn 1.2.
sorted(vectorizer.vocabulary_)

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']

In [6]:
print(x.toarray().transpose())

[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


**Counting words**

In [7]:
DIR = "./Chapter03/data/toy"
sorted(os.listdir(DIR))

['01.txt', '02.txt', '03.txt', '04.txt', '05.txt']

In [8]:
# Read every toy post into memory.
# - The file names are sorted so that post indices are the same on every
#   platform (os.listdir returns entries in arbitrary order).
# - Each file is opened in a `with` block so its handle is closed right away.
posts = []
for filename in sorted(os.listdir(DIR)):
    with open(os.path.join(DIR, filename)) as post_file:
        posts.append(post_file.read())
posts

['This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
 'Imaging databases provide storage capabilities.',
 'Most imaging databases save images permanently.\n',
 'Imaging databases store data.',
 'Imaging databases store data. Imaging databases store data. Imaging databases store data.']

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df = 1)

In [10]:
# Vectorize the toy posts: one row per post, one column per vocabulary word.
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print(f"#samples: {num_samples}, #features: {num_features}")

#samples: 5, #features: 25


In [11]:
# Column indices are assigned in sorted term order, so the sorted vocabulary
# keys are the feature names in column order. This avoids
# `get_feature_names()`, which was removed in scikit-learn 1.2.
print(sorted(vectorizer.vocabulary_))

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'save', 'storage', 'store', 'stuff', 'this', 'toy']


In [12]:
new_post = "imaging databases"

In [13]:
new_post_vec = vectorizer.transform([new_post])

In [14]:
print(new_post_vec)

  (0, 5)	1
  (0, 7)	1


In [15]:
print(new_post_vec.toarray())

[[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


Similarity measurement (the naive one) using the Euclidean distance between the count vectors of the new post and all the old posts.

In [16]:
import scipy as sp
def dist_raw(v1, v2):
    """Return the Euclidean distance between two sparse count vectors.

    Both arguments must support subtraction and expose ``toarray()``, as the
    rows of the vectorizer output do. No normalization is applied, so a post
    that repeats the same words ends up farther away than a single copy.
    """
    difference = (v1 - v2).toarray()
    return sp.linalg.norm(difference)

In [17]:
import sys
def find_closest_post(posts, post_vectors, query, query_vec, dist_func):
    """Print every post's distance to the query and return the closest post.

    Parameters
    ----------
    posts : sequence of str
        The raw post texts, in the same order as the rows of ``post_vectors``.
    post_vectors : sparse matrix
        Vectorized posts; row ``i`` is fetched with ``getrow(i)``.
    query : str
        The raw query text. Posts identical to it are skipped.
    query_vec : sparse matrix
        The vectorized query.
    dist_func : callable
        ``dist_func(post_vec, query_vec)`` returning a number; smaller is closer.

    Returns
    -------
    (best_i, best_dist)
        Index and distance of the closest post. ``best_i`` is ``None`` and
        ``best_dist`` is ``inf`` when no post was compared.
    """
    # `inf` is the natural starting value for a running minimum of floats.
    best_i, best_dist = None, float("inf")
    for i, post in enumerate(posts):
        if post == query:
            continue
        d = dist_func(post_vectors.getrow(i), query_vec)
        print(f"=== Post {i} with dist = {d:.2f}: {post}")
        if d < best_dist:
            best_i, best_dist = i, d
    return best_i, best_dist


best_i, best_dist = find_closest_post(posts, X_train, new_post, new_post_vec, dist_raw)

print()
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 4.00: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 1.73: Imaging databases provide storage capabilities.
=== Post 2 with dist = 2.00: Most imaging databases save images permanently.

=== Post 3 with dist = 1.41: Imaging databases store data.
=== Post 4 with dist = 5.10: Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 3 with dist = 1.41


In [18]:
print(X_train.getrow(3).toarray())
print(X_train.getrow(4).toarray())

[[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
[[0 0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]


**Normalizing word count vectors**

In [19]:
def dist_norm(v1, v2):
    """Return the Euclidean distance between two length-normalized sparse vectors.

    Each vector is scaled to unit length before the difference is taken, so a
    post that merely repeats the same words is as close as a single copy.

    An all-zero vector (for example a query with no in-vocabulary words) has
    no direction. It is left unscaled rather than divided by a zero norm.
    """
    def _to_unit_length(vec):
        length = sp.linalg.norm(vec.toarray())
        if length == 0:
            return vec
        return vec / length

    delta = _to_unit_length(v1) - _to_unit_length(v2)
    return sp.linalg.norm(delta.toarray())

In [20]:
# Repeat the nearest-post search, this time on length-normalized vectors.
best_doc, best_i = None, None
best_dist = sys.maxsize

for i, post in enumerate(posts):
    # A post identical to the query would trivially win, so leave it out.
    if post != new_post:
        post_vec = X_train.getrow(i)
        d = dist_norm(post_vec, new_post_vec)
        print(f"=== Post {i} with dist = {d:.2f}: {post}")
        if d < best_dist:
            best_dist, best_i = d, i

print()
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.92: Most imaging databases save images permanently.

=== Post 3 with dist = 0.77: Imaging databases store data.
=== Post 4 with dist = 0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 3 with dist = 0.77


**Removing less important words**

In [21]:
vectorizer = CountVectorizer(min_df = 1, stop_words = "english")

In [22]:
print(sorted(vectorizer.get_stop_words())[0:20])

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst']


In [23]:
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print(f"samples: {num_samples}, #features: {num_features}")

samples: 5, #features: 18


In [24]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

  (0, 4)	1
  (0, 6)	1


In [25]:
# Nearest-post search on the stop-word-filtered count vectors.
best_doc = None
best_i = None
best_dist = sys.maxsize

for i, post in enumerate(posts):
    if post != new_post:  # never compare the query with an identical post
        post_vec = X_train.getrow(i)
        d = dist_norm(post_vec, new_post_vec)

        print(f"=== Post {i} with dist = {d:.2f}: {post}")
        if d < best_dist:
            best_i = i
            best_dist = d

print()
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.86: Most imaging databases save images permanently.

=== Post 3 with dist = 0.77: Imaging databases store data.
=== Post 4 with dist = 0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 3 with dist = 0.77


**Stemming**

In [26]:
import nltk

In [27]:
s = nltk.stem.SnowballStemmer('english')
s.stem("graphics")

'graphic'

In [28]:
s.stem("imaging")

'imag'

In [29]:
s.stem("image")

'imag'

In [30]:
s.stem("imagination")

'imagin'

In [31]:
s.stem('imagine')

'imagin'

In [32]:
s.stem("buys")

'buy'

In [33]:
s.stem("buying")

'buy'

In [34]:
s.stem("bought")

'bought'

**Extending the vectorizer with NLTK's stemmer**

In [35]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer("english")

In [36]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose tokens are stemmed with the module-level `english_stemmer`."""

    def build_analyzer(self):
        """Return an analyzer that stems every token produced by the parent analyzer."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Lazily stem each token from the standard preprocessing pipeline.
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_analyzer

vectorizer = StemmedCountVectorizer(min_df = 1, stop_words = "english")

In [37]:
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print(f"samples: {num_samples}, #features: {num_features}")

samples: 5, #features: 17


In [38]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

  (0, 4)	1
  (0, 5)	1


In [39]:
# Nearest-post search on the stemmed, stop-word-filtered count vectors.
best_doc, best_dist, best_i = None, sys.maxsize, None

for i, post in enumerate(posts):
    # Skip a post that is literally the query itself.
    if post != new_post:
        post_vec = X_train.getrow(i)
        d = dist_norm(post_vec, new_post_vec)
        print(f"=== Post {i} with dist = {d:.2f}: {post}")

        if d < best_dist:
            best_dist, best_i = d, i

print()
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.63: Most imaging databases save images permanently.

=== Post 3 with dist = 0.77: Imaging databases store data.
=== Post 4 with dist = 0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 2 with dist = 0.63


**Stop words on steroids**

In [40]:
def tfidf(term, doc, corpus):
    """Return the TF-IDF weight of `term` in `doc` relative to `corpus`.

    tf  = occurrences of `term` in `doc` / length of `doc`
    idf = log(number of documents in `corpus` / number of documents containing `term`)

    Parameters
    ----------
    term : the token to score.
    doc : sequence of tokens (the document being scored).
    corpus : sequence of documents, each a sequence of tokens.

    Returns
    -------
    float
        ``tf * idf``. It is ``0.0`` when `doc` is empty or does not contain
        `term`, because the term frequency is then zero.

    Raises
    ------
    ZeroDivisionError
        If `term` occurs in `doc` but in no document of `corpus`. This can
        only happen when `doc` is not itself part of `corpus`.
    """
    # The early return also avoids dividing by zero for an empty document or
    # for a term that is absent from every document.
    if not doc or term not in doc:
        return 0.0
    tf = doc.count(term) / len(doc)
    num_docs_with_term = sum(1 for d in corpus if term in d)
    idf = np.log(len(corpus) / num_docs_with_term)
    return tf * idf

In [41]:
a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
D = [a, abb, abc]

In [42]:
print(tfidf("a", a, D))

0.0


In [43]:
print(tfidf("a", abb, D))

0.0


In [44]:
print(tfidf("a", abc, D))

0.0


In [45]:
print(tfidf("b", abb, D))

0.27031007207210955


In [46]:
print(tfidf("a", abc, D))

0.0


In [47]:
print(tfidf("b", abc, D))

0.13515503603605478


In [48]:
print(tfidf("c", abc, D))

0.3662040962227032


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose tokens are stemmed with the module-level `english_stemmer`."""

    def build_analyzer(self):
        """Return an analyzer that stems every token produced by the parent analyzer."""
        # Zero-argument super() starts the lookup at this class's parent.
        # The previous `super(TfidfVectorizer, self)` skipped TfidfVectorizer
        # in the MRO and would bypass any `build_analyzer` it defines.
        analyzer = super().build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedTfidfVectorizer(min_df = 1, stop_words = "english", decode_error = "ignore")

In [50]:
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print(f"#samples: {num_samples}, #features: {num_features}")

#samples: 5, #features: 17


In [51]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

  (0, 5)	0.7071067811865476
  (0, 4)	0.7071067811865476


In [52]:
# Nearest-post search on the stemmed TF-IDF vectors.
best_doc = best_i = None
best_dist = sys.maxsize

for i, post in enumerate(posts):
    # An exact copy of the query is not an interesting match.
    if post != new_post:
        post_vec = X_train.getrow(i)
        d = dist_norm(post_vec, new_post_vec)
        print(f"=== Post {i} with dist = {d:.2f} : {post}")
        if d < best_dist:
            best_dist, best_i = d, i

print()
print(f"Best post is {best_i} with dist = {best_dist:.2f}")

=== Post 0 with dist = 1.41 : This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 1.08 : Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.86 : Most imaging databases save images permanently.

=== Post 3 with dist = 0.92 : Imaging databases store data.
=== Post 4 with dist = 0.92 : Imaging databases store data. Imaging databases store data. Imaging databases store data.

Best post is 2 with dist = 0.86


## Clustering

### K-means

### Getting test data to evaluate our ideas on

In [53]:
import sklearn.datasets

In [54]:
all_data = sklearn.datasets.fetch_20newsgroups(subset = "all")
print(len(all_data.filenames))

18846


In [55]:
print(all_data.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [56]:
train_data = sklearn.datasets.fetch_20newsgroups(subset = 'train')
print(len(train_data.filenames))

11314


In [57]:
test_data = sklearn.datasets.fetch_20newsgroups(subset = 'test')
print(len(test_data.filenames))

7532


In [58]:
groups = ['comp.graphics' , 'comp.os.ms-windows.misc', 
          'comp.sys.ibm.pc.hardware','comp.sys.mac.hardware',
          'comp.windows.x', 'sci.space'
         ]
train_data = sklearn.datasets.fetch_20newsgroups(subset="train", categories = groups)
print(len(train_data.filenames))

3529


In [59]:
test_data = sklearn.datasets.fetch_20newsgroups(subset = "test", categories = groups)
print(len(test_data.filenames))

2349


### Clustering Posts

In [60]:
# Vectorize the newsgroup training posts with stemmed TF-IDF features.
# min_df = 10 and max_df = 0.5 bound how rare or how common a term may be.
vectorizer = StemmedTfidfVectorizer(
    min_df=10,
    max_df=0.5,
    stop_words='english',
    decode_error='ignore',
)
vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print(f'#samples: {num_samples}, #features: {num_features}')

#samples: 3529, #features: 4712


In [61]:
# Number of clusters to partition the training posts into.
num_clusters = 50
from sklearn.cluster import KMeans
# Randomly initialized K-means with a fixed random_state so the labels shown
# below are reproducible; verbose = 1 prints the inertia of every iteration.
km = KMeans(n_clusters = num_clusters, init = 'random', n_init = 1, verbose = 1, random_state = 3)
# Fit on the TF-IDF matrix; the per-post cluster ids end up in km.labels_.
km.fit(vectorized)

Initialization complete
Iteration 0, inertia 5899.5595831471655
Iteration 1, inertia 3218.297747726279
Iteration 2, inertia 3184.3328334733214
Iteration 3, inertia 3164.867358130041
Iteration 4, inertia 3152.003949571175
Iteration 5, inertia 3143.1109963529184
Iteration 6, inertia 3136.2559774422048
Iteration 7, inertia 3129.3248717684405
Iteration 8, inertia 3124.56747982014
Iteration 9, inertia 3121.9001105797406
Iteration 10, inertia 3120.209894571872
Iteration 11, inertia 3118.62745619288
Iteration 12, inertia 3117.362525978361
Iteration 13, inertia 3116.8112664390364
Iteration 14, inertia 3116.587892365764
Iteration 15, inertia 3116.417048753848
Iteration 16, inertia 3115.760414808626
Iteration 17, inertia 3115.3736535034473
Iteration 18, inertia 3115.155454436256
Iteration 19, inertia 3114.9491175607545
Iteration 20, inertia 3114.5149932662175
Iteration 21, inertia 3113.9369169464094
Iteration 22, inertia 3113.719999300366
Iteration 23, inertia 3113.547519005385
Iteration 24, ine

KMeans(init='random', n_clusters=50, n_init=1, random_state=3, verbose=1)

In [62]:
print(km.labels_)

[38 17 47 ... 41 14 16]


In [63]:
print(km.labels_.shape)

(3529,)


## Solving our initial challenge

In [64]:
# The query post. Adjacent string literals are concatenated by the parser;
# each piece ends with a space so that sentences are not glued together
# (the backslash-continued version produced "now.I tried" and "more.Any ideas").
new_post = (
    "Disk drive problems. Hi, I have a problem with my hard "
    "disk. After 1 year it is working only sporadically now. "
    "I tried to format it, but now it doesn't boot any more. "
    "Any ideas? Thanks."
)
new_post

"Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now.I tried to format it, but now it doesn't boot any more.Any ideas? Thanks."

In [65]:
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]
new_post_label

7

In [66]:
similar_indices = (km.labels_ == new_post_label).nonzero()[0]
# similar_indices
len(similar_indices)

166

In [67]:
# Rank every post in the query's cluster by its distance to the query.
similar = []

for i in similar_indices:
    # Subtract sparse from sparse and densify the difference once. The earlier
    # form subtracted a dense array from a sparse matrix.
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, train_data.data[i]))
# Tuples sort by distance first, so the closest post comes first.
similar = sorted(similar)
print(len(similar))

166


In [68]:
# Three reference points in the ranking: the closest post, the post one tenth
# of the way down, and the post in the middle.
show_at_1 = similar[0]
show_at_2 = similar[len(similar) // 10]
show_at_3 = similar[len(similar) // 2]

In [69]:
for i in [show_at_1, show_at_2, show_at_3]:
    print(f"{i[0]} \t {i[1]}")
    print("--------------------------------------------------------------------------------------------------------")
    

1.0378441731334074 	 From: Thomas Dachsel <GERTHD@mvs.sas.com>
Subject: BOOT PROBLEM with IDE controller
Nntp-Posting-Host: sdcmvs.mvs.sas.com
Organization: SAS Institute Inc.
Lines: 25

Hi,
I've got a Multi I/O card (IDE controller + serial/parallel
interface) and two floppy drives (5 1/4, 3 1/2) and a
Quantum ProDrive 80AT connected to it.
I was able to format the hard disk, but I could not boot from
it. I can boot from drive A: (which disk drive does not matter)
but if I remove the disk from drive A and press the reset switch,
the LED of drive A: continues to glow, and the hard disk is
not accessed at all.
I guess this must be a problem of either the Multi I/o card
or floppy disk drive settings (jumper configuration?)
Does someone have any hint what could be the reason for it.
Please reply by email to GERTHD@MVS.SAS.COM
Thanks,
Thomas
+-------------------------------------------------------------------+
| Thomas Dachsel                                                    |
| Internet

### Another Look at Noise

In [70]:
# Pair every training post with its numeric target so the group name can be looked up.
post_group = zip(train_data.data, train_data.target)
# (length, text, group name) for every post. The list is called `all_posts`
# because the previous name `all` shadowed the builtin for the rest of the session.
all_posts = [(len(text), text, train_data.target_names[label]) for text, label in post_group]
# Sorting the tuples orders the comp.graphics posts from shortest to longest.
graphics = sorted(post for post in all_posts if post[2] == 'comp.graphics')
graphics[5]

(245,
 'comp.graphics')

In [71]:
noise_post = graphics[5][1]
analyzer = vectorizer.build_analyzer()
print(list(analyzer(noise_post)))

['situnaya', 'ibm3090', 'bham', 'ac', 'uk', 'subject', 'test', 'sorri', 'organ', 'univers', 'birmingham', 'unit', 'kingdom', 'line', 'nntp', 'post', 'host', 'ibm3090', 'bham', 'ac', 'uk']


In [72]:
# Keep only the tokens of the noisy post that survived the min_df / max_df
# filtering. The keys of `vocabulary_` are exactly the retained features, so
# intersecting with it avoids `get_feature_names()`, which was removed in
# scikit-learn 1.2.
useful = set(analyzer(noise_post)).intersection(vectorizer.vocabulary_)
print(sorted(useful))

['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers']


In [73]:
# Show the IDF weight of each surviving term.
for term in sorted(useful):
    # Use the public `idf_` attribute rather than the private `_tfidf` transformer.
    idf = vectorizer.idf_[vectorizer.vocabulary_[term]]
    print(f"IDF({term}) = {idf:.2f}    ")

IDF(ac) = 3.51    
IDF(birmingham) = 6.77    
IDF(host) = 1.74    
IDF(kingdom) = 6.68    
IDF(nntp) = 1.77    
IDF(sorri) = 4.14    
IDF(test) = 3.83    
IDF(uk) = 3.70    
IDF(unit) = 4.42    
IDF(univers) = 1.91    
