In [1]:
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import scipy as sp
import nltk.stem
import sklearn.datasets
import pickle
from sklearn.cluster import KMeans

In [2]:
# f = open('textmine.pickle', 'wb')  
# save = {
#     'all_data': all_data,
#     'train_data' : train_data,
#     'test_data' : test_data
#     }
# pickle.dump(save,f)
# f.close()


with open("textmine.pickle", 'rb') as f:
  save = pickle.load(f)
  all_data = save['all_data']

In [3]:
# print(np.__file__)

In [6]:
s = nltk.stem.SnowballStemmer('english')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        #StemmedCountVectorizer will return CountVectorizer build_analyzer code
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (s.stem(w) for w in analyzer(doc))
    
vectorizer = StemmedTfidfVectorizer(min_df=1,stop_words='english',decode_error='ignore')


In [24]:
posts = ["This is a toy post about machine learning. Actually, it contains not much interesting stuff.", 
         "Imaging databases can get huge.",
         "Most imaging databases save images permanently.",
         "Imaging databases store images.",
         "Imaging databases store images. Imaging databases store images.Imaging databases store images."]

In [30]:
X_train = vectorizer.fit_transform(posts)
vectorizer.get_feature_names()

['actual',
 'contain',
 'databas',
 'huge',
 'imag',
 'interest',
 'learn',
 'machin',
 'perman',
 'post',
 'save',
 'store',
 'stuff',
 'toy']

In [31]:
X_train.toarray()
num_samples, num_features = X_train.shape

In [32]:
X_train.shape

(5, 14)

In [5]:
def dist_raw(v1,v2):
    delta = v1-v2
    return sp.linalg.norm(delta.toarray())
# Distance Coefficients between Two Lists or Sets in The Python Papers Source Codes

best_doc = None
best_dist = 300000

for i,post in enumerate(X_train):
    post_vec = X_train.getrow(i)
    d = dist_raw(post_vec,new_post_vec)
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best Post is %i with dist=%.2f"%(best_i,best_dist))

NameError: name 'X_train' is not defined

In [8]:
all_data = sklearn.datasets.fetch_20newsgroups(subset='all')

In [7]:
groups = ['comp.graphics', 'comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware','comp.windows.x', 'sci.space']
train_data = sklearn.datasets.fetch_20newsgroups(subset='train',categories=groups)
test_data = sklearn.datasets.fetch_20newsgroups(subset='test',categories=groups)

In [8]:
vectorizer = StemmedTfidfVectorizer(min_df=10,max_df=0.5,stop_words='english',decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)

In [9]:
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples,num_features))

#samples: 3529, #features: 4712


In [10]:
km = KMeans(n_clusters=50, init='random', n_init=1,verbose=1, random_state=3)
km.fit(vectorized)

Initialization complete
Iteration  0, inertia 5899.560
Iteration  1, inertia 3218.298
Iteration  2, inertia 3184.333
Iteration  3, inertia 3164.867
Iteration  4, inertia 3152.004
Iteration  5, inertia 3143.111
Iteration  6, inertia 3136.256
Iteration  7, inertia 3129.325
Iteration  8, inertia 3124.567
Iteration  9, inertia 3121.900
Iteration 10, inertia 3120.210
Iteration 11, inertia 3118.627
Iteration 12, inertia 3117.363
Iteration 13, inertia 3116.811
Iteration 14, inertia 3116.588
Iteration 15, inertia 3116.417
Iteration 16, inertia 3115.760
Iteration 17, inertia 3115.374
Iteration 18, inertia 3115.155
Iteration 19, inertia 3114.949
Iteration 20, inertia 3114.515
Iteration 21, inertia 3113.937
Iteration 22, inertia 3113.720
Iteration 23, inertia 3113.548
Iteration 24, inertia 3113.475
Iteration 25, inertia 3113.447
Converged at iteration 25


KMeans(copy_x=True, init='random', max_iter=300, n_clusters=50, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=3, tol=0.0001,
    verbose=1)

In [11]:
print(km.labels_)
print(km.labels_.shape)
print(km.cluster_centers_)

[38 17 47 ..., 41 14 16]
(3529,)
[[ 0.          0.          0.         ...,  0.          0.          0.05115952]
 [ 0.00255397  0.          0.         ...,  0.          0.          0.        ]
 [ 0.00585929  0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.07535044  0.         ...,  0.          0.          0.        ]]


In [12]:
new_post = "Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. I tried to format it, but now it doesn't boot any more.Any ideas? Thanks."
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]
similar_indices = (km.labels_==new_post_label).nonzero()[0]

In [13]:
similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i].toarray()))
    similar.append((dist, train_data.data[i]))

similar = sorted(similar)

In [14]:
print(similar[0])
print(similar[1])
print(similar[2])

(1.0378441731334074, "From: Thomas Dachsel <GERTHD@mvs.sas.com>\nSubject: BOOT PROBLEM with IDE controller\nNntp-Posting-Host: sdcmvs.mvs.sas.com\nOrganization: SAS Institute Inc.\nLines: 25\n\nHi,\nI've got a Multi I/O card (IDE controller + serial/parallel\ninterface) and two floppy drives (5 1/4, 3 1/2) and a\nQuantum ProDrive 80AT connected to it.\nI was able to format the hard disk, but I could not boot from\nit. I can boot from drive A: (which disk drive does not matter)\nbut if I remove the disk from drive A and press the reset switch,\nthe LED of drive A: continues to glow, and the hard disk is\nnot accessed at all.\nI guess this must be a problem of either the Multi I/o card\nor floppy disk drive settings (jumper configuration?)\nDoes someone have any hint what could be the reason for it.\nPlease reply by email to GERTHD@MVS.SAS.COM\nThanks,\nThomas\n+-------------------------------------------------------------------+\n| Thomas Dachsel                                         

In [15]:
post_group = zip(train_data.data, train_data.target)
all = [(len(post[0]), post[0], train_data.target_names[post[1]]) for post in post_group]
graphics = sorted([post for post in all if post[2]=='comp.graphics'])
print(graphics[5])



In [18]:
noise_post = graphics[5][1]
analyzer = vectorizer.build_analyzer()
print(list(analyzer(noise_post)))

['situnaya', 'ibm3090', 'bham', 'ac', 'uk', 'subject', 'test', 'sorri', 'organ', 'univers', 'birmingham', 'unit', 'kingdom', 'line', 'nntp', 'post', 'host', 'ibm3090', 'bham', 'ac', 'uk']


In [24]:
useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names())

In [25]:
useful

{'ac',
 'birmingham',
 'host',
 'kingdom',
 'nntp',
 'sorri',
 'test',
 'uk',
 'unit',
 'univers'}

21