In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import sys
import os

# クラスタリングを用いて, 関連する文書を見つける

Q&Aサイトにおいて, ユーザーが入力したQuestionに関連するQuestionを提示する問題について考える.

Reference : 実践 機械学習システム Willi Richert, Luis Pedro Coelho著

## load data

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
DIR = "./textdata/"

# Read every file in DIR into memory, one post per file.
# - sorted(): os.listdir returns names in arbitrary order, which would make
#   the post indices printed later differ between machines/runs.
# - with: closes each file handle instead of leaking it.
posts = []
for fname in sorted(os.listdir(DIR)):
    with open(os.path.join(DIR, fname)) as fh:
        posts.append(fh.read())

In [7]:
posts

['This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
 'Imaging databases provide storage capabilities.',
 'Most imaging databases save images permanently.',
 'Imaging databases store data.',
 'Imaging databases store data. Imaging databases store data. Imaging databases store data.']

## BoWの計算

In [8]:
vectorizer = CountVectorizer(min_df=1) # min_df=1 keeps every term found in at least one document (nothing is filtered out)
# Learn the vocabulary from posts and build the document-term count matrix
X_train = vectorizer.fit_transform(posts)
num_samples,num_features = X_train.shape
print("samples %d , features %d" %(num_samples,num_features))

samples 5 , features 25


In [10]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, "get_feature_names_out")
                 else vectorizer.get_feature_names())
list(feature_names)

['about',
 'actually',
 'capabilities',
 'contains',
 'data',
 'databases',
 'images',
 'imaging',
 'interesting',
 'is',
 'it',
 'learning',
 'machine',
 'most',
 'much',
 'not',
 'permanently',
 'post',
 'provide',
 'save',
 'storage',
 'store',
 'stuff',
 'this',
 'toy']

## ユークリッド距離を用いた類似度の計算

In [11]:
new_post = "imaging databases" # new (query) document
# Vectorize the query with the vocabulary already fitted on posts
new_post_vec = vectorizer.transform([new_post])
# Display the dense form of the sparse query vector
new_post_vec.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]], dtype=int64)

In [15]:
def dist_raw(v1,v2):
    """
    Return the Euclidean distance between two sparse vectors.

    Both arguments must support subtraction and expose ``toarray()``;
    the difference is densified before its norm is taken.
    """
    return sp.linalg.norm((v1 - v2).toarray())

In [21]:
best_doc = None
best_dist = float("inf") # any real distance compares smaller than this
best_i = None

# Find the training post closest to new_post (raw Euclidean distance on counts)
for i, post in enumerate(posts):
    if post == new_post: # an identical post is not a useful suggestion; skip it
        continue
    d = dist_raw(X_train.getrow(i), new_post_vec)
    print("=== Post %i with dist=%.2f: %s" %(i,d,post))
    if d < best_dist:
        best_dist, best_i, best_doc = d, i, post

# best_i stays None when every post was skipped; "%i" % None would raise TypeError
if best_i is None:
    print("No candidate post found")
else:
    print("Best post is %i with dist=%.2f" %(best_i,best_dist))

=== Post 0 with dist=4.00: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=1.73: Imaging databases provide storage capabilities.
=== Post 2 with dist=2.00: Most imaging databases save images permanently.
=== Post 3 with dist=1.41: Imaging databases store data.
=== Post 4 with dist=5.10: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=1.41


文書3が最も類似している文書であるという結果が得られた. また, 文書4が最も類似していない文書であるという結果が得られた. しかし文書4は文書3を3回繰り返しただけの文書であるから, 単語の出現回数をそのまま特徴量として用いるのは単純すぎると考えられる. そこで特徴量ベクトルを正規化する処理を行う.

In [22]:
def dist_norm(v1,v2):
    """
    Return the Euclidean distance between two sparse vectors after scaling
    each of them to unit length.

    Normalizing first makes the result independent of how often a document
    repeats its words. An all-zero vector (e.g. a query containing no known
    vocabulary) has no direction, so it is left unscaled instead of being
    divided by zero.
    """
    def _unit(vec):
        # Scale vec to length 1; leave zero vectors untouched.
        length = sp.linalg.norm(vec.toarray())
        return vec / length if length > 0 else vec

    delta = _unit(v1) - _unit(v2)
    return sp.linalg.norm(delta.toarray())

In [23]:
best_doc = None
best_dist = float("inf") # any real distance compares smaller than this
best_i = None

# Find the training post closest to new_post (distance between length-normalized vectors)
for i, post in enumerate(posts):
    if post == new_post: # an identical post is not a useful suggestion; skip it
        continue
    d = dist_norm(X_train.getrow(i), new_post_vec)
    print("=== Post %i with dist=%.2f: %s" %(i,d,post))
    if d < best_dist:
        best_dist, best_i, best_doc = d, i, post

# best_i stays None when every post was skipped; "%i" % None would raise TypeError
if best_i is None:
    print("No candidate post found")
else:
    print("Best post is %i with dist=%.2f" %(best_i,best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist=0.92: Most imaging databases save images permanently.
=== Post 3 with dist=0.77: Imaging databases store data.
=== Post 4 with dist=0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=0.77


正規化を行うことで, 文書3と文書4の類似度が同じになったことが読み取れる.

## 重要度の低い単語の除去

In [24]:
vectorizer = CountVectorizer(min_df=1,stop_words="english") # min_df=1 filters nothing; stop_words="english" drops words on the built-in English stop-word list
# Refit on posts so the vocabulary no longer contains stop words
X_train = vectorizer.fit_transform(posts)
num_samples,num_features = X_train.shape
print("samples %d , features %d" %(num_samples,num_features))

samples 5 , features 18


In [25]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, "get_feature_names_out")
                 else vectorizer.get_feature_names())
list(feature_names)

['actually',
 'capabilities',
 'contains',
 'data',
 'databases',
 'images',
 'imaging',
 'interesting',
 'learning',
 'machine',
 'permanently',
 'post',
 'provide',
 'save',
 'storage',
 'store',
 'stuff',
 'toy']

In [27]:
new_post = "imaging databases" # new (query) document
# Re-vectorize the query: the vocabulary changed when the vectorizer was refitted
new_post_vec = vectorizer.transform([new_post])
# Display the dense form of the sparse query vector
new_post_vec.toarray()

array([[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [28]:
best_doc = None
best_dist = float("inf") # any real distance compares smaller than this
best_i = None

# Find the training post closest to new_post (normalized distance, stop words removed)
for i, post in enumerate(posts):
    if post == new_post: # an identical post is not a useful suggestion; skip it
        continue
    d = dist_norm(X_train.getrow(i), new_post_vec)
    print("=== Post %i with dist=%.2f: %s" %(i,d,post))
    if d < best_dist:
        best_dist, best_i, best_doc = d, i, post

# best_i stays None when every post was skipped; "%i" % None would raise TypeError
if best_i is None:
    print("No candidate post found")
else:
    print("Best post is %i with dist=%.2f" %(best_i,best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist=0.86: Most imaging databases save images permanently.
=== Post 3 with dist=0.77: Imaging databases store data.
=== Post 4 with dist=0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=0.77


## stemming
images, imagingの語幹を揃える処理を行う.

In [29]:
import nltk.stem

In [31]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose tokens are reduced to their stems."""

    def build_analyzer(self):
        """Wrap the parent analyzer so that every token it yields is stemmed.

        The stemmer is the notebook-level ``english_stemmer``; it is looked up
        when the returned callable runs, not when this method is called.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens

In [32]:
# Notebook-level stemmer; the stemming vectorizers' analyzers look this name up when they run
english_stemmer = nltk.stem.SnowballStemmer("english")
vectorizer = StemmedCountVectorizer(min_df=1,stop_words="english")
# Refit on posts so the vocabulary consists of stems
X_train = vectorizer.fit_transform(posts)
num_samples,num_features = X_train.shape
print("samples %d , features %d" %(num_samples,num_features))

samples 5 , features 17


In [33]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, "get_feature_names_out")
                 else vectorizer.get_feature_names())
list(feature_names)

['actual',
 'capabl',
 'contain',
 'data',
 'databas',
 'imag',
 'interest',
 'learn',
 'machin',
 'perman',
 'post',
 'provid',
 'save',
 'storag',
 'store',
 'stuff',
 'toy']

In [34]:
new_post = "imaging databases" # new (query) document
# Re-vectorize the query with the stemmed vocabulary
new_post_vec = vectorizer.transform([new_post])
# Display the dense form of the sparse query vector
new_post_vec.toarray()

array([[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [35]:
best_doc = None
best_dist = float("inf") # any real distance compares smaller than this
best_i = None

# Find the training post closest to new_post (normalized distance on stemmed counts)
for i, post in enumerate(posts):
    if post == new_post: # an identical post is not a useful suggestion; skip it
        continue
    d = dist_norm(X_train.getrow(i), new_post_vec)
    print("=== Post %i with dist=%.2f: %s" %(i,d,post))
    if d < best_dist:
        best_dist, best_i, best_doc = d, i, post

# best_i stays None when every post was skipped; "%i" % None would raise TypeError
if best_i is None:
    print("No candidate post found")
else:
    print("Best post is %i with dist=%.2f" %(best_i,best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist=0.63: Most imaging databases save images permanently.
=== Post 3 with dist=0.77: Imaging databases store data.
=== Post 4 with dist=0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 2 with dist=0.63


## TF-IDFを用いる

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant whose tokens are reduced to their stems."""

    def build_analyzer(self):
        """Wrap the parent analyzer so that every token it yields is stemmed.

        The stemmer is the notebook-level ``english_stemmer``; it is looked up
        when the returned callable runs, not when this method is called.
        """
        # Zero-argument super() starts the lookup right after this class.
        # Naming TfidfVectorizer here would skip TfidfVectorizer itself in the MRO.
        analyzer = super().build_analyzer()
        return lambda doc:(english_stemmer.stem(w) for w in analyzer(doc))

In [54]:
# Same stemming and stop-word setup as before, but producing TF-IDF weights
vectorizer = StemmedTfidfVectorizer(min_df=1,stop_words="english")
X_train = vectorizer.fit_transform(posts)
num_samples,num_features = X_train.shape
print("samples %d , features %d" %(num_samples,num_features))

samples 5 , features 17


In [55]:
best_doc = None
best_dist = float("inf") # any real distance compares smaller than this
best_i = None

# The vectorizer was replaced by the TF-IDF one, so the query must be
# re-vectorized; otherwise a stale count-based vector is compared with TF-IDF rows.
new_post_vec = vectorizer.transform([new_post])

# Find the training post closest to new_post (normalized distance on TF-IDF vectors)
for i, post in enumerate(posts):
    if post == new_post: # an identical post is not a useful suggestion; skip it
        continue
    d = dist_norm(X_train.getrow(i), new_post_vec)
    print("=== Post %i with dist=%.2f: %s" %(i,d,post))
    if d < best_dist:
        best_dist, best_i, best_doc = d, i, post

# best_i stays None when every post was skipped; "%i" % None would raise TypeError
if best_i is None:
    print("No candidate post found")
else:
    print("Best post is %i with dist=%.2f" %(best_i,best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=1.08: Imaging databases provide storage capabilities.
=== Post 2 with dist=0.86: Most imaging databases save images permanently.
=== Post 3 with dist=0.92: Imaging databases store data.
=== Post 4 with dist=0.92: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 2 with dist=0.86


## newsデータを使用して関連文書を見つけるプログラムを実行する

In [48]:
import sklearn.datasets
# Newsgroup categories (sub-directory names) to load
groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
          'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
# Load the selected categories from the local training directory.
# No encoding is passed, so the documents are kept as raw bytes.
train_data = sklearn.datasets.load_files("./20news-bydate-train",categories=groups)

In [51]:
len(train_data.filenames)

3529

In [60]:
# min_df=10 / max_df=0.5: ignore terms found in fewer than 10 documents or in more than half of them
vectorizer = StemmedTfidfVectorizer(min_df=10,max_df=0.5,stop_words="english",decode_error="ignore") # decode_error="ignore": skip bytes that cannot be decoded
# Fit on the raw newsgroup documents and build the TF-IDF matrix
vectorized = vectorizer.fit_transform(train_data.data)
num_samples,num_features = vectorized.shape
print("samples %d , features %d" %(num_samples,num_features))

samples 3529 , features 4713


In [58]:
from sklearn.cluster import KMeans

In [61]:
n_clusters=50
# With init="random" and a single initialization (n_init=1) the clustering
# changes on every run unless the seed is fixed.
RANDOM_STATE = 3
model = KMeans(n_clusters=n_clusters,init="random",n_init=1,verbose=1,random_state=RANDOM_STATE)
model.fit(vectorized)

Initialization complete
Iteration 0, inertia 5923.803883312907
Iteration 1, inertia 3207.479772762691
Iteration 2, inertia 3170.175656054051
Iteration 3, inertia 3151.09387971308
Iteration 4, inertia 3139.2613820461893
Iteration 5, inertia 3131.7715363446837
Iteration 6, inertia 3128.3920958981525
Iteration 7, inertia 3124.970058624086
Iteration 8, inertia 3123.5550620211015
Iteration 9, inertia 3122.466962700782
Iteration 10, inertia 3121.746288937305
Iteration 11, inertia 3121.1483608070535
Iteration 12, inertia 3120.099768227793
Iteration 13, inertia 3119.140247495304
Iteration 14, inertia 3118.702072741001
Iteration 15, inertia 3118.6756902346233
Converged at iteration 15: strict convergence.


KMeans(init='random', n_clusters=50, n_init=1, verbose=1)

verboseで表示されるinertiaは, 各インスタンスから最も近い重心までのユークリッド距離の二乗の総和である.

In [62]:
# Query post for which related newsgroup posts are looked up (text kept verbatim as input data)
new_post = "Disk drive problem. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. I tried to format it, but now it dosen't boot any more. Any ideas? Thanks."

In [64]:
# Vectorize the query with the vocabulary fitted on the newsgroup corpus
new_post_vec = vectorizer.transform([new_post])
# Cluster the query falls into (predict returns one label per row; take the only one)
new_post_label = model.predict(new_post_vec)[0]
# Row indices of all training posts assigned to that same cluster
similar_indices = (model.labels_ == new_post_label).nonzero()[0]

In [67]:
# Rank every training post in the query's cluster by Euclidean distance to the query
similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec-vectorized[i]).toarray())
    # train_data is the corpus the vectors were built from, so row i of
    # vectorized corresponds to train_data.data[i]
    similar.append((dist,train_data.data[i]))
similar = sorted(similar) # ascending distance: most similar first
print(len(similar))

112


In [68]:
# Show the most similar document (smallest distance)
similar[0]

(1.0236597431380714,
 b"From: Thomas Dachsel <GERTHD@mvs.sas.com>\nSubject: BOOT PROBLEM with IDE controller\nNntp-Posting-Host: sdcmvs.mvs.sas.com\nOrganization: SAS Institute Inc.\nLines: 25\n\nHi,\nI've got a Multi I/O card (IDE controller + serial/parallel\ninterface) and two floppy drives (5 1/4, 3 1/2) and a\nQuantum ProDrive 80AT connected to it.\nI was able to format the hard disk, but I could not boot from\nit. I can boot from drive A: (which disk drive does not matter)\nbut if I remove the disk from drive A and press the reset switch,\nthe LED of drive A: continues to glow, and the hard disk is\nnot accessed at all.\nI guess this must be a problem of either the Multi I/o card\nor floppy disk drive settings (jumper configuration?)\nDoes someone have any hint what could be the reason for it.\nPlease reply by email to GERTHD@MVS.SAS.COM\nThanks,\nThomas\n+-------------------------------------------------------------------+\n| Thomas Dachsel                                       

In [70]:
# Show the document ranked in the middle of the similarity ordering
similar[len(similar)//2]

(1.2895660283716477,
 b"From: ksc@cbnewsk.cb.att.com (kenneth.s.cobler)\nSubject: XFree86 and Esix 4.0.4\nOrganization: AT&T\nDistribution: na\nKeywords: esix\nLines: 39\n\nHello Netlanders:\n\n       I am a novice X user with a question for any Xgod.\n\n       My computer configuration with the X problem is as follows:\n\n       486DX50/256/16RAM  running Esix 4.0.4 \n       Wangtek AT-style interface 250 M tape drive.\n       I have loaded the Basic OS (which includes nsu) and\n       inet utilities (tcp/ip).\n       I ftp-ed the XFree86 (X11R5) binaries and installed properly.\n \n I can execute startx and run X-windows with no problems.\n However, if I try to access the tape drive while in X, the\n machine locks up instantly.  If I am out of X and access the\n tape, the tape drive works fine.  Soon as I try to\n startx again; the screen changes modes, but, the grey background\n pattern does not come up and no xterm is forked.  I have to login\n from another terminal and execute a s

In [71]:
# Show the least similar document within the cluster (largest distance)
similar[-1]

(1.3881773610496,
 b'From: thorf@csa.bu.edu (Thor Farrish)\nSubject: Maxtor drive geometry/jumpers\nDistribution: usa\nOrganization: Computer Science Department, Boston University, Boston, MA, USA\nLines: 1\n\n\n')