# データマイニング Report3

+ 全体の流れ
    + NLTKの解説本の0章〜12章まで、計13個のHTMLファイルをダウンロードせよ。
    + BoWベースの特徴ベクトル（Level 1 もしくは Level 2）を生成せよ。
    + 共起行列ベースの特徴ベクトル（Level3）を生成せよ。
    + ラベル付き文書に対して分類タスク（Level4）を実行せよ。
+ Level 1: 文書ファイル毎に、``Bag-of-Words``で特徴ベクトルを生成せよ。
+ Level 2: ``BoW``に``TF-IDF``で重み調整した特徴ベクトルを生成せよ。
+ Level 3: 単語の``共起行列``から特徴ベクトルを生成せよ。
+ Level 4: ``文書分類``せよ。
+ オプション例
    + 相互情報量から``特徴ベクトル``を生成してみよう。
    + 共起行列に基づいた特徴ベクトル、もしくは相互特徴量に基づいた特徴ベクトルを``SVD``により``次元削減``してみよう。
    + SVDによる次元削減時に``2次元``とせよ。気になる単語1つを選び、上位10件と下位10件を2次元空間にマッピングせよ。マッピング結果、どのように散らばっているか観察し、想定とどのぐらい似通っているか考察してみよう。
    + ``日本語文書``について自然言語処理してみよう。

In [75]:
import os
import nltk
from nltk.tokenize import wordpunct_tokenize, sent_tokenize
import numpy as np
import glob
import scipy.spatial.distance as distance

# LEVEL1

+ collect_words_eng(): 英文書集合から単語コードブック作成
    

nltkのdownloadするべきmoudle

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/e175751/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/e175751/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/e175751/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Bag-of-Words

## 文書集合からターム素性集合（コードブック）を作る

In [3]:

def collect_words_eng(docs):
    '''
    英文書集合から単語コードブック作成。
    シンプルに文書集合を予め決めうちした方式で処理する。
    必要に応じて指定できるようにしていた方が使い易いかも。

    :param docs(list): 1文書1文字列で保存。複数文書をリストとして並べたもの。
    :return (list): 文分割、単語分割、基本形、ストップワード除去した、ユニークな単語一覧。
    '''
    
    codebook = []
    stopwords = nltk.corpus.stopwords.words('english') 
    
    #stopwords.append('.')   # ピリオドを追加。
    #stopwords.append(',')   # カンマを追加。
    #stopwords.append('')    # 空文字を追加。
    
    symbol = ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s","<",">","_"]
    SWList = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
              "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 
              'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 
              'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 
              'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
              'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 
              'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 
              'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
              'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 
              'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 
              'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
              'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',
              'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', 
              "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
              'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
              "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    
    clean_frequency = nltk.FreqDist(w.lower() for w in docs if w.lower() not in stopwords + symbol + SWList)
    
    wnl = nltk.stem.wordnet.WordNetLemmatizer()
    
    for doc in docs:
        for sent in sent_tokenize(doc):
            for word in wordpunct_tokenize(sent):
                this_word = wnl.lemmatize(word.lower())
                if this_word not in codebook and this_word not in clean_frequency:
                    codebook.append(this_word)
    return codebook

サンプル(test)

In [101]:
docs3 = []
docs3.append("This is test.")
docs3.append("That is test too.")
docs3.append("There are so many many tests.")

``clean_frequencya``を使った場合
これにより、vector数が10個になる

In [102]:
codebook = collect_words_eng(docs3)
print('codebook = ',codebook)

codebook =  ['this', 'is', 'test', '.', 'that', 'too', 'there', 'are', 'so', 'many']


``stopwords``のままの場合
これにより、vector数が2個となる

In [103]:
codebook = collect_words_eng(docs3)
print('codebook = ',codebook)

codebook =  ['this', 'is', 'test', '.', 'that', 'too', 'there', 'are', 'so', 'many']


## コードブックを素性とする文書ベクトルを作る (直接ベクトル生成)

In [4]:
def make_vectors_eng(docs, codebook):
    '''コードブックを素性とする文書ベクトルを作る（直接ベクトル生成）

    :param docs(list): 1文書1文字列で保存。複数文書をリストとして並べたもの。
    :param codebook(list): ユニークな単語一覧。
    :return (list): コードブックを元に、出現回数を特徴量とするベクトルを返す。
    '''
    vectors = []
    wnl = nltk.stem.wordnet.WordNetLemmatizer()
    for doc in docs:
        this_vector = []
        fdist = nltk.FreqDist()
        for sent in sent_tokenize(doc):
            for word in wordpunct_tokenize(sent):
                this_word = wnl.lemmatize(word.lower())
                fdist[this_word] += 1
        for word in codebook:
            this_vector.append(fdist[word])
        vectors.append(this_vector)
    return vectors


In [57]:
vectors = make_vectors_eng(docs3, codebook)
for index in range(len(docs3)):
    print('docs[{}] = {}'.format(index,docs3[index]))
    print('vectors[{}] = {}'.format(index,vectors[index]))
    print('----')

docs[0] = This is test.
vectors[0] = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
----
docs[1] = That is test too.
vectors[1] = [0, 1, 1, 1, 1, 1, 0, 0, 0, 0]
----
docs[2] = There are so many many tests.
vectors[2] = [0, 0, 1, 1, 0, 0, 1, 1, 1, 2]
----


## ユークリッド距離

In [5]:
def euclidean_distance(vectors):
    vectors = np.array(vectors)
    distances = []
    for i in range(len(vectors)):
        temp = []
        for j in range(len(vectors)):
            temp.append(np.linalg.norm(vectors[i] - vectors[j]))
        distances.append(temp)
    return distances

In [59]:
distances = euclidean_distance(vectors)
print('# euclidean_distance')
for index in range(len(distances)):
    print(distances[index])


# euclidean_distance
[0.0, 1.7320508075688772, 3.0]
[1.7320508075688772, 0.0, 3.1622776601683795]
[3.0, 3.1622776601683795, 0.0]


## コサイン類似度

In [6]:
def cosine_similarity(vectors):
    vectors = np.array(vectors)
    distances = []
    for i in range(len(vectors)):
        temp = []
        for j in range(len(vectors)):
            temp.append(distance.cosine(vectors[i], vectors[j]))
        distances.append(temp)
    return distances

In [61]:
similarities = cosine_similarity(vectors)
print('# cosine_similarity')
for index in range(len(similarities)):
    print(similarities[index])

# cosine_similarity
[0.0, 0.3291796067500631, 0.6666666666666667]
[0.3291796067500631, 0.0, 0.7018576030000281]
[0.6666666666666667, 0.7018576030000281, 0.0]


## それでは実際に文章を分類する

fileのpathを配列に格納する

In [7]:
List_Data_NL=[]
for i in range(1,14):
    List_Data_NL = glob.glob( "./data/*.html")

In [8]:
List_Data_NL

['./data/kadai1.html',
 './data/kadai6.html',
 './data/kadai10.html',
 './data/kadai11.html',
 './data/kadai7.html',
 './data/kadai4.html',
 './data/kadai12.html',
 './data/kadai8.html',
 './data/kadai9.html',
 './data/kadai13.html',
 './data/kadai5.html',
 './data/kadai2.html',
 './data/kadai3.html']

In [9]:
DataPath = "./data/kadai"

In [10]:
sentence = []
for i in range(1,len(List_Data_NL)+1):
    with open(DataPath +str(i) + ".html" ) as f:
        r = f.read()
        sentence.append(r)

In [11]:
len(sentence)

13

### コードブック生成

In [22]:
codebook = collect_words_eng(sentence)
print('codebook = ',codebook)




### 文書ベクトル

In [13]:
vectors = make_vectors_eng(sentence, codebook)
for index in range(len(sentence)):
    print('docs[{}] = {}'.format(index,sentence[index]))
    print('vectors[{}] = {}'.format(index,vectors[index]))
    print('----')

docs[0] = 
<!-- saved from url=(0035)https://www.nltk.org/book/ch00.html -->
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><script language="javascript" type="text/javascript">

function astext(node)
{
    return node.innerHTML.replace(/(<([^>]+)>)/ig,"")
                         .replace(/&gt;/ig, ">")
                         .replace(/&lt;/ig, "<")
                         .replace(/&quot;/ig, '"')
                         .replace(/&amp;/ig, "&");
}

function copy_notify(node, bar_color, data)
{
    // The outer box: relative + inline positioning.
    var box1 = document.createElement("div");
    box1.style.position = "relative";
    box1.style.display = "inline";
    box1.style.top = "2em";
    box1.style.left = "1em";
  
    // A shadow for fun
    var shadow = document.createElement("div");
    shadow.style.position = "absolute";
    shadow.style.left = "-1.3em";
    shadow.style.top = 

### ユークリッド距離を求める

In [69]:
distances = euclidean_distance(vectors)
print('# euclidean_distance')
for index in range(len(distances)):
    print(distances[index])

# euclidean_distance
[0.0, 6163.175155713166, 7319.142436105476, 9850.352531762506, 8523.743191814263, 6679.858830843658, 4453.808931689819, 4086.666979336584, 4245.508567886774, 5464.20259507277, 1141.4512692182702, 3375.864037546536, 622.1864672266667]
[6163.175155713166, 0.0, 1715.393249374615, 3785.4739465488333, 2552.0771931898926, 1111.17505371566, 2349.0500207530704, 2428.4746241210755, 2230.7610808869695, 1764.569635916928, 7044.810217457955, 3208.733083321204, 6584.136237958628]
[7319.142436105476, 1715.393249374615, 0.0, 2865.290735684601, 1869.5748179733273, 1179.951693926493, 3459.659231774135, 3491.943728068939, 3480.6282478885905, 2851.036653570066, 8204.946617742251, 4300.851776102031, 7750.336508823343]
[9850.352531762506, 3785.4739465488333, 2865.290735684601, 0.0, 1785.4705262199093, 3383.432428762247, 5865.009207153898, 5977.8253571010255, 5861.451697318677, 4900.032754992562, 10710.075816725108, 6789.397248651753, 10267.167038672353]
[8523.743191814263, 2552.0771931

### コサイン類似度を求める

In [70]:
similarities = cosine_similarity(vectors)
print('# cosine_similarity')
for index in range(len(similarities)):
    print(np.round(similarities[index],3))

# cosine_similarity
[0.    0.189 0.187 0.219 0.2   0.18  0.124 0.172 0.142 0.207 0.349 0.129
 0.093]
[0.189 0.    0.013 0.006 0.007 0.009 0.037 0.022 0.02  0.028 0.481 0.046
 0.28 ]
[0.187 0.013 0.    0.011 0.013 0.008 0.046 0.02  0.034 0.043 0.446 0.046
 0.285]
[0.219 0.006 0.011 0.    0.007 0.01  0.054 0.026 0.034 0.034 0.508 0.056
 0.316]
[0.2   0.007 0.013 0.007 0.    0.009 0.041 0.015 0.027 0.038 0.474 0.042
 0.287]
[0.18  0.009 0.008 0.01  0.009 0.    0.035 0.018 0.025 0.038 0.462 0.042
 0.28 ]
[0.124 0.037 0.046 0.054 0.041 0.035 0.    0.038 0.021 0.05  0.397 0.037
 0.186]
[0.172 0.022 0.02  0.026 0.015 0.018 0.038 0.    0.027 0.044 0.426 0.036
 0.242]
[0.142 0.02  0.034 0.034 0.027 0.025 0.021 0.027 0.    0.026 0.409 0.044
 0.217]
[0.207 0.028 0.043 0.034 0.038 0.038 0.05  0.044 0.026 0.    0.484 0.074
 0.272]
[0.349 0.481 0.446 0.508 0.474 0.462 0.397 0.426 0.409 0.484 0.    0.412
 0.424]
[0.129 0.046 0.046 0.056 0.042 0.042 0.037 0.036 0.044 0.074 0.412 0.
 0.175]
[0.093 0.28

## それぞれのFiIeの関係性をコサイン類似度で確認する

In [16]:
for i in range(0,len(sentence)):
    for j in range(0,len(sentence)):
        list=[]
        if i < j:
            print(i,j)
            list.append(sentence[i])
            list.append(sentence[j])
            
        else:
            continue
        codebook = collect_words_eng(list)
        vectors = make_vectors_eng(list, codebook)
        similarities = cosine_similarity(vectors)
        print('# cosine_similarity')
        for index in range(len(similarities)):
            print(similarities[index])

0 1
# cosine_similarity
[0.0, 0.18887530203651393]
[0.18887530203651393, 0.0]
0 2
# cosine_similarity
[0.0, 0.18676683291388385]
[0.18676683291388385, 0.0]
0 3
# cosine_similarity
[0.0, 0.21936147859644506]
[0.21936147859644506, 0.0]
0 4
# cosine_similarity
[0.0, 0.20025570960970163]
[0.20025570960970163, 0.0]
0 5
# cosine_similarity
[0.0, 0.17977087542170045]
[0.17977087542170045, 0.0]
0 6
# cosine_similarity
[0.0, 0.12383533297034233]
[0.12383533297034233, 0.0]
0 7
# cosine_similarity
[0.0, 0.17222093373830194]
[0.17222093373830194, 0.0]
0 8
# cosine_similarity
[0.0, 0.14157483521111514]
[0.14157483521111514, 0.0]
0 9
# cosine_similarity
[0.0, 0.2071564483033992]
[0.2071564483033992, 0.0]
0 10
# cosine_similarity
[0.0, 0.34910262492529776]
[0.34910262492529776, 0.0]
0 11
# cosine_similarity
[0.0, 0.12913954952576245]
[0.12913954952576245, 0.0]
0 12
# cosine_similarity
[0.0, 0.09335328250143238]
[0.09335328250143238, 0.0]
1 2
# cosine_similarity
[0.0, 0.013423253145316671]
[0.01342325

ここで一度別の**前処理**として、文章ベクトルを**標準化**を行う

## 文章ベクトルを標準化

In [28]:
#from sklearn import preprocessing as pp
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [62]:
vectors = make_vectors_eng(sentence, codebook)
ppSS = StandardScaler()

In [63]:
for index in range(len(sentence)):
    print(index)
    print('vectors[{}] = {}'.format(index,vectors[index]))
    print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')

0
vectors[0] = [36, 1, 25, 4, 1, 1, 24, 28, 27, 6, 419, 55, 26, 51, 48, 4, 11, 36, 322, 1, 312, 1, 1, 1, 77, 1, 208, 2, 2, 74, 11, 85, 3, 342, 1, 5, 6, 31, 215, 1, 38, 1, 4, 2, 64, 3, 148, 13, 3, 60, 14, 104, 11, 2, 5, 1, 5, 1, 4, 4, 4, 295, 1, 1, 1, 1, 1, 2, 1, 97, 3, 2, 40, 5, 190, 1, 2, 3, 12, 3, 2, 16, 8, 7, 3, 7, 50, 6, 43, 4, 19, 2, 22, 7, 21, 7, 151, 8, 88, 1, 2, 2, 30, 4, 21, 6, 1, 1, 11, 67, 26, 48, 50, 1, 2, 2, 32, 1, 2, 1, 5, 1, 1, 1, 1, 70, 1, 2, 18, 1, 7, 7, 135, 2, 1, 50, 234, 1, 4, 2, 1, 1, 1, 2, 2, 1, 2, 1, 3, 1, 1, 1, 1, 1, 2, 6, 17, 3, 2, 2, 5, 2, 14, 1, 13, 10, 2, 2, 3, 1, 5, 21, 2, 1, 2, 3, 4, 1, 12, 510, 2, 1, 2, 11, 2, 4, 2, 2, 2, 2, 1, 2, 1, 1, 8, 16, 30, 1, 62, 1, 1, 1, 1, 4, 10, 3, 3, 2, 3, 3, 11, 2, 3, 5, 1, 1, 1, 6, 4, 11, 8, 2, 1, 10, 13, 1, 3, 1, 1, 1, 29, 2, 2, 1, 1, 1, 282, 32, 1, 14, 1, 1, 2, 5, 7, 3, 3, 8, 7, 6, 2, 2, 2, 4, 14, 7, 1, 98, 1, 2, 7, 38, 2, 28, 24, 33, 2, 59, 3, 1, 3, 43, 1, 1, 1, 6, 3, 1, 1, 1, 5, 33, 1, 1, 1, 10, 3, 6, 2, 1, 1, 4, 159, 2,

In [64]:
data_std = ppSS.fit_transform(vectors)

In [65]:
print(type(data_std))
print(len(data_std[0]))

<class 'numpy.ndarray'>
11526


In [71]:
for index in range(len(data_std)):
    print('vectors[{}] = {}'.format(index,data_std[index]))
    print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')

vectors[0] = [-0.77918914 -0.63900965 -0.71651428 ... -0.28867513 -0.28867513
 -0.28867513]
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
vectors[1] = [-0.1414084   0.54772256  0.66075442 ... -0.28867513 -0.28867513
 -0.28867513]
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
vectors[2] = [-0.06637537  0.54772256  0.69699833 ... -0.28867513 -0.28867513
 -0.28867513]
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
vectors[3] = [ 0.30878977 -0.63900965  2.47295008 ... -0.28867513 -0.28867513
 -0.28867513]
--------------------------------------------------------------------------------------------------------------

In [72]:
similarities = cosine_similarity(data_std)
print('# cosine_similarity')
for index in range(len(similarities)):
    print(np.round(similarities[index],3))

# cosine_similarity
[0.    1.081 1.133 1.178 1.135 1.112 1.095 1.026 1.068 1.048 0.771 1.056
 0.792]
[1.081 0.    1.084 1.081 1.061 1.073 1.084 1.103 1.098 1.114 1.11  1.11
 1.103]
[1.133 1.084 0.    1.092 1.09  1.09  1.12  1.125 1.123 1.126 1.138 1.104
 1.146]
[1.178 1.081 1.092 0.    1.043 1.094 1.106 1.117 1.112 1.125 1.212 1.106
 1.201]
[1.135 1.061 1.09  1.043 0.    1.063 1.064 1.115 1.088 1.109 1.194 1.081
 1.165]
[1.112 1.073 1.09  1.094 1.063 0.    1.066 1.061 1.08  1.098 1.113 1.102
 1.111]
[1.095 1.084 1.12  1.106 1.064 1.066 0.    1.047 1.072 1.092 1.101 1.089
 1.081]
[1.026 1.103 1.125 1.117 1.115 1.061 1.047 0.    1.054 1.062 0.96  1.089
 1.001]
[1.068 1.098 1.123 1.112 1.088 1.08  1.072 1.054 0.    1.016 1.043 1.095
 1.046]
[1.048 1.114 1.126 1.125 1.109 1.098 1.092 1.062 1.016 0.    1.004 1.098
 1.026]
[0.771 1.11  1.138 1.212 1.194 1.113 1.101 0.96  1.043 1.004 0.    1.079
 0.673]
[1.056 1.11  1.104 1.106 1.081 1.102 1.089 1.089 1.095 1.098 1.079 0.
 1.065]
[0.792 1.103

## 正規化してデータ分析

In [74]:
ms = MinMaxScaler()

In [87]:
x = ms.fit_transform(vectors)

In [89]:
similarities = cosine_similarity(x)
print('# cosine_similarity')
for index in range(len(similarities)):
    print(np.round(similarities[index],3))

# cosine_similarity
[0.    0.7   0.754 0.751 0.706 0.72  0.715 0.711 0.712 0.728 0.961 0.7
 0.611]
[0.7   0.    0.714 0.69  0.658 0.672 0.687 0.706 0.696 0.738 0.975 0.72
 0.728]
[0.754 0.714 0.    0.727 0.706 0.712 0.737 0.741 0.738 0.767 0.986 0.741
 0.775]
[0.751 0.69  0.727 0.    0.65  0.691 0.704 0.705 0.702 0.738 0.975 0.717
 0.769]
[0.706 0.658 0.706 0.65  0.    0.651 0.657 0.693 0.667 0.711 0.982 0.682
 0.733]
[0.72  0.672 0.712 0.691 0.651 0.    0.668 0.668 0.675 0.72  0.984 0.711
 0.73 ]
[0.715 0.687 0.737 0.704 0.657 0.668 0.    0.662 0.674 0.714 0.983 0.702
 0.709]
[0.711 0.706 0.741 0.705 0.693 0.668 0.662 0.    0.675 0.713 0.981 0.712
 0.708]
[0.712 0.696 0.738 0.702 0.667 0.675 0.674 0.675 0.    0.657 0.971 0.706
 0.708]
[0.728 0.738 0.767 0.738 0.711 0.72  0.714 0.713 0.657 0.    0.984 0.739
 0.726]
[0.961 0.975 0.986 0.975 0.982 0.984 0.983 0.981 0.971 0.984 0.    0.967
 0.972]
[0.7   0.72  0.741 0.717 0.682 0.711 0.702 0.712 0.706 0.739 0.967 0.
 0.715]
[0.611 0.728 0

## 主成分分析によるデータの圧縮化

In [36]:
from sklearn.decomposition import PCA

In [81]:
pca = PCA(n_components=10)
pca.fit(data_std)

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [82]:
pca.explained_variance_ratio_

array([0.14599779, 0.13580654, 0.10165497, 0.09726184, 0.09239373,
       0.08734583, 0.08407744, 0.07234759, 0.06935515, 0.05624416])

In [83]:
pca.singular_values_

array([147.87334603, 142.61890616, 123.39031476, 120.69464507,
       117.63539266, 114.37677361, 112.2164428 , 104.09474381,
       101.91922387,  91.78156738])

In [84]:
pca_X = pca.transform(data_std)

In [85]:
pca_X

array([[-17.6622535 , -12.70646477, -15.43916855,  -4.61203932,
          0.7511924 ,  -9.28006832,  -2.6281817 , -22.94673849,
        -27.03197608, -33.29292183],
       [ -3.45672755,  -5.14336558,  35.7746087 , -48.11219693,
         94.38021428, -11.87514981,  -2.01075859,  12.81963113,
          7.54235464,   4.70647486],
       [-18.64345957, 135.33943804,  -6.31880779,   2.30306934,
         -4.38594009,  -2.44997384,   2.99481168,   2.48167347,
          2.080061  ,   0.98415551],
       [139.15004493,   5.01041943, -22.536181  ,  -2.98462239,
         -0.97541623,  -0.99779893,   0.64130002,  -0.8762156 ,
         -2.70756352,  -3.32210785],
       [ 11.53041252,  -6.11915607, 100.07657066,  40.30949244,
        -28.42180046, -29.61561801, -10.41490676,  -5.14818476,
         -9.08984347,  -1.93525024],
       [ -6.77239746,  -7.96368405,  18.89413061, -20.6218053 ,
        -19.22831859,  93.02045086, -44.30689723,  13.70335274,
          3.41571182,  -4.78391841],
       [ -

## 最終的なコサイン類似度の計算

In [86]:
similarities = cosine_similarity(pca_X)
print('# cosine_similarity')
for index in range(len(similarities)):
    print(np.round(similarities[index],3))

# cosine_similarity
[0.    1.105 1.186 1.244 1.173 1.158 1.137 1.082 1.11  1.086 0.028 1.075
 0.01 ]
[1.105 0.    1.084 1.081 1.062 1.073 1.084 1.103 1.097 1.114 1.159 1.11
 1.15 ]
[1.186 1.084 0.    1.092 1.09  1.09  1.12  1.125 1.123 1.126 1.19  1.104
 1.209]
[1.244 1.081 1.092 0.    1.043 1.094 1.106 1.117 1.112 1.125 1.299 1.106
 1.288]
[1.173 1.062 1.09  1.043 0.    1.063 1.064 1.114 1.088 1.109 1.284 1.081
 1.238]
[1.158 1.073 1.09  1.094 1.063 0.    1.066 1.061 1.08  1.098 1.158 1.102
 1.157]
[1.137 1.084 1.12  1.106 1.064 1.066 0.    1.047 1.072 1.092 1.145 1.089
 1.106]
[1.082 1.103 1.125 1.117 1.114 1.061 1.047 0.    1.055 1.063 0.9   1.089
 0.993]
[1.11  1.097 1.123 1.112 1.088 1.08  1.072 1.055 0.    1.016 1.052 1.095
 1.057]
[1.086 1.114 1.126 1.125 1.109 1.098 1.092 1.063 1.016 0.    0.991 1.098
 1.032]
[0.028 1.159 1.19  1.299 1.284 1.158 1.145 0.9   1.052 0.991 0.    1.115
 0.007]
[1.075 1.11  1.104 1.106 1.081 1.102 1.089 1.089 1.095 1.098 1.115 0.
 1.091]
[0.01  1.15 

## 主成分分析だけしたデータでのコサイン類似度

In [60]:
pca.fit(vectors)
pca_y = pca.transform(vectors)

In [61]:
similarities = cosine_similarity(pca_y)
print('# cosine_similarity')
for index in range(len(similarities)):
    print(np.round(similarities[index],3))

# cosine_similarity
[0.    1.94  1.934 1.982 1.98  1.944 0.857 0.5   0.655 1.507 0.005 0.221
 0.002]
[1.94  0.    0.142 0.049 0.075 0.117 1.207 1.604 1.32  0.534 1.929 1.801
 1.94 ]
[1.934 0.142 0.    0.067 0.09  0.068 1.228 1.472 1.465 0.65  1.931 1.742
 1.941]
[1.982 0.049 0.067 0.    0.032 0.065 1.257 1.552 1.414 0.536 1.968 1.803
 1.98 ]
[1.98  0.075 0.09  0.032 0.    0.076 1.182 1.462 1.391 0.597 1.971 1.753
 1.978]
[1.944 0.117 0.068 0.065 0.076 0.    1.156 1.494 1.4   0.647 1.944 1.757
 1.954]
[0.857 1.207 1.228 1.257 1.182 1.156 0.    1.018 0.556 1.063 0.925 0.734
 0.876]
[0.5   1.604 1.472 1.552 1.462 1.494 1.018 0.    0.907 1.425 0.487 0.524
 0.484]
[0.655 1.32  1.465 1.414 1.391 1.4   0.556 0.907 0.    0.859 0.708 0.798
 0.676]
[1.507 0.534 0.65  0.536 0.597 0.647 1.063 1.425 0.859 0.    1.488 1.567
 1.488]
[0.005 1.929 1.931 1.968 1.971 1.944 0.925 0.487 0.708 1.488 0.    0.258
 0.002]
[0.221 1.801 1.742 1.803 1.753 1.757 0.734 0.524 0.798 1.567 0.258 0.
 0.223]
[0.002 1.94

# LEVEL2