機械学習(自然言語処理)入門 2024/5/19

放送大学・多摩学習センター 森川 馨

テキスト解析

1. 形態素解析

In [None]:
# 35 秒かかる
!pip install janome
import numpy as np

In [None]:
# Create a Janome tokenizer (constructed with no arguments) and define the
# sample sentence that is tokenised in the following cell.
from janome.tokenizer import Tokenizer
t = Tokenizer()
text = "国境の長いトンネルを抜けると雪国であった。"

In [None]:
# Print every token that the tokenizer yields for `text`, one per line.
for morpheme in t.tokenize(text):
    print(morpheme)


In [None]:
# Rebind `t` and `text`: the sample is now three sentences long.
# NOTE(review): the import and Tokenizer() repeat an earlier cell; presumably
# so this cell can be run on its own - confirm before removing.
from janome.tokenizer import Tokenizer
t = Tokenizer()
text = "国境の長いトンネルを抜けると雪国であった。夜の底が白くなった。信号所に汽車が止まった。"

In [None]:
# Print the surface form of every token whose first part-of-speech field
# (the text before the first comma) is '名詞' (noun).
noun_surfaces = (
    morpheme.surface
    for morpheme in t.tokenize(text)
    if morpheme.part_of_speech.partition(',')[0] == '名詞'
)
for surface in noun_surfaces:
    print(surface)

In [None]:
# Print the base (dictionary) form of every token whose first part-of-speech
# field (the text before the first comma) is '動詞' (verb).
verb_base_forms = (
    morpheme.base_form
    for morpheme in t.tokenize(text)
    if morpheme.part_of_speech.partition(',')[0] == '動詞'
)
for base_form in verb_base_forms:
    print(base_form)

2. テキスト分析

In [None]:
# Morphological analysis
# Documents: six short Japanese sentences, bound to d_1 .. d_6.
import janome.tokenizer
d_1 = "Pythonは楽しい"
d_2 = "バイクは速く走る"
d_3 = "プログラミングは楽しい"
d_4 = "三輪車はゆっくり走る"
d_5 = "自転車はゆっくり走る"
d_6 = "車は速く走る"

In [None]:
# Word segmentation: tokenise d_1 and show the base form of each token.
tokenizer = janome.tokenizer.Tokenizer()
d1_base_forms = [morpheme.base_form for morpheme in tokenizer.tokenize(d_1)]
print(d1_base_forms)

In [None]:
# Build the vocabulary: the distinct base forms found across all six documents.
# Each document is tokenised once; the per-document lists keep the names
# tokens_1 .. tokens_6.
tokens_1, tokens_2, tokens_3, tokens_4, tokens_5, tokens_6 = (
    [token.base_form for token in tokenizer.tokenize(document)]
    for document in (d_1, d_2, d_3, d_4, d_5, d_6)
)
# Sort after de-duplicating: iterating a set of strings gives an order that can
# differ from one interpreter run to the next, which would change the position
# (token ID) of every word in `vocabulary` between runs.
vocabulary = sorted(
    set(tokens_1 + tokens_2 + tokens_3 + tokens_4 + tokens_5 + tokens_6)
)
print(vocabulary)

In [None]:
# List every vocabulary entry together with its token ID, i.e. its position
# in `vocabulary`. enumerate() yields the (index, item) pairs directly.
for token_id, token in enumerate(vocabulary):
    print("token ID : {}, token : {}".format(token_id, token))


In [None]:
# Build one-hot vectors: binarise every vocabulary entry against the
# vocabulary itself, then show each resulting row next to its token.
import sklearn.preprocessing

vocabulary_onehot = sklearn.preprocessing.label_binarize(
    vocabulary,
    classes=vocabulary,
)
for row_index, word in enumerate(vocabulary):
    print("one-hot vector : {}, token : {}".format(vocabulary_onehot[row_index], word))

In [None]:
# Fetch the one-hot representation of a chosen token.
# list.index raises ValueError if "走る" is not present in `vocabulary`.
token_index = vocabulary.index("走る")
print("「走る」のOne-hot表現は {}".format(vocabulary_onehot[token_index]))

3. 特異値分解(singular value decomposition)

In [None]:
# Set up numpy (3-decimal, non-scientific printing) and define the 10x6
# matrix A of zeros and ones.
import numpy as np
from numpy.linalg import svd, matrix_rank

np.set_printoptions(precision=3, suppress=True)

# One matrix row per source line.
A = np.array([
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
    [0, 0, 1, 0, 0, 0],
    [0, 0, 0, 1, 1, 0],
    [1, 0, 1, 0, 0, 0],
    [0, 0, 0, 1, 0, 0],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 0, 0, 1],
    [0, 1, 0, 1, 1, 1],
    [0, 1, 0, 0, 0, 1],
])

In [None]:
# Show the matrix and its rank as computed by numpy.linalg.matrix_rank.
print('matrix A\n', A)
print('rank: ', matrix_rank(A))


In [None]:
# Singular value decomposition (SVD) with numpy's default arguments.
# svd returns the left singular vectors, the singular values as a 1-D array,
# and the transposed right singular vectors, in that order.
u, s, vt = svd(A)
print('\nSVD result')
print('shape of u, s, vt:', u.shape, s.shape, vt.shape)
# Singular values rounded to two decimals for display only; `s` is unchanged.
print('singular values:', s.round(2))

In [None]:
# Reduced SVD (full_matrices=False). The third factor is bound to `vh` here,
# so `u` and `s` are rebound while any earlier `vt` is left untouched.
u, s, vh = svd(A, full_matrices=False)
print('\nSVD result (full_matrices: False)')
# Report the shape of `vh`, the factor this call actually returned, rather
# than a `vt` left over from a previous decomposition.
print('shape of u, s, vh:', u.shape, s.shape, vh.shape)

In [None]:
# Display the three factors of the reduced decomposition: u, the singular
# values expanded into a diagonal matrix, and vh.
# `vh` is printed because it was produced by the same svd call as `u` and `s`.
print(u, '\n')
print(np.diag(s), '\n')
print(vh)

In [None]:
# Check the decomposition: multiply the factors back together and round to
# two decimals so the result can be compared with A by eye.
sigma = np.diag(s)
A_re = np.round(u @ sigma @ vh, 2)
print('\nreconstructed A:\n', A_re)

In [None]:
# Products of the SVD factors with their own transposes.
# If `u` is the 10x6 factor from the full_matrices=False call, u.T @ u is the
# 6x6 identity, while u @ u.T is 10x10 and is generally not the identity.
print(np.dot(u.T,u))
print(np.dot(u,u.T))
# vt times its transpose, in both orders.
print(np.dot(vt, vt.T))
print(np.dot(vt.T, vt))
# diag(s) multiplied by the transpose of vt, keeping the first three columns.
# NOTE(review): this scales the rows of vt.T by the singular values; if the
# intent was diag(s) @ vt or vt.T @ diag(s), this line differs - confirm.
print(np.dot(np.diag(s),vt.T)[:,:3])
# Same product using vh and without the column slice.
# NOTE(review): vt and vh come from different svd calls - confirm that mixing
# them here is intended.
print(np.dot(np.diag(s),vh.T))

4. 単語分散表現(単語埋め込み)

Google Newsデータセット（約1,000億単語）での学習済み単語ベクトル（300万単語・フレーズ，300次元）をダウンロードし，単語ベクトルを表示してみよう．

In [None]:
import os

import gdown

# Download the pre-trained word vectors from Google Drive.
url = 'https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM'
output = 'GoogleNews-vectors-negative300.bin.gz'
# The archive holds 3 million 300-dimensional vectors, so fetch it only when
# it is not already in the working directory; re-running the cell then does
# not repeat the download.
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)

In [None]:
# Takes about 1 minute 2 seconds
from gensim.models import KeyedVectors

# Load the pre-trained model from the gzipped file, read as binary
# word2vec format (binary=True).
model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
# Display the word vector stored for 'Japan'
model['Japan']

In [None]:
# Keep the 'Japan' vector in a variable and inspect its shape.
# NOTE(review): the name `inf` reads like "infinity"; it is only a vector here.
inf=model['Japan']
inf.shape

In [None]:
# Display the word vector stored for the phrase key 'United_States'
model['United_States']

単語の類似度(コサイン類似度の計算)

In [None]:
# Cosine similarity between the vectors for 'Japan' and 'Tokyo'
model.similarity('Japan', 'Tokyo')

In [None]:
# Cosine similarity between the vectors for 'Japan' and 'Washington'
model.similarity('Japan', 'Washington')

In [None]:
# Cosine similarity between the vectors for 'Tokyo' and 'Washington'
model.similarity('Tokyo', 'Washington')

類似度の高い単語10件

In [None]:
# The 10 entries most similar to 'Japan'
model.most_similar('Japan', topn=10)

In [None]:
# The 10 entries most similar to 'Tokyo'
model.most_similar('Tokyo', topn=10)

In [None]:
# The 10 entries most similar to 'United_States'
model.most_similar('United_States', topn=10)

単語ベクトルのベクトル演算

In [None]:
# Vector arithmetic: top 10 entries for king + woman - man
model.most_similar(positive=['king', 'woman'], negative=['man'], topn=10)

In [None]:
# Vector arithmetic: top 10 entries for father + woman - man
model.most_similar(positive=['woman', 'father'], negative=['man'], topn=10)

In [None]:
# Vector arithmetic: top 10 entries for nephew + woman - man
model.most_similar(positive=['woman', 'nephew'], negative=['man'], topn=10)

In [None]:
# Vector arithmetic: top 10 entries for uncle + woman - man
model.most_similar(positive=['woman', 'uncle'], negative=['man'], topn=10)

In [None]:
# Vector arithmetic: top 10 entries for Hitler + Italy - Germany
model.most_similar(positive=['Hitler', 'Italy'], negative=['Germany'], topn=10)

In [None]:
# Vector arithmetic: top 10 entries for Spain + Athens - Madrid
model.most_similar(positive=['Spain', 'Athens'], negative=['Madrid'], topn=10)