<a href="https://colab.research.google.com/github/ghostfm3/ksks/blob/master/100pca1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import bz2

In [2]:
# coding: utf-8
import bz2
fname_input = 'enwiki-20150112-400-r100-10576.txt.bz2'      # 1/100 version
fname_output = 'corpus80.txt'

# Stream the bz2-compressed corpus and write one cleaned line per input line.
with bz2.open(fname_input, 'rt') as data_file, \
        open(fname_output, mode='wt') as out_file:
    for line in data_file:

        # Split on single spaces, then trim surrounding whitespace and the
        # leading/trailing punctuation from every chunk.
        trimmed = (chunk.strip().strip('.,!?;:()[]\'"')
                   for chunk in line.split(' '))

        # Keep only the chunks that are still non-empty after trimming.
        tokens = [token for token in trimmed if token]

        # Write the tokens separated by single spaces; a line with no
        # surviving token becomes an empty output line.
        print(' '.join(tokens), file=out_file)

In [4]:
# coding: utf-8
fname_input = 'corpus80.txt'
fname_output = 'corpus81.txt'
fname_countries = 'countries.txt'

# Read the country list and build a set plus a lookup dictionary; one-word
# country names are left out.
# The dictionary maps the first word of a name to the list of total word
# counts of all names starting with that word, kept in descending order.
# For example, six names start with 'United':
#   United States of America
#   United Mexican States
#   United Kingdom of Great Britain and Northern Ireland
#   United Arab Emirates
#   United Republic of Tanzania
#   United States
# Their word counts are 4, 3, 8 and 2, so the dictionary holds
# { 'United': [8, 4, 3, 2] }.
# Descending order makes the matching below prefer the longest name.
set_country = set()
dict_country = {}
with open(fname_countries, 'rt') as data_file:
    for line in data_file:
        words = line.split(' ')
        n_words = len(words)
        if n_words < 2:
            continue

        # Full name goes into the set used for exact matching.
        set_country.add(line.strip())

        # Record this name's word count under its first word (once per count).
        lengths = dict_country.setdefault(words[0], [])
        if n_words not in lengths:
            lengths.append(n_words)
            lengths.sort(reverse=True)

# Rewrite the corpus line by line, joining multi-word country names with '_'.
with open(fname_input, 'rt') as data_file, \
        open(fname_output, mode='wt') as out_file:
    for line in data_file:

        tokens = line.strip().split(' ')
        result = []     # output tokens for this line
        i = 0
        while i < len(tokens):

            # Try every candidate length registered for this first word,
            # longest first, and stop at the first exact match.
            matched = 0
            for length in dict_country.get(tokens[i], ()):
                if ' '.join(tokens[i:i + length]) in set_country:
                    matched = length
                    break

            if matched:
                # Multi-word country: emit it as one '_'-joined token and
                # jump past the words it consumed.
                result.append('_'.join(tokens[i:i + matched]))
                i += matched
            else:
                # Ordinary word: copy it through unchanged.
                result.append(tokens[i])
                i += 1

        # Write the rewritten line.
        print(' '.join(result), file=out_file)

In [5]:
import random
fname_input = 'corpus81.txt'
fname_output = 'context.txt'

# Dedicated, explicitly seeded generator for the context widths.
# With the unseeded global generator every run produced a different pair
# file (and a different total pair count); a fixed seed makes the output
# reproducible and leaves the global `random` state untouched.
context_seed = 0
context_rng = random.Random(context_seed)

# Process the corpus line by line, writing one "word<TAB>context" pair per
# output line.
with open(fname_input, 'rt') as data_file, \
        open(fname_output, mode='wt') as out_file:
    for i, line in enumerate(data_file):

        # Visit every word of the line.
        tokens = line.strip().split(' ')
        for j in range(len(tokens)):

            t = tokens[j]                       # target word t
            d = context_rng.randint(1, 5)       # context width d, 1..5 inclusive

            # Emit every word within d positions of t (clamped to the line
            # boundaries), excluding t's own position.
            for k in range(max(j - d, 0), min(j + d + 1, len(tokens))):
                if j != k:
                    print('{}\t{}'.format(t, tokens[k]), file=out_file)

        # Progress report every 10,000 input lines.
        if i % 10000 == 0:
            print('{} done.'.format(i))

0 done.
10000 done.
20000 done.
30000 done.
40000 done.
50000 done.
60000 done.
70000 done.
80000 done.
90000 done.
100000 done.
110000 done.
120000 done.
130000 done.
140000 done.
150000 done.
160000 done.
170000 done.
180000 done.
190000 done.
200000 done.
210000 done.
220000 done.
230000 done.
240000 done.
250000 done.
260000 done.
270000 done.
280000 done.


In [6]:
# coding: utf-8
from collections import Counter
import pickle

fname_input = 'context.txt'
fname_counter_tc = 'counter_tc'
fname_counter_t = 'counter_t'
fname_counter_c = 'counter_c'


# Counters for the (t, c) pair line, the word t and the context word c.
counter_tc = Counter()
counter_t = Counter()
counter_c = Counter()

# Per-counter work buffers; items are collected here and pushed into the
# counters in bulk.
work_tc, work_t, work_c = [], [], []
buffers = (
    (counter_tc, work_tc),
    (counter_t, work_t),
    (counter_c, work_c),
)

with open(fname_input, 'rt') as data_file:
    for i, line in enumerate(data_file, start=1):

        # Each line is "t<TAB>c"; the stripped line itself is the pair key.
        line = line.strip()
        tokens = line.split('\t')

        work_tc.append(line)
        work_t.append(tokens[0])
        work_c.append(tokens[1])

        if i % 1000000 != 0:
            continue

        # Every 1,000,000 lines: flush the buffers into the counters.
        for counter, work in buffers:
            counter.update(work)
            work.clear()
        print('{} done.'.format(i))

# Flush whatever is left after the last full batch.
for counter, work in buffers:
    counter.update(work)

# Persist the three counters with pickle.
for path, counter in ((fname_counter_tc, counter_tc),
                      (fname_counter_t, counter_t),
                      (fname_counter_c, counter_c)):
    with open(path, 'wb') as data_file:
        pickle.dump(counter, data_file)

# i still holds the 1-based number of the last line read.
print('N={}'.format(i))

1000000 done.
2000000 done.
3000000 done.
4000000 done.
5000000 done.
6000000 done.
7000000 done.
8000000 done.
9000000 done.
10000000 done.
11000000 done.
12000000 done.
13000000 done.
14000000 done.
15000000 done.
16000000 done.
17000000 done.
18000000 done.
19000000 done.
20000000 done.
21000000 done.
22000000 done.
23000000 done.
24000000 done.
25000000 done.
26000000 done.
27000000 done.
28000000 done.
29000000 done.
30000000 done.
31000000 done.
32000000 done.
33000000 done.
34000000 done.
35000000 done.
36000000 done.
37000000 done.
38000000 done.
39000000 done.
40000000 done.
41000000 done.
42000000 done.
43000000 done.
44000000 done.
45000000 done.
46000000 done.
47000000 done.
48000000 done.
49000000 done.
50000000 done.
51000000 done.
52000000 done.
53000000 done.
54000000 done.
55000000 done.
56000000 done.
57000000 done.
58000000 done.
59000000 done.
60000000 done.
61000000 done.
62000000 done.
63000000 done.
64000000 done.
65000000 done.
66000000 done.
67000000 done.
6800

In [7]:
# coding: utf-8
import math
import pickle
from collections import Counter
from collections import OrderedDict
from scipy import sparse, io

fname_counter_tc = 'counter_tc'
fname_counter_t = 'counter_t'
fname_counter_c = 'counter_c'
fname_matrix_x = 'matrix_x'
fname_dict_index_t = 'dict_index_t'

# Load the pickled counters.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# written by this notebook.
with open(fname_counter_tc, 'rb') as data_file:
    counter_tc = pickle.load(data_file)
with open(fname_counter_t, 'rb') as data_file:
    counter_t = pickle.load(data_file)
with open(fname_counter_c, 'rb') as data_file:
    counter_c = pickle.load(data_file)

# N = total number of (t, c) pair occurrences.
# counter_tc holds one count per pair line, so summing its values gives N
# for whatever pair file the counters were built from. A hard-coded number
# is only right for one specific run, because the context widths used to
# generate the pairs are random.
N = sum(counter_tc.values())

# Build {word: index} dictionaries for the rows (t) and columns (c).
dict_index_t = OrderedDict((key, i) for i, key in enumerate(counter_t.keys()))
dict_index_c = OrderedDict((key, i) for i, key in enumerate(counter_c.keys()))

# Allocate the sparse |t| x |c| matrix.
size_t = len(dict_index_t)
size_c = len(dict_index_c)
matrix_x = sparse.lil_matrix((size_t, size_c))

# Walk over every f(t, c); pairs seen fewer than 10 times are left at zero.
for k, f_tc in counter_tc.items():
    if f_tc >= 10:
        tokens = k.split('\t')
        t = tokens[0]
        c = tokens[1]

        # PPMI = max(log(N * f(t,c) / (f(t,*) * f(*,c))), 0)
        ppmi = max(math.log((N * f_tc) / (counter_t[t] * counter_c[c])), 0)
        matrix_x[dict_index_t[t], dict_index_c[c]] = ppmi

# Write out the matrix and the row-index dictionary.
io.savemat(fname_matrix_x, {'matrix_x': matrix_x})
with open(fname_dict_index_t, 'wb') as data_file:
    pickle.dump(dict_index_t, data_file)

In [8]:
# coding: utf-8
from scipy import sparse, io
import sklearn.decomposition

fname_matrix_x = 'matrix_x'
fname_matrix_x300 = 'matrix_x300'

# Load the matrix saved under the 'matrix_x' key.
matrix_x = io.loadmat(fname_matrix_x)['matrix_x']

# Reduce to 300 dimensions with truncated SVD.
# random_state is fixed because the default solver is randomized; without it
# the reduced vectors differ from run to run.
clf = sklearn.decomposition.TruncatedSVD(n_components=300, random_state=0)
matrix_x300 = clf.fit_transform(matrix_x)

# Save the reduced matrix under the 'matrix_x300' key.
io.savemat(fname_matrix_x300, {'matrix_x300': matrix_x300})

In [10]:
fname_input = 'questions-words.txt'
fname_output = 'family.txt'

# Copy the lines of the ': family' section into the output file.
with open(fname_input, 'rt') as data_file, \
        open(fname_output, 'wt') as out_file:

    target = False      # True once the ': family' header has been seen
    for line in data_file:

        if not target:
            # Still searching: check whether this line is the section header.
            target = line.startswith(': family')
            continue

        # Inside the section: the next ': ' header ends it.
        if line.startswith(': '):
            break
        print(line.strip(), file=out_file)

In [11]:
# coding: utf-8
import pickle
from collections import OrderedDict
from scipy import io
import numpy as np

# Pickled word -> row-index dictionary, loaded below with pickle.load.
fname_dict_index_t = 'dict_index_t'
# Matrix file read below with io.loadmat (key 'matrix_x300').
fname_matrix_x300 = 'matrix_x300'
# Evaluation data read line by line (space-separated words).
fname_input = 'family.txt'
# Result file: each input line plus the predicted word and its similarity.
fname_output = 'family_out.txt'


def cos_sim(vec_a, vec_b):
    '''Compute the cosine similarity of two vectors.

    Args:
        vec_a: first vector.
        vec_b: second vector.

    Returns:
        dot(vec_a, vec_b) divided by the product of the two norms, or -1
        when that product is zero.
    '''
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)

    # With a zero norm similarity cannot be judged at all, so report the
    # lowest possible value.
    if denominator == 0:
        return -1

    return np.dot(vec_a, vec_b) / denominator


# Load the word -> row-index dictionary.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(fname_dict_index_t, 'rb') as data_file:
    dict_index_t = pickle.load(data_file)
keys = list(dict_index_t.keys())

# Load the matrix stored under the 'matrix_x300' key.
matrix_x300 = io.loadmat(fname_matrix_x300)['matrix_x300']

# Read the evaluation data and answer each analogy line.
with open(fname_input, 'rt') as data_file, \
        open(fname_output, 'wt') as out_file:

    for line in data_file:
        cols = line.split(' ')

        try:
            # Analogy vector: vec(col 2) - vec(col 1) + vec(col 3).
            vec = matrix_x300[dict_index_t[cols[1]]] \
                    - matrix_x300[dict_index_t[cols[0]]] \
                    + matrix_x300[dict_index_t[cols[2]]]

        except KeyError:
            # A word is missing from the dictionary: report an empty word
            # with cosine similarity -1.
            result = ''
            dist_max = -1

        else:
            # Linear scan for the row with the highest cosine similarity;
            # the strict comparison keeps the first row on ties.
            dist_max = -1
            index_max = 0
            for i in range(len(dict_index_t)):
                dist = cos_sim(vec, matrix_x300[i])
                if dist > dist_max:
                    index_max, dist_max = i, dist

            result = keys[index_max]

        # Write the same record to the result file and to the cell output.
        record = '{} {} {}'.format(line.strip(), result, dist_max)
        print(record, file=out_file)
        print(record)

boy girl brother sister brother 0.9643705791376848
boy girl brothers sisters brothers 0.8793347649553842
boy girl dad mom mother-in-law 0.840454058302376
boy girl father mother father 0.9610422367050222
boy girl grandfather grandmother grandfather 0.8536635994841226
boy girl grandpa grandma  -1
boy girl grandson granddaughter grandson 0.7711095511875965
boy girl groom bride girl 0.6567288124400436
boy girl he she he 0.9957411857356849
boy girl his her his 0.9978794363457415
boy girl husband wife husband 0.9627954858946359
boy girl king queen king 0.7517835965108276
boy girl man woman man 0.9221044782275418
boy girl nephew niece grandmother 0.7795481924100014
boy girl policeman policewoman girl 0.7091486188170547
boy girl prince princess girl 0.7790672040657758
boy girl son daughter son 0.9843363227089166
boy girl sons daughters sons 0.9480615576387851
boy girl stepbrother stepsister girl 0.6421450284678032
boy girl stepfather stepmother mom 0.8483079227932397
boy girl stepson stepdaugh

In [12]:
fname_input = 'family_out.txt'

with open(fname_input, 'rt') as data_file:

    correct = 0     # lines whose predicted word equals the expected word
    total = 0       # number of lines read

    # Each line is space-separated: column 4 is the expected word and
    # column 5 the predicted one.
    for total, line in enumerate(data_file, start=1):
        cols = line.split(' ')
        if cols[3] == cols[4]:
            correct += 1

# Show the accuracy together with the raw counts.
print('{} ({}/{})'.format(correct / total, correct, total))

0.02766798418972332 (14/506)
