In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec
# Load maisho2018-utf8.csv into a DataFrame and show it as the cell output.
maisho = pd.read_csv('../data/毎日新聞コーパス/maisho2018-utf8.csv')
maisho

In [25]:
from mojimoji import zen_to_han 
# Replace the CSV's column names with the 11 field codes used by the rest of the notebook.
maisho.columns = ['ID', 'C0', 'AD', 'AE', 'AF', 'ZZ', 'T1', 'S1', 'S2', 'T2', 'KB']
# Drop rows whose ZZ field is '著作権無'.
maisho.drop(maisho.index[maisho.ZZ == '著作権無'], inplace=True)
# Drop rows whose T1 contains '新聞休みます'. na=False makes a missing T1 count as
# "no match"; without it the mask contains NaN and the boolean indexing raises.
maisho.drop(maisho.index[maisho.T1.str.contains('新聞休みます', na=False)], inplace=True)
# Stringify every cell and pass it through mojimoji's zen_to_han (kana=False).
# NOTE(review): str(x) also turns missing values into the literal text 'nan' —
# confirm that is acceptable for the columns used downstream.
maisho = maisho.applymap(lambda x: zen_to_han(str(x), kana=False))
# Keep only the first row for each distinct T2 text.
maisho.drop_duplicates(subset='T2', inplace=True)
maisho

In [26]:
# Remove furigana: delete a parenthesised run only when everything inside the
# parentheses is in U+3041-U+309F (the hiragana block).
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently remove nothing. The raw string
# avoids the invalid '\(' escape; the re module understands \uXXXX itself.
maisho.T2 = maisho.T2.str.replace(r'\([\u3041-\u309F]+\)', '', regex=True)
maisho.T2

In [27]:
# Load mai2018-utf8.csv into a DataFrame and show it as the cell output.
mai = pd.read_csv('../data/毎日新聞コーパス/mai2018-utf8.csv')
mai

In [28]:
# Drop rows whose ZZ field is '著作権無'.
mai.drop(mai.index[mai.ZZ == '著作権無'], inplace=True)
# Drop rows whose T1 matches either title pattern. na=False treats a missing T1
# as "no match" instead of producing a NaN mask that boolean indexing rejects.
mai.drop(mai.index[mai.T1.str.contains('(?:朝刊|夕刊)休みます', na=False)], inplace=True)
mai.drop(mai.index[mai.T1.str.contains('東日本大震災・(?:空間|大気中の環境)放射線量', na=False)], inplace=True)
# Drop rows whose T2 is shorter than 100 characters.
mai.drop(mai.index[mai.T2.str.len() < 100], inplace=True)
# Record the T2 length; this is measured before the furigana removal below.
mai['T2_len'] = mai.T2.str.len()
mai.drop_duplicates(subset='T2', inplace=True)
# Remove parenthesised hiragana-only runs (furigana). regex=True is explicit
# because pandas >= 2.0 would otherwise treat the pattern as a literal string.
# NOTE(review): only half-width parentheses are matched and, unlike maisho, mai is
# not passed through zen_to_han — confirm the source text uses half-width ().
mai.T2 = mai.T2.str.replace(r'\([\u3041-\u309F]+\)', '', regex=True)
print(mai.T2_len.describe())
mai.to_csv('../results/mai_len.csv', index=False)

In [29]:
# Histogram of T2_len using 30 bins restricted to the range 0-2000.
mai.T2_len.plot.hist(bins=30,range=(0,2000))

In [30]:
import MeCab
# Tagger shared by every wakati() call. It is created on first use so that
# mapping wakati over a whole column does not construct a new MeCab.Tagger
# for each document.
_wakati_tagger = None


def wakati(text: str) -> str:
    """Return ``text`` segmented into words by MeCab's ``-Owakati`` output mode.

    Args:
        text: The string to segment.

    Returns:
        The string returned by ``MeCab.Tagger("-Owakati").parse(text)``,
        unmodified.
        NOTE(review): MeCab's wakati output normally ends with a trailing
        space and newline; it is not stripped here — confirm callers cope.
    """
    global _wakati_tagger
    if _wakati_tagger is None:
        # To use another dictionary (e.g. mecab-ipadic-neologd), append
        # ' -d <dictionary dir>' to the option string.
        _wakati_tagger = MeCab.Tagger("-Owakati")
    return _wakati_tagger.parse(text)

# Store the wakati() output of each T2 text in a new 'wakati' column of both frames.
maisho['wakati'] = maisho.T2.map(wakati)
mai['wakati'] = mai.T2.map(wakati)

In [31]:
from gensim.models.doc2vec import TaggedDocument
# One TaggedDocument per row of mai, then maisho, tagged with the row's ID.
# - split() with no argument splits on any whitespace run, so the trailing "\n"
#   of MeCab's wakati output (and empty strings from repeated spaces) no longer
#   end up as tokens in every document, as they did with split(" ").
# - Tags are stringified so every tag is a string key: gensim interprets int tags
#   as raw vector indices, and find_similar() already handles tags via str().
documents = [TaggedDocument(words=text.split(), tags=[str(ID)])
             for df in [mai, maisho] for ID, text in df[['ID', 'wakati']].values]

In [32]:
from multiprocessing import cpu_count
# Hyper-parameters for the two Doc2Vec variants that are trained below,
# keyed by the name that ends up in the saved model's file name.
settings = {
    "dbow300d": dict(
        vector_size=300,
        epochs=20,
        window=15,
        min_count=5,
        dm=0,  # PV-DBOW
        dbow_words=1,
        workers=cpu_count(),
    ),
    "dmpv300d": dict(
        vector_size=300,
        epochs=20,
        window=10,
        min_count=2,
        alpha=0.05,
        dm=1,  # PV-DM
        sample=0,
        workers=cpu_count(),
    ),
}

# Train one model per configuration on the tagged documents and save each one.
for setting_name in settings:
    setting = settings[setting_name]
    model = Doc2Vec(documents=documents, **setting)
    model.save(f"../models/mai-doc2vec-{setting_name}.model")

In [33]:
# Reload the saved "dmpv300d" model; find_similar() reads this global.
model = Doc2Vec.load("../models/mai-doc2vec-dmpv300d.model")

def find_similar(tag):
    """Return the closest neighbour of ``tag`` whose tag does not contain 'S'.

    The 30 most similar document vectors are taken from the global ``model``
    and scanned in the order returned; the first one whose stringified tag has
    no 'S' in it wins.

    Returns:
        ``(str(neighbour_tag), similarity)`` for that neighbour, or
        ``(nan, nan)`` when all 30 neighbours contain 'S'.
    """
    neighbours = model.docvecs.most_similar(tag, topn=30)
    hits = ((str(other), score) for other, score in neighbours if 'S' not in str(other))
    return next(hits, (np.nan, np.nan))

# Run find_similar for every maisho ID and unzip the resulting (tag, similarity)
# pairs into the two new columns SIM_ID and SIM.
maisho['SIM_ID'], maisho['SIM'] = zip(*maisho.ID.map(find_similar))
maisho[['SIM_ID','SIM']]

In [34]:
# DataFrame.dropna returns a new frame; the original call discarded that result,
# so the rows without a match were still shown. Chaining the column selection
# makes the output actually exclude them. maisho itself is deliberately left
# unchanged so that later cells see the same rows as before.
maisho.dropna(subset=['SIM_ID'])[['SIM_ID','SIM']]

In [35]:
# Cast mai.ID to str; SIM_ID values produced by find_similar() are strings.
mai['ID'] = mai['ID'].astype(str)

In [36]:
# Join each maisho row to the mai row whose ID equals its SIM_ID (default inner
# join); columns present in both frames get the _Maisho / _Mai suffixes.
result = maisho.merge(
    mai,
    left_on='SIM_ID',
    right_on='ID',
    suffixes=('_Maisho', '_Mai'),
)
result.to_csv('../results/mai_sim.csv', index=False)
result

In [37]:
# Read the saved merge result back and show the similarity next to both T2 texts.
result = pd.read_csv('../results/mai_sim.csv')
result[['SIM', 'T2_Maisho', 'T2_Mai']]

In [38]:
import re
import MeCab
from gensim import corpora
from typing import List
# Tagger with default options, created once and used by tokenize() below.
parser = MeCab.Tagger()
def tokenize(text: str) -> List[str]:
    """Split ``text`` into morphemes and reduce each one to its base form.

    Every output line of the global ``parser`` is read as
    ``surface<TAB>comma-separated features``. Morphemes whose surface contains
    one of the listed punctuation/symbol characters are dropped. For the rest,
    feature field 6 is used, falling back to the surface when that field is '*'.

    >>> tokenize('自分の目標を達成した;●ゴルフの選手')
    ['自分', 'の', '目標', 'を', '達成', 'する', 'た', 'ゴルフ', 'の', '選手']
    """
    lemmas: List[str] = []
    for line in parser.parse(text).splitlines():
        # 'EOS' only marks the end of the parser output; it carries no morpheme.
        if line != 'EOS':
            surface, features = line.split('\t')
            fields = features.split(',')

            # Keep the morpheme unless its surface contains a stop character.
            if re.search(r'[。、;◆◇●,★〓「」()<>…]', surface) is None:
                # assumes field 6 is the base form (IPADIC-style layout) — TODO confirm
                base = fields[6]
                lemmas.append(surface if base == '*' else base)

    return lemmas

# Store the tokenize() result of each T2 text in a new 'Token' column of both frames.
mai['Token'] = mai.T2.map(tokenize)
maisho['Token'] = maisho.T2.map(tokenize)

In [39]:
# Token lists of every mai row followed by every maisho row.
# This rebinds `documents`, which earlier held the Doc2Vec TaggedDocuments.
documents = mai['Token'].tolist() + maisho['Token'].tolist()
dic = corpora.Dictionary(documents)
# Prune the vocabulary with no_below=20 (minimum document count) and
# no_above=0.3 (maximum fraction of documents).
dic.filter_extremes(no_below=20, no_above=0.3)
# Bag-of-words vector for each document, in the same order as `documents`.
bow_corpus = list(map(dic.doc2bow, documents))

In [40]:
from gensim import models
 
# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
tfidf_model = models.TfidfModel(bow_corpus) 
tfidf_corpus = tfidf_model[bow_corpus]
# Fit a 400-topic LSI model on the TF-IDF corpus.
lsi_model = models.LsiModel(tfidf_corpus, id2word=dic, num_topics=400) 
# lsi_corpus is the TF-IDF corpus transformed by the LSI model.
lsi_corpus = lsi_model[tfidf_corpus] 
lsi_model.save('../models/lsi_topics400.model')
# Reference kept by the author (retrieving documents by id / serialising the corpus):
# https://stackoverflow.com/questions/28488714/retrieve-string-version-of-document-by-id-in-gensim
# MmCorpus.serialize(outp + '_bow.mm', corpus, progress_cnt=10000, metadata=True)

In [41]:
from gensim import similarities
lsi_model = models.LsiModel.load('../models/lsi_topics400.model')

from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
from gensim.similarities import Similarity
index_tmpfile = get_tmpfile("index")
index = similarities.MatrixSimilarity(lsi_model[lsi_corpus])

# https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html#sphx-glr-auto-examples-core-run-similarity-queries-py
# https://radimrehurek.com/gensim/similarities/docsim.html
# index = Similarity(index_tmpfile, lsi_corpus, num_features=len(dic)) # build the index
# the batch is simply an iterable of documents, aka gensim corpus:
# for similarities in index:
#     pass

index.save('../models/lsi.index')

In [42]:
from gensim import similarities
index = similarities.MatrixSimilarity.load('../models/lsi.index')

result_data = []
for i, doc in enumerate(lsi_corpus[len(mai):]):
    vec_lsi = lsi_model[doc]
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    maisho_txt = ''.join(documents[len(mai) + i])
    for idx, sim in sims:
        if idx < len(mai):
            result_data.append((maisho_txt, i, ''.join(documents[idx]), idx, sim))
            break

result_df = pd.DataFrame(result_data, columns=['Maisho_Txt','Maisho_Pos', 'Mai_Txt', 'Mai_Pos', 'Sim'])
result_df.to_csv('../results/mai_LSI_sim.csv', index=False)
result_df