In [2]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec
# Read the maisho2018 CSV into a DataFrame and display it.
maisho = pd.read_csv(
    filepath_or_buffer='../data/毎日新聞コーパス/maisho2018-utf8.csv',
)
maisho

In [4]:
from mojimoji import zen_to_han 
# Give the columns the short field codes used throughout the notebook.
maisho.columns = ['ID', 'C0', 'AD', 'AE', 'AF', 'ZZ', 'T1', 'S1', 'S2', 'T2', 'KB']

# Rows to discard:
#   * ZZ marked '著作権無' ("no copyright"),
#   * T1 (headline) containing '新聞休みます' ("no paper today" notices),
#   * rows with no body text at all.
# na=False keeps a missing T1 from putting NaN into the boolean mask, which
# would make the index lookup below raise.
unlicensed = maisho.ZZ == '著作権無'
holiday_notice = maisho.T1.str.contains('新聞休みます', na=False)
# The applymap below stringifies every cell, so a missing body would turn into
# the literal text 'nan' and be kept as a document; drop such rows first.
missing_body = maisho.T2.isna()
maisho.drop(maisho.index[unlicensed | holiday_notice | missing_body], inplace=True)

# Convert every cell to str and normalise full-width characters to half-width
# (kana are left untouched because kana=False).
maisho = maisho.applymap(lambda x: zen_to_han(str(x), kana=False))

# Keep only the first occurrence of each distinct body text.
maisho.drop_duplicates(subset='T2', inplace=True)
maisho

In [3]:
# Remove furigana: strip a parenthesised span only when its content is made up
# entirely of hiragana (U+3041-U+309F).
# The pattern is a raw string so that `\(` is not an invalid string escape, and
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the text unchanged.
maisho.T2 = maisho.T2.str.replace(r'\([\u3041-\u309F]+\)', '', regex=True)
maisho.T2

In [10]:
# Read the mai2018 CSV into a DataFrame and display it.
mai = pd.read_csv(
    filepath_or_buffer='../data/毎日新聞コーパス/mai2018-utf8.csv',
)
mai

In [12]:
# Rows to discard:
#   * ZZ marked '著作権無' ("no copyright"),
#   * T1 (headline) announcing that the morning/evening edition is not published,
#   * T1 (headline) of the recurring radiation-level tables,
#   * bodies that are missing or shorter than 100 characters.
# na=False keeps a missing T1 from putting NaN into the boolean mask, which
# would make the index lookup below raise.
unlicensed = mai.ZZ == '著作権無'
holiday_notice = mai.T1.str.contains('(?:朝刊|夕刊)休みます', na=False)
radiation_table = mai.T1.str.contains('東日本大震災・(?:空間|大気中の環境)放射線量', na=False)
# `NaN < 100` evaluates to False, so a missing body must be tested explicitly;
# otherwise it would pass the length filter and reach the tokenizer as NaN.
too_short = mai.T2.isna() | (mai.T2.str.len() < 100)
mai.drop(mai.index[unlicensed | holiday_notice | radiation_table | too_short], inplace=True)

# Keep only the first occurrence of each distinct body text.
mai.drop_duplicates(subset='T2', inplace=True)

# Remove furigana: parenthesised spans consisting only of hiragana.
# Raw string + explicit regex=True, because pandas >= 2.0 defaults to literal
# matching and would otherwise leave the text unchanged.
mai.T2 = mai.T2.str.replace(r'\([\u3041-\u309F]+\)', '', regex=True)
mai

In [6]:
import MeCab
# Shared tagger, created on first use; building a MeCab.Tagger loads the
# dictionary, so doing it once instead of once per document is much cheaper.
_WAKATI_TAGGER = None


def wakati(text: str) -> str:
    """Return ``text`` segmented into space-separated tokens (wakati-gaki).

    Args:
        text: The sentence or document to tokenize.

    Returns:
        The tokens joined by single spaces, with the trailing space/newline
        that MeCab appends to its output removed.
    """
    global _WAKATI_TAGGER
    if _WAKATI_TAGGER is None:
        # Uses the default dictionary. To use mecab-ipadic-NEologd instead,
        # append "-d <output of `mecab-config --dicdir`>/mecab-ipadic-neologd".
        _WAKATI_TAGGER = MeCab.Tagger("-Owakati")
    # strip() keeps the trailing " \n" from turning into a bogus "\n" token
    # when the caller splits the result on spaces.
    return _WAKATI_TAGGER.parse(text).strip()

# Tokenize the body text (T2) of both corpora into a new 'wakati' column.
for corpus in (maisho, mai):
    corpus['wakati'] = corpus.T2.apply(wakati)

In [7]:
from gensim.models.doc2vec import TaggedDocument
# One TaggedDocument per article from both corpora, tagged with the article ID.
#   * str.split() (no argument) discards the trailing "\n" that MeCab's wakati
#     output ends with, as well as empty strings; split(" ") kept "\n" as a
#     token in every document.
#   * Tags are forced to str: gensim interprets int tags as raw vector indices,
#     and the later lookup/merge steps compare IDs as strings anyway.
documents = [
    TaggedDocument(words=text.split(), tags=[str(doc_id)])
    for df in (mai, maisho)
    for doc_id, text in zip(df['ID'], df['wakati'])
]

In [8]:
from multiprocessing import cpu_count
# Keyword arguments for each Doc2Vec variant, keyed by the name that is
# embedded in the saved model's file name.
settings = dict(
    dbow300d=dict(
        vector_size=300,
        epochs=20,
        window=15,
        min_count=5,
        dm=0,  # PV-DBOW
        dbow_words=1,
        workers=cpu_count(),
    ),
    dmpv300d=dict(
        vector_size=300,
        epochs=20,
        window=10,
        min_count=2,
        alpha=0.05,
        dm=1,  # PV-DM
        sample=0,
        workers=cpu_count(),
    ),
)

# Train one model per configuration on the full document list and save it.
for setting_name, setting in settings.items():
    model = Doc2Vec(documents=documents, **setting)
    model.save(f"../models/mai-doc2vec-{setting_name}.model")

In [9]:
# Load the saved PV-DM model ("dmpv300d"); find_similar below reads this global.
model = Doc2Vec.load("../models/mai-doc2vec-dmpv300d.model")

def find_similar(tag, topn=30):
    """Return the most similar document whose tag does not contain 'S'.

    The ``topn`` nearest neighbours of ``tag`` are scanned in order of
    decreasing similarity and the first one whose tag, as a string, has no
    'S' in it is returned.
    NOTE(review): presumably 'S' marks IDs of the same corpus as the query
    (maisho), so this picks the closest article from the other corpus --
    confirm against the ID scheme.

    Args:
        tag: Document tag to look up in the global ``model``.
        topn: How many nearest neighbours to inspect (default 30).

    Returns:
        ``(tag_as_str, similarity)`` for the first matching neighbour, or
        ``(nan, nan)`` when none of the ``topn`` neighbours qualifies.
    """
    # gensim 4 renamed `docvecs` to `dv`; prefer the new name when present and
    # fall back to the old one so the function works with either version.
    doc_vectors = getattr(model, "dv", None)
    if doc_vectors is None:
        doc_vectors = model.docvecs

    for candidate, sim in doc_vectors.most_similar(tag, topn=topn):
        if 'S' not in str(candidate):
            return str(candidate), sim

    return np.nan, np.nan

# Look up the closest match for every maisho article, then split the
# (id, similarity) pairs into two parallel columns.
sim_ids, sims = zip(*map(find_similar, maisho.ID))
maisho['SIM_ID'] = sim_ids
maisho['SIM'] = sims
maisho[['SIM_ID', 'SIM']]

In [10]:
# Discard articles for which find_similar returned no match (SIM_ID is NaN).
# dropna returns a new frame, so the result has to be assigned back; the bare
# call previously left `maisho` unchanged.
maisho = maisho.dropna(subset=['SIM_ID'])
maisho[['SIM_ID', 'SIM']]

In [11]:
# SIM_ID holds string tags, so make mai's ID column str as well before merging.
mai = mai.astype({'ID': str})

In [12]:
# Attach the matched mai article to each maisho row (inner join on the match ID),
# suffixing overlapping column names by corpus, and persist the joined table.
result = maisho.merge(
    mai,
    left_on='SIM_ID',
    right_on='ID',
    suffixes=('_Maisho', '_Mai'),
)
result.to_csv('../results/mai_sim.csv', index=False)
result

In [13]:
# Reload the saved pairs and show the similarity next to both body texts.
result = pd.read_csv(filepath_or_buffer='../results/mai_sim.csv')
result.loc[:, ['SIM', 'T2_Maisho', 'T2_Mai']]