# 毎日新聞データからのモデル作成
一般向けと小学生向け記事の対応付け
## データクリーニング

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec
# Load the 2018 "maisho" corpus CSV (presumably the elementary-school edition
# referred to in the notebook title -- TODO confirm against the data source).
maisho = pd.read_csv('../data/毎日新聞コーパス/maisho2018-utf8.csv')
# maisho  # uncomment to preview the loaded table

In [None]:
from mojimoji import zen_to_han 
# Give the columns the short field codes used throughout this notebook.
maisho.columns = ['ID', 'C0', 'AD', 'AE', 'AF', 'ZZ', 'T1', 'S1', 'S2', 'T2', 'KB']

# Discard rows flagged '著作権無' in ZZ, then rows whose headline (T1)
# contains the '新聞休みます' notice.
maisho.drop(maisho[maisho['ZZ'] == '著作権無'].index, inplace=True)
maisho.drop(maisho[maisho['T1'].str.contains('新聞休みます')].index, inplace=True)


def _to_halfwidth(cell):
    """Stringify a cell and convert full-width characters to half-width (kana excluded)."""
    return zen_to_han(str(cell), kana=False)


# Every cell becomes a str here, so all columns (including ID) are strings afterwards.
maisho = maisho.applymap(_to_halfwidth)
# Keep only the first article for each distinct body text (T2).
maisho.drop_duplicates(subset='T2', inplace=True)
# maisho  # uncomment to preview

In [None]:
# Remove furigana: a parenthesised run that consists only of hiragana.
# The pattern is a raw string (the old '\(' was an invalid escape sequence) and
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged.
maisho.T2 = maisho.T2.str.replace(r'\([\u3041-\u309F]+\)', '', regex=True)
# Remove unwanted symbols (character class, so it must also be a regex).
maisho.T2 = maisho.T2.str.replace(r'[;◆◇▽●★〓…]', '', regex=True)
# maisho.T2  # uncomment to inspect the cleaned text

In [None]:
# Character count of each article body, used only for inspection and filtering.
maisho['T2_len'] = maisho['T2'].str.len()
print(maisho['T2_len'].describe())
maisho['T2_len'].plot.hist(bins=200, range=(50, 2000))
# maisho.to_csv('../results/maisho_len.csv', index=False)
# print(maisho[maisho.T2_len < 100][['T1', 'T2']])

# Drop articles shorter than 100 characters, then the helper column and the
# columns that are not used further on.
maisho.drop(maisho[maisho['T2_len'] < 100].index, inplace=True)
maisho.drop(columns=['T2_len', 'AE', 'ZZ', 'S2'], inplace=True)
maisho

In [None]:
# Load the 2018 "mai" corpus CSV (presumably the general-audience edition
# referred to in the notebook title -- TODO confirm against the data source).
# Unlike `maisho`, the column names are taken from the file's own header.
mai = pd.read_csv('../data/毎日新聞コーパス/mai2018-utf8.csv')
# mai  # uncomment to preview the loaded table

In [None]:
# Discard rows flagged '著作権無' in ZZ.
mai.drop(mai.index[mai.ZZ == '著作権無'], inplace=True)
# Discard articles whose headline (T1) matches any of the patterns below.
# The alternatives are joined into one regex so the headlines are scanned once
# instead of once per pattern; the set of dropped rows is the same union.
mai.drop(
    mai.index[mai.T1.str.contains('|'.join([
        r'(?:朝刊|夕刊)休みます',
        r'東日本大震災・(?:空間|大気中の環境)放射線量',
        r'当選番号',
        r'ロシアW杯:',
        r'全国高校ラグビー:',
        r'米大リーグ:',
        r'プロ野球:',
        r'東京六大学野球:',
        r'ハンドボール:',
    ]))],
    inplace=True,
)
# Discard articles whose body (T2) mentions '当選金額'.
mai.drop(mai.index[mai.T2.str.contains('当選金額')], inplace=True)
# Keep only the first article for each distinct body text.
mai.drop_duplicates(subset='T2', inplace=True)
# Remove furigana: a parenthesised run that consists only of hiragana.
# The pattern is a raw string (the old '\(' was an invalid escape sequence) and
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged.
mai.T2 = mai.T2.str.replace(r'\([\u3041-\u309F]+\)', '', regex=True)
# Remove unwanted symbols (character class, so it must also be a regex).
mai.T2 = mai.T2.str.replace(r'[;◆◇●▽★〓…]', '', regex=True)

In [None]:
# Character count of each article body, used only for inspection and filtering.
mai['T2_len'] = mai['T2'].str.len()
print(mai['T2_len'].describe())
mai['T2_len'].plot.hist(bins=100, range=(0, 200))
# mai.to_csv('../results/mai_len.csv', index=False)

# Drop articles shorter than 150 characters, then the helper column and the
# columns that are not used further on.
mai.drop(mai[mai['T2_len'] < 150].index, inplace=True)
mai.drop(columns=['T2_len', 'AA', 'AB', 'AE', 'ZZ', 'S2'], inplace=True)
mai

## Doc2Vecによる類似文章算出

In [None]:
%%time
import MeCab
from typing import List
# Disabled alternative: locate the mecab-ipadic-neologd dictionary through
# `mecab-config --dicdir` and pass it to the tagger with -d.
# NOTE(review): `sp` is not imported anywhere in the visible cells; this would
# need `import subprocess as sp` before it can be re-enabled.
# nd_path = sp.check_output('echo `mecab-config --dicdir`"/mecab-ipadic-neologd"',
#                           shell=True).decode().strip('\n')
# m = MeCab.Tagger("-Owakati -d " + nd_path)
# Module-level tagger used by wakati(); -Owakati selects space-separated output.
m = MeCab.Tagger("-Owakati")
def wakati(text: str, tagger=None) -> List[str]:
    """Split *text* into a list of tokens using a MeCab wakati tagger.

    Args:
        text: The string to tokenize.
        tagger: Object with a ``parse(str) -> str`` method that returns
            whitespace-separated tokens. Defaults to the module-level
            tagger ``m``.

    Returns:
        The tokens in order, without empty strings or whitespace tokens.
    """
    if tagger is None:
        tagger = m
    # -Owakati output ends with " \n"; split(' ') would keep that "\n" as a
    # bogus final token, so split on any whitespace instead.
    return tagger.parse(text).split()

# Tokenize every article body; the new 'wakati' column holds one token list per row.
for frame in (maisho, mai):
    frame['wakati'] = frame['T2'].map(wakati)

In [None]:
%%time
from multiprocessing import cpu_count

import joblib
from gensim.sklearn_api import D2VTransformer

# Doc2Vec hyper-parameter sets, keyed by the name that ends up in the output file.
settings = {
    "dbow300d": dict(
        size=300,
        iter=20,
        window=15,
        min_count=5,
        dm=0,  # PV-DBOW
        dbow_words=1,
        workers=cpu_count(),
    ),
    "dmpv300d": dict(
        size=300,
        iter=20,
        window=10,
        min_count=2,
        alpha=0.05,
        dm=1,  # PV-DM
        sample=0,
        workers=cpu_count(),
    ),
}

# Train one model per setting on the token lists of both corpora and store the
# resulting document vectors. Rows are ordered mai first, then maisho.
for setting_name, setting in settings.items():
    model = D2VTransformer(**setting)
    docvecs = model.fit_transform(
        pd.concat([mai['wakati'], maisho['wakati']]).to_numpy()
    )
    joblib.dump(
        docvecs,
        f"../models/mai-doc2vec-{setting_name}.joblib",
        compress=True,
    )

In [None]:
%%time
import joblib
from sklearn.metrics.pairwise import cosine_similarity

# Document vectors saved by the training cell (PV-DM variant).
docvecs = joblib.load("../models/mai-doc2vec-dmpv300d.joblib")
# Cosine similarity matrix: the last len(maisho) rows against the first
# len(mai) rows, matching the mai-then-maisho order used at training time.
sim_matrix = cosine_similarity(X=docvecs[-len(maisho):], Y=docvecs[:len(mai)])
# Pairing table: one row per maisho ID, one column per mai ID.
combi_table = pd.DataFrame(data=sim_matrix, index=maisho['ID'], columns=mai['ID'])

In [None]:
%%time
# Best-matching mai article for every maisho article.
# A row-wise idxmax/max replaces the previous stack + full sort, which built a
# long frame with one row per (maisho, mai) pair only to keep the top entry of
# each group. Assumes the similarity matrix contains no NaN values.
top = pd.DataFrame({
    'ID_Maisho': combi_table.index.to_numpy(),
    'ID_Mai': combi_table.idxmax(axis=1).to_numpy(),
    'SIM': combi_table.max(axis=1).to_numpy(),
})
# Most similar pairs first; keep a single row per maisho ID as before.
top = top.sort_values('SIM', ascending=False).drop_duplicates('ID_Maisho')
top

In [None]:
# Merge keys must share a dtype. `top['ID_Mai']` was derived from mai['ID']
# before the cast below, so it is cast as well; otherwise the second merge can
# fail (or match nothing) depending on the order in which cells were run.
mai['ID'] = mai['ID'].astype(str)
top_table = top.assign(ID_Mai=top['ID_Mai'].astype(str))
# Attach the maisho article columns to each matched pair.
top_table = top_table.merge(maisho, left_on='ID_Maisho', right_on='ID').drop(columns='ID')
# Attach the mai article columns; columns present on both sides get the
# '_Maisho' / '_Mai' suffixes.
top_table = top_table.merge(mai, left_on='ID_Mai', right_on='ID', suffixes=('_Maisho', '_Mai')).drop(columns='ID')
top_table.to_csv('../results/mai_doc2vec_sim.csv', index=False)

In [None]:
# Reload the saved Doc2Vec matching results from CSV.
result = pd.read_csv('../results/mai_doc2vec_sim.csv')
# result[['SIM', 'T2_Maisho', 'T2_Mai']]  # uncomment to view score and both bodies

## LSI(潜在的意味索引)による類似文章算出

In [None]:
import re
import MeCab
from typing import List
class Tokenizer:
    """Split text into morphemes with MeCab and lemmatize them.

    Each morpheme is replaced by its base form (removing inflection) when the
    dictionary provides one, otherwise its surface form is kept. Morphemes
    that contain one of the punctuation/marker characters in
    ``_STOP_SYMBOLS`` are dropped.

    The base form is read from the 7th comma-separated feature field, which
    assumes an IPADIC-style feature layout (verify when using another
    dictionary).

    >>> t = Tokenizer()
    >>> t('今日の目標は達成した。;●なので、「次の目標は?」と尋ねる。')
    ['今日', 'の', '目標', 'は', '達成', 'する', 'た', 'だ', 'ので', '次', 'の', '目標', 'は', 'と', '尋ねる']
    """

    # Compiled once instead of being re-parsed for every morpheme.
    _STOP_SYMBOLS = re.compile(r'[。、;◆◇●,★〓「」()<>…]')
    # Position of the base form within the feature list.
    _BASE_FORM_INDEX = 6

    def __init__(self):
        self.parser = MeCab.Tagger()

    def __call__(self, doc: str) -> List[str]:
        """Return the lemmatized tokens of *doc* in order of appearance."""
        morph_list: List[str] = []
        for line in self.parser.parse(doc).splitlines():
            if not line or line == 'EOS':
                continue

            # "<surface>\t<feature,feature,...>"; split on the first tab only
            # and skip lines that do not have that shape instead of raising.
            surface, tab, features = line.partition('\t')
            if not tab:
                continue

            if self._STOP_SYMBOLS.search(surface):
                continue

            # Fall back to the surface form when the feature list is too short
            # to hold a base form or the base form is the '*' placeholder.
            feature_list = features.split(',')
            base_form = (
                feature_list[self._BASE_FORM_INDEX]
                if len(feature_list) > self._BASE_FORM_INDEX
                else '*'
            )
            morph_list.append(base_form if base_form != '*' else surface)

        return morph_list

import doctest
# Run the doctests found in the current module's docstrings
# (this includes the Tokenizer example).
doctest.testmod()

In [None]:
%%time
import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Number of latent topics kept by the SVD step (also part of the output file name).
num_topics = 400

# TF-IDF weighting over tokens produced by the MeCab-based Tokenizer.
vectorizer = TfidfVectorizer(
    tokenizer=Tokenizer(),
    use_idf=True,
    smooth_idf=True,
)
# Truncated SVD on the TF-IDF matrix; TF-IDF followed by SVD is the LSI transform.
svd_model = TruncatedSVD(
    n_components=num_topics,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)
lsi_transformer = Pipeline(steps=[('tfidf', vectorizer), ('svd', svd_model)])

# Fit on the raw bodies of both corpora; rows are ordered mai first, then maisho.
lsi_matrix = lsi_transformer.fit_transform(
    pd.concat([mai['T2'], maisho['T2']]).to_numpy()
)
joblib.dump(
    lsi_matrix,
    f"../models/mai-lsi_topics{num_topics}.joblib",
    compress=True,
)

In [None]:
%%time
import joblib
from sklearn.metrics.pairwise import cosine_similarity

# To reuse a previously saved matrix instead of the in-memory one:
# lsi_matrix = joblib.load("../models/mai-lsi_topics400.joblib")

# Cosine similarity matrix: the last len(maisho) rows against the first
# len(mai) rows, matching the mai-then-maisho order used when fitting.
sim_matrix = cosine_similarity(X=lsi_matrix[-len(maisho):], Y=lsi_matrix[:len(mai)])
# Pairing table: one row per maisho ID, one column per mai ID.
combi_table = pd.DataFrame(data=sim_matrix, index=maisho['ID'], columns=mai['ID'])

In [None]:
%%time
# Best-matching mai article for every maisho article.
# A row-wise idxmax/max replaces the previous stack + full sort, which built a
# long frame with one row per (maisho, mai) pair only to keep the top entry of
# each group. Assumes the similarity matrix contains no NaN values.
top = pd.DataFrame({
    'ID_Maisho': combi_table.index.to_numpy(),
    'ID_Mai': combi_table.idxmax(axis=1).to_numpy(),
    'SIM': combi_table.max(axis=1).to_numpy(),
})
# Most similar pairs first; keep a single row per maisho ID as before.
top = top.sort_values('SIM', ascending=False).drop_duplicates('ID_Maisho')
top

In [None]:
# Merge keys must share a dtype. mai['ID'] is cast to str here, so the
# 'ID_Mai' key derived from it is cast as well; this keeps the second merge
# working regardless of the order in which cells were run.
mai['ID'] = mai['ID'].astype(str)
top_table = top.assign(ID_Mai=top['ID_Mai'].astype(str))
# Attach the maisho article columns to each matched pair.
top_table = top_table.merge(maisho, left_on='ID_Maisho', right_on='ID').drop(columns='ID')
# Attach the mai article columns; columns present on both sides get the
# '_Maisho' / '_Mai' suffixes.
top_table = top_table.merge(mai, left_on='ID_Mai', right_on='ID', suffixes=('_Maisho', '_Mai')).drop(columns='ID')
top_table.to_csv('../results/mai_lsi_sim.csv', index=False)

In [None]:
# Reload the saved LSI matching results from CSV.
result = pd.read_csv('../results/mai_lsi_sim.csv')
# Display the similarity score next to the two matched article bodies.
result[['SIM', 'T2_Maisho', 'T2_Mai']]