<a href="https://colab.research.google.com/github/franseal/project_X/blob/master/%E5%B7%9D%E6%9F%B3%E7%94%9F%E6%88%90%E5%99%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 準備

In [0]:
import numpy as np

In [0]:
# janome: tokenizer imported in the generation section, where token readings
# and parts of speech are used to filter candidates.
!pip install janome

In [0]:
# System packages for MeCab (binary, development headers, UTF-8 IPA dictionary)
# plus assorted build/download tools.
# NOTE(review): swig is presumably needed to build the mecab-python3 bindings — confirm.
!apt-get -q -y install swig
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
# Python bindings that provide the `MeCab` module imported in the tokenisation section.
!pip install mecab-python3

In [0]:
# Chainer: deep-learning framework used for the model and the training loop.
!pip install chainer
# CuPy: GPU array library imported as `cp`.
# NOTE(review): the "cuda92" wheel name suggests a CUDA 9.2 build; presumably it
# must match the CUDA version of the runtime — confirm before running.
!pip install cupy-cuda92

In [0]:
# tqdm: progress bars used by the tokenisation and training loops.
!pip install tqdm

In [0]:
import chainer
# Print Chainer's runtime report so the installed environment can be checked
# before any training is attempted.
chainer.print_runtime_info()

In [0]:
# Download the corpus file neko.txt, which is opened and read in the
# tokenisation section of this notebook.
!wget http://www.cl.ecei.tohoku.ac.jp/nlp100/data/neko.txt

# 分かち書き

In [0]:
from tqdm import tqdm
import MeCab
import numpy as np

In [0]:
# MeCab tagger using the "chasen" output format; the tokenisation loop splits
# each output row on tabs and keeps the first field.
# NOTE(review): the "-Ochasen" format depends on the installed dictionary's
# configuration — confirm it is available with the mecab-python3 version in use.
mecab = MeCab.Tagger("-Ochasen")

In [0]:
from google.colab import files
# Ask the user to upload files into the Colab working directory.
# `uploaded` is not referenced anywhere else in the visible notebook.
# NOTE(review): neko.txt is already fetched with wget earlier, so this manual
# upload looks optional — confirm whether it is still needed.
uploaded = files.upload()

In [0]:
# Read the corpus as a list of lines. The context manager closes the file
# handle deterministically, and the explicit encoding avoids depending on the
# platform default (assumes neko.txt is UTF-8 — confirm against the download).
with open("neko.txt", encoding="utf-8") as f:
  text = f.readlines()

In [0]:
# Tokenise every corpus line with MeCab and collect the first tab-separated
# field of each output row. Rows with three or fewer fields are skipped.
wakati = []
for line in tqdm(text):
  split_rows = (raw.split("\t") for raw in mecab.parse(line).split("\n"))
  wakati.extend(fields[0] for fields in split_rows if len(fields) > 3)

In [0]:
# Convert the token list to a NumPy array so that it can be filtered with a
# boolean mask when the sentences are built.
wakati = np.array(wakati)

In [0]:
from collections import defaultdict

# Vocabulary: each word gets the next free integer id the first time it is
# looked up (the default factory returns the current dictionary size).
word2id = defaultdict(lambda: len(word2id))
for w in wakati:
  word2id[w]  # lookup purely for its side effect of registering the word

# Freeze the vocabulary. Without this, any later lookup of an unseen word would
# silently create an id >= the vocabulary size the model is built with, which
# is out of range for the embedding/output layers and missing from id2word.
# With default_factory set to None such a lookup raises KeyError instead.
word2id.default_factory = None

len(word2id)

In [0]:
# Inverse vocabulary: integer id -> word.
id2word = dict(zip(word2id.values(), word2id.keys()))
len(id2word)

In [0]:
# Drop the very first token (presumably a heading in the corpus — confirm) and
# every full-width space token ("\u3000").
_wakati = wakati[1:]
_wakati = _wakati[_wakati != "\u3000"]

# Cut the token stream into sentences; each sentence ends with, and includes,
# the full stop "。". Tokens after the last full stop are not collected.
sentences = []
begin = 0
for pos in range(len(_wakati)):
  if _wakati[pos] != "。":
    continue
  end = pos + 1
  sentences.append(_wakati[begin:end])
  begin = end

print(len(sentences))
sentences[0]

In [0]:
def wlist_to_id(sen):
  """Return the list of word2id ids for the words in `sen`, in order."""
  ids = []
  for word in sen:
    ids.append(word2id[word])
  return ids

# Training corpus: one list of word ids per sentence.
train_data = list(map(wlist_to_id, sentences))
train_data[0]

# ネットワーク

In [0]:
import chainer
from chainer.backends import cuda
from chainer import Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
import cupy as cp
import random
import copy

In [0]:
class SentenceGenerator(Chain):
  """Word-level language model: embedding -> 2-layer LSTM -> linear output.

  Args:
    vocab_size: number of distinct word ids (size of the embedding table and
      of the output layer).
    n_embed: dimensionality of the word embeddings.
    n_mid: number of hidden units in each LSTM layer.
  """

  def __init__(self, vocab_size, n_embed=100, n_mid=400):
    super().__init__()
    with self.init_scope():
      self.embed = L.EmbedID(vocab_size, n_embed)
      self.lstm = L.NStepLSTM(n_layers=2, in_size=n_embed, out_size=n_mid, dropout=0)
      self.out = L.Linear(n_mid, vocab_size)

  def __call__(self, list_of_sentences):
    """Compute the next-word prediction loss for a batch of sentences.

    Args:
      list_of_sentences: list of 1-D integer arrays of word ids, one per
        sentence (assumed to live on the same device as the model).

    Returns:
      The sum over sentences of the softmax cross entropy between the
      prediction at each position and the word that follows it.
    """
    # Inputs are every word but the last; targets are every word but the first.
    xs = [self.embed(Variable(sen[:-1])) for sen in list_of_sentences]
    ts = [Variable(sen[1:]) for sen in list_of_sentences]

    # Initial hidden/cell states of None let the LSTM start from its default state.
    _, _, ys = self.lstm(None, None, xs)
    loss = 0.0
    for y, t in zip(ys, ts):
      loss += F.softmax_cross_entropy(self.out(y), t)
    return loss

  def predict(self, prefix):
    """Return the next-word probability distribution after `prefix`.

    Args:
      prefix: 1-D integer array of word ids.

    Returns:
      Raw array (not a Variable) of length vocab_size holding the softmax
      probabilities for the word following the last element of `prefix`.
    """
    # Inference only: skip building the backward graph so that repeated calls
    # during generation do not hold on to intermediate activations.
    with chainer.no_backprop_mode():
      xs = [self.embed(Variable(prefix))]
      _, _, ys = self.lstm(None, None, xs)
      z = self.out(ys[0])
      return F.softmax(z)[-1].data

In [0]:
# Build the language model with one embedding row / output unit per vocabulary entry.
model = SentenceGenerator(vocab_size=len(word2id))
# Move the model parameters to GPU device 0; batches are created with cupy to match.
model.to_gpu(0)

# Adam optimizer with its default hyper-parameters, attached to the model.
optimizer = optimizers.Adam()
optimizer.setup(model)

# 学習

In [0]:
from tqdm import trange

batchsize = 128
for epoch in range(30):  # number of training epochs
  # Visit the sentences in a fresh random order every epoch.
  order = np.random.permutation(len(train_data))
  sum_loss = 0.0
  n = 0

  for start in trange(0, len(train_data), batchsize):
    batch_ids = order[start:start + batchsize]
    # Move each sentence of the mini-batch to the GPU as its own array.
    batch = [cp.array(train_data[k]) for k in batch_ids]

    model.cleargrads()
    loss = model(batch)
    loss.backward()
    optimizer.update()

    sum_loss += loss.data
    n += len(batch_ids)

  # The model returns a per-batch sum over sentences, so this is the mean
  # loss per sentence for the epoch.
  print("Epoch {} : loss {}".format(epoch, sum_loss / n))

# 生成
引数は順に、`sen`、最初の生成で上位いくつのワードを取ってくるか（`seisei`）、以降の生成回数（`repeat`）

In [0]:
from janome.tokenizer import Tokenizer
import re
# Punctuation marks (Japanese comma and full stop) that the candidate filters
# in first/second/third look for in the generated text.
target_word1 = "、"
target_word2 = "。"

In [0]:
def first(sen, seisei):
  """Propose candidates for the opening 5-mora phrase.

  Each of the model's `seisei` most probable next words is appended to `sen`.
  A candidate is kept when its text contains neither "、" nor "。", the
  readings of its tokens add up to exactly 5 characters (small ャュョヮ are not
  counted), and its last token is a particle (助詞).

  Args:
    sen: list of word ids used as the seed prefix.
    seisei: how many top-ranked next words to try.

  Returns:
    List of word-id lists (the seed plus one word) that pass the filter.
  """
  small_kana = re.compile(r"[ャュョヮ]")

  with chainer.using_config("train", False):
    pr = model.predict(cp.array(sen))
  # Word ids sorted from most to least probable.
  ranked = np.argsort(chainer.cuda.to_cpu(pr))[::-1]
  # Slicing (rather than indexing) tolerates seisei larger than the vocabulary.
  keeplist = [list(sen) + [int(wid)] for wid in ranked[:max(seisei, 0)]]

  t = Tokenizer()
  result = []
  for cand in keeplist:
    s = "".join(id2word[wid] for wid in cand)
    if target_word1 in s or target_word2 in s:
      # Reject only this candidate. Stopping the whole scan here would throw
      # away every lower-ranked candidate as soon as one contains punctuation.
      continue
    wordcount = 0
    token = None
    for token in t.tokenize(s, stream=True):
      reading = token.reading
      wordcount += len(reading) - len(small_kana.findall(reading))
    # `token` is the last token of the candidate; a count of 5 implies at
    # least one token was produced.
    if wordcount == 5 and token.part_of_speech.split(",")[0] == "助詞":
      result.append(cand)

  return result

In [0]:
def second(sen, seisei, repeat):
  """Propose candidates that extend `sen` up to the end of the middle phrase.

  Candidates are grown breadth-first: `sen` is extended by the model's
  `seisei` most probable next words, then the first `repeat` candidates in the
  list are each extended the same way and their extensions appended.
  A candidate is kept when its text contains neither "、" nor "。", the
  readings of its tokens add up to exactly 12 characters (small ャュョヮ are not
  counted), and its last token is a particle (助詞).

  Args:
    sen: list of word ids forming the prefix to extend.
    seisei: how many top-ranked next words to try per expansion.
    repeat: how many candidates to expand after the initial expansion.

  Returns:
    List of word-id lists that pass the filter.
  """
  small_kana = re.compile(r"[ャュョヮ]")
  top_n = max(seisei, 0)

  def expand(prefix):
    # Return `prefix` extended by each of the model's top_n next words.
    with chainer.using_config("train", False):
      pr = model.predict(cp.array(prefix))
    ranked = np.argsort(chainer.cuda.to_cpu(pr))[::-1]
    return [list(prefix) + [int(wid)] for wid in ranked[:top_n]]

  keeplist = expand(sen)
  for i in range(repeat):
    if i >= len(keeplist):
      # Nothing left to expand (only possible when no candidates were made).
      break
    keeplist.extend(expand(keeplist[i]))

  t = Tokenizer()
  result = []
  for cand in keeplist:
    s = "".join(id2word[wid] for wid in cand)
    if target_word1 in s or target_word2 in s:
      # Reject only this candidate. Stopping the whole scan here would throw
      # away every later candidate as soon as one contains punctuation.
      continue
    wordcount = 0
    token = None
    for token in t.tokenize(s, stream=True):
      reading = token.reading
      wordcount += len(reading) - len(small_kana.findall(reading))
    # `token` is the last token of the candidate; a count of 12 implies at
    # least one token was produced.
    if wordcount == 12 and token.part_of_speech.split(",")[0] == "助詞":
      result.append(cand)

  return result

In [0]:
def third(sen, seisei, repeat):
  """Propose candidates that extend `sen` to a complete 17-character poem.

  Candidates are grown breadth-first: `sen` is extended by the model's
  `seisei` most probable next words, then the first `repeat` candidates in the
  list are each extended the same way and their extensions appended.
  A candidate is kept when its text contains neither "、" nor "。" and the
  readings of its tokens add up to exactly 17 characters (small ャュョヮ are not
  counted). Unlike first/second, the last token need not be a particle.

  Args:
    sen: list of word ids forming the prefix to extend.
    seisei: how many top-ranked next words to try per expansion.
    repeat: how many candidates to expand after the initial expansion.

  Returns:
    List of word-id lists that pass the filter.
  """
  small_kana = re.compile(r"[ャュョヮ]")
  top_n = max(seisei, 0)

  def expand(prefix):
    # Return `prefix` extended by each of the model's top_n next words.
    with chainer.using_config("train", False):
      pr = model.predict(cp.array(prefix))
    ranked = np.argsort(chainer.cuda.to_cpu(pr))[::-1]
    return [list(prefix) + [int(wid)] for wid in ranked[:top_n]]

  keeplist = expand(sen)
  for i in range(repeat):
    if i >= len(keeplist):
      # Nothing left to expand (only possible when no candidates were made).
      break
    keeplist.extend(expand(keeplist[i]))

  t = Tokenizer()
  result = []
  for cand in keeplist:
    s = "".join(id2word[wid] for wid in cand)
    if target_word1 in s or target_word2 in s:
      # Reject only this candidate. Stopping the whole scan here would throw
      # away every later candidate as soon as one contains punctuation.
      continue
    wordcount = 0
    for token in t.tokenize(s, stream=True):
      reading = token.reading
      wordcount += len(reading) - len(small_kana.findall(reading))
    if wordcount == 17:
      result.append(cand)

  return result

In [0]:
f_rank = 80 # opening 5-mora phrase: number of top-ranked words tried (seisei of first)
s_rank = 10 # middle 7-mora phrase: number of top-ranked words tried per expansion (seisei of second)
s_create = 20 # middle 7-mora phrase: number of candidates expanded (repeat of second)
t_rank = 10 # closing 5-mora phrase: number of top-ranked words tried per expansion (seisei of third)
t_create = 20 # closing 5-mora phrase: number of candidates expanded (repeat of third)

sen = [word2id["男"]] # seed word handed to the generator, as a one-element list of word ids

In [0]:
# Stage 1: opening-phrase candidates grown from the seed `sen`.
# The loops below use their own variable names so that `sen` keeps its seed
# value; re-running this cell therefore starts from the same seed every time.
res = first(sen, f_rank)
print(res)

# Stage 2: extend every opening phrase through the middle phrase.
res2 = []
for prefix in res:
  res2.extend(second(prefix, s_rank, s_create))
print(res2)

# Stage 3: extend every two-phrase candidate to a complete poem.
res3 = []
for prefix in res2:
  res3.extend(third(prefix, t_rank, t_create))
print(res3)

# Decode the finished poems from word ids back to text.
for poem in res3:
  print("".join(id2word[wid] for wid in poem))