### LSTM 사용 문장 생성 구현

In [1]:
from nn_layers import softmax,Rnnlm,BetterRnnlm,RnnlmTrainer
import numpy as np
from dataset import ptb

In [2]:
class RnnlmGen(Rnnlm):
    """Rnnlm subclass that adds sampling-based text generation."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word ids, beginning with ``start_id``.

        Args:
            start_id: id of the first word; it is also the first element of
                the returned list.
            skip_ids: optional collection of word ids that must never be
                emitted. ``None`` disables filtering.
            sample_size: total number of ids to return, including
                ``start_id``.

        Returns:
            A list of plain Python ints of length ``sample_size`` (or just
            ``[start_id]`` when ``sample_size`` is 1 or less).
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            # predict() receives a (1, 1) array holding the previous word id.
            score = self.predict(np.array(x).reshape(1, 1))
            # Flatten the scores and turn them into one probability per word.
            p = softmax(score.flatten())

            # Rejection-sample from the SAME distribution until a permitted
            # id comes up. The previous version called predict() again with
            # the unchanged input whenever a skipped id was drawn; assuming
            # predict() advances the recurrent state (NOTE(review): confirm
            # in nn_layers), that fed the previous word twice.
            # NOTE: this never terminates if all probability mass sits on
            # skip_ids.
            while True:
                # Index the size-1 result explicitly: int() on a 1-element
                # ndarray is deprecated since NumPy 1.25.
                sampled = int(np.random.choice(len(p), size=1, p=p)[0])
                if (skip_ids is None) or (sampled not in skip_ids):
                    break

            x = sampled
            word_ids.append(sampled)

        return word_ids

    def get_state(self):
        """Return the ``(h, c)`` attributes of ``self.lstm_layer``."""
        return self.lstm_layer.h, self.lstm_layer.c

    def set_state(self, state):
        """Unpack ``state`` and pass it to ``self.lstm_layer.set_state``."""
        self.lstm_layer.set_state(*state)

In [20]:
# Load the PTB training split together with its vocabulary lookup tables.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)
print(vocab_size, corpus_size)

# Build the generator and restore previously trained parameters,
# so no training is needed in this notebook.
model = RnnlmGen()
model.load_params('Rnnlm.pkl')

# Seed word for generation.
start_word = 'you'
start_id = word_to_id[start_word]
print(start_id)  # 316 in the recorded run

# Placeholder tokens introduced by preprocessing; never emit them.
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[word] for word in skip_words]
print(skip_ids)

# Sample 100 word ids, starting from start_id and avoiding skip_ids.
word_ids = model.generate(start_id, skip_ids, sample_size=100)

# Join the sampled words into one string; each <eos> marker becomes a
# period followed by a line break.
txt = ' '.join(id_to_word[i] for i in word_ids).replace(' <eos>', '.\n')
print(txt)

10000 929589
316
[27, 26, 416]
you low-income years syndrome dataproducts missile philadelphia remarkable terminal intensify encountered colonial lies proving goodyear label send accrued flashy ill. peaked rarely fitness bank time redford hutton breakers notified eroding advantages feeds wives another confirm wheels batibot unavailable movies peddling morale l. collar barney south iranian shrank across ssangyong trading occurred outside i relied alaska burns 20-year pasadena luxury convert school photography discouraging offshore die differ destroying complaints vaccine mentioned smallest lay ago builds foreigners fad undoubtedly machinists non-violent accountability incorporated aer fully donuts invites nonperforming prescription greece female tariffs crash bikers novels blocked create best-known energy fiscal respected set


### 더 좋은 문장으로 : 2층 LSTM, Dropout, 가중치 공유 사용

In [21]:
class BetterRnnlmGen(BetterRnnlm):
    """BetterRnnlm subclass that adds sampling-based text generation."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word ids, beginning with ``start_id``.

        Args:
            start_id: id of the first word; it is also the first element of
                the returned list.
            skip_ids: optional collection of word ids that must never be
                emitted. ``None`` disables filtering.
            sample_size: total number of ids to return, including
                ``start_id``.

        Returns:
            A list of plain Python ints of length ``sample_size`` (or just
            ``[start_id]`` when ``sample_size`` is 1 or less).
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            # predict() receives a (1, 1) array holding the previous word id.
            score = self.predict(np.array(x).reshape(1, 1))
            # Flatten the scores and turn them into one probability per word.
            p = softmax(score.flatten())

            # Rejection-sample from the SAME distribution until a permitted
            # id comes up. The previous version called predict() again with
            # the unchanged input whenever a skipped id was drawn; assuming
            # predict() advances the recurrent state (NOTE(review): confirm
            # in nn_layers), that fed the previous word twice.
            # NOTE: this never terminates if all probability mass sits on
            # skip_ids.
            while True:
                # Index the size-1 result explicitly: int() on a 1-element
                # ndarray is deprecated since NumPy 1.25.
                sampled = int(np.random.choice(len(p), size=1, p=p)[0])
                if (skip_ids is None) or (sampled not in skip_ids):
                    break

            x = sampled
            word_ids.append(sampled)

        return word_ids

    def get_state(self):
        """Return a list of ``(h, c)`` pairs, one per entry of ``self.lstm_layers``."""
        return [(layer.h, layer.c) for layer in self.lstm_layers]

    def set_state(self, states):
        """Pass each ``(h, c)`` pair in ``states`` to the matching LSTM layer."""
        for layer, state in zip(self.lstm_layers, states):
            layer.set_state(*state)

In [29]:
# Load the PTB training split together with its vocabulary lookup tables.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)
print(vocab_size, corpus_size)

# Use the generator built on BetterRnnlm: the parameters in
# 'BetterRnnlm.pkl' belong to that model, not to the plain RnnlmGen
# that was instantiated here before.
model = BetterRnnlmGen()
model.load_params('BetterRnnlm.pkl')  # restore previously trained parameters

# Seed word for generation.
start_word = 'you'
start_id = word_to_id[start_word]
print(start_id)  # 316 in the recorded run

# Placeholder tokens introduced by preprocessing; never emit them.
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]
print(skip_ids)

# Sample 100 word ids, starting from start_id and avoiding skip_ids.
word_ids = model.generate(start_id, skip_ids, 100)

# Join the sampled words into one string; each <eos> marker becomes a
# period followed by a line break.
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)  # output differs on every run

10000 929589
316
[27, 26, 416]
you fireman precious professor application national reporting winter airplanes vicar fairness woods interest sassy lucky dark until cell views downward born prospectus representatives compact adjusted brewer drafted removal s&ls fierce optimism rampant nikko exist redevelopment there fletcher pace forcing kelly deadline evasion stakes acknowledging competition pain abused barron projections runaway francs detergent lovely retain circle accessories deficits debt supported lawn angels remember nasa whittington enacted national portable chefs disobedience effective original victim toward places write-offs climbed greece tritium customs elections winning gaubert scripts wilfred triggered food driver malcolm differ say rudolph squibb regain squibb low-cost protected expectation discovision courtaulds surprising


### 단어열을 초기 값으로 주고 문장을 생성

In [34]:
model.reset_state()

start_words = 'the meaning of life is'
start_ids = [word_to_id[w] for w in start_words.split(' ')]
print(start_ids)

# Feed every prompt word except the last through the model so that
# generation is conditioned on the whole phrase. Without this step only
# the final word ('is') influenced the output. The predictions themselves
# are discarded.
for prompt_id in start_ids[:-1]:
    model.predict(np.array(prompt_id).reshape(1, 1))

# Generate, using the last prompt word ('is') as the start word.
word_ids = model.generate(start_ids[-1], skip_ids)
# Put the words before 'is' back in front of the generated ids.
word_ids = start_ids[:-1] + word_ids
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)  # output differs on every run

[32, 4748, 42, 2262, 40]
the meaning of life is container fournier soliciting new-issue trendy everything bargain-hunting insider closer window urged maturities clinical hall shed training romantic fcc reflecting bring compensate virtue prediction approached buried aspect piano paid trinova zone underwriting financially sides manage 12-month have spurred greedy proposal depositary affluent quotations due wcrs infected owes vintage harper virus pbs coaches guinness deep sole demise succeeded movements injuries unity escrow jolt fruit complicated academy minimal quotas improves sister confederation cuban rosenthal noncallable pitches looked undo cemetery reassuring coordinate olympics stems anti-nuclear king managua gives anderson championship plaintiff seems those presumed matthews high-definition stolen opponents concept 30-share done beings demler


In [37]:
# For each word of 'the meaning of life', sample one next-word prediction
# and print it. In the recorded run the samples did not reproduce
# 'meaning of life is'.
for prefix_id in start_ids[:-1]:
    model_input = np.array(prefix_id).reshape(1, 1)
    flat_scores = model.predict(model_input).flatten()
    probs = softmax(flat_scores).flatten()
    # Draw one word id according to the predicted distribution.
    drawn = np.random.choice(len(probs), size=1, p=probs)
    print(id_to_word[drawn[0]])

h.f.
spin
lovely
showtime
