In [0]:
from google.colab import files
uploaded = files.upload() # import pol.txt

Saving pol.txt to pol.txt


In [1]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
import numpy as np


Using TensorFlow backend.


In [0]:
# Read file 
path = 'pol.txt'
with open(path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

num_samples = 10000  # Number of samples to train on.

# read lines -> mark start and end of target lines 
input_lines, target_lines = [], []
start_mark, end_mark = '_START_', '_END_'
for line in lines[: min(num_samples, len(lines) - 1)]:
  input_text, target_text, _ = line.split('\t')[:3]
  input_lines.append(input_text)
  target_lines.append(start_mark + ' ' + target_text + ' ' + end_mark)

In [4]:
print(input_lines[:10])
print(target_lines[:10])

['Go.', 'Hi.', 'Run!', 'Run.', 'Run.', 'Who?', 'Wow!', 'Wow!', 'Help!', 'Jump.']
['_START_ Idź. _END_', '_START_ Cześć. _END_', '_START_ Uciekaj! _END_', '_START_ Biegnij. _END_', '_START_ Uciekaj. _END_', '_START_ Kto? _END_', '_START_ O, dziamdzia zaprzała jej szadź! _END_', '_START_ Łał! _END_', '_START_ Pomocy! _END_', '_START_ Skok. _END_']


In [0]:
def do_sth_with_punctuations(lines):
  for i in range(len(lines)):
    for x in ["?", ",", ".", "!"]:
      lines[i] = lines[i].replace(x, " " + x)
    for x in ["'"]:
      lines[i] = lines[i].replace(x, " ")
  
  return lines


In [0]:
input_lines = do_sth_with_punctuations(input_lines)
target_lines = do_sth_with_punctuations(target_lines)

In [7]:
print(input_lines[:10])
print(target_lines[:10])

['Go .', 'Hi .', 'Run !', 'Run .', 'Run .', 'Who ?', 'Wow !', 'Wow !', 'Help !', 'Jump .']
['_START_ Idź . _END_', '_START_ Cześć . _END_', '_START_ Uciekaj ! _END_', '_START_ Biegnij . _END_', '_START_ Uciekaj . _END_', '_START_ Kto ? _END_', '_START_ O , dziamdzia zaprzała jej szadź ! _END_', '_START_ Łał ! _END_', '_START_ Pomocy ! _END_', '_START_ Skok . _END_']


In [0]:
# get set of all words for each language
input_words, target_words = set(), set()
for input_line, target_line in zip(input_lines, target_lines):
  for word in input_line.split():
    input_words.add(word)
  for word in target_line.split():
    target_words.add(word)

In [9]:
print(input_words)
print(target_words)


{'For', 'album', 'safe', 'mustn', 'jumping', 'around', 'closer', 'crazy', 'regret', 'cancel', 'monster', 'roommate', 'shop', 'wasn', 'let', 'Where', 'begin', 'falls', 'plane', 'gas', 'ponies', 'happen', 'shot', 'reserved', 'load', 'belong', 'survived', 'grandfather', 'shirt', 'outside', 'Then', 'mango', 'Wood', '105', 'right', 'charming', 'disagree', 'Flip', 'is', 'bit', 'tired', 'tipsy', 'plead', 'Pass', 'restroom', 'born', 'NHK', 'optimistic', 'gamble', 'eat', 'singing', 'recycle', 'selling', 'likes', 'matters', 'attracted', 'detest', 'closed', 'sealed', 'miracle', 'organized', 'laid', 'Mondays', 'Women', 'acting', 'chop', 'accept', 'dying', 'shout', 'insane', 'powerless', 'habits', 'copilot', 'tall', 'sleeve', 'proud', 'comes', 'glad', 'Thanks', 'being', 'effective', 'anyone', 'Clocks', 'shoes', 'Hit', 'lay', 'luck', 'drugs', 'answered', 'please', 'Tuesday', 'her', 'doctor', 'jail', 'towel', 'psyched', 'melons', 'details', 'punctuality', 'patience', 'idol', 'boyfriend', 'ever', 'Eve

In [10]:
# vectorize the data
input_words = sorted(list(input_words))
target_words = sorted(list(target_words))
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
max_encoder_seq_length = max([len(txt.split()) for txt in input_lines])
max_decoder_seq_length = max([len(txt.split()) for txt in target_lines])

print('Number of samples:', len(input_lines))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

input_token_index = dict(
    [(word, i) for i, word in enumerate(input_words)])
target_token_index = dict(
    [(word, i) for i, word in enumerate(target_words)])

Number of samples: 10000
Number of unique input tokens: 3437
Number of unique output tokens: 6930
Max sequence length for inputs: 9
Max sequence length for outputs: 13


In [0]:
encoder_input_data = np.zeros((len(input_lines), max_encoder_seq_length), dtype='float32')
decoder_input_data = np.zeros((len(target_lines), max_decoder_seq_length), dtype='float32')
decoder_target_data = np.zeros((len(target_lines), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_lines, target_lines)):
    for t, word in enumerate(input_text.split()):
        encoder_input_data[i, t] = input_token_index[word]
    for t, word in enumerate(target_text.split()):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = target_token_index[word]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[word]] = 1.

In [12]:
#Define an input sequence and process it. -> Input words embedding and encoder
latent_dim = 150  # Latent dimensionality of the encoding space.
encoder_inputs = Input(shape=(None,))
encoder = LSTM(latent_dim, return_state=True)
input_embedding = Embedding(num_encoder_tokens, 50)(encoder_inputs)
encoder_outputs, state_h, state_c = encoder(input_embedding)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]






In [0]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
embedding = Embedding(num_decoder_tokens, 50)
decoder_embedding = embedding(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [14]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])





In [28]:
batch_size = 2* 256  # Batch size for training.
epochs = 100  # Number of epochs to train for.

model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size, epochs=epochs, validation_split=0.20)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

<keras.callbacks.History at 0x7fc906999ef0>

In [0]:
# Encoder model 
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_embedding_2= embedding(decoder_inputs)
decoder_outputs_2, state_h_2, state_c_2 = decoder_lstm(
    decoder_embedding_2, initial_state=decoder_states_inputs)
decoder_states_2 = [state_h_2, state_c_2]
decoder_outputs_2 = decoder_dense(decoder_outputs_2)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_2] + decoder_states_2)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_word_index = dict(
    (i, word) for word, i in input_token_index.items())
reverse_target_word_index = dict(
    (i, word) for word, i in target_token_index.items())


In [0]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))

    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index[start_mark]

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
      output_tokens, h, c = decoder_model.predict(
        [target_seq] + states_value)
        
      # Sample a token
      sampled_token_index = np.argmax(output_tokens[0, -1, :])
      sampled_char = reverse_target_word_index[sampled_token_index]
      
      # Exit condition: either hit max length
      # or find stop character.
      if (sampled_char == end_mark or
        len(decoded_sentence) > max_decoder_seq_length + 1):
          stop_condition = True
      else:
        # add to sequence 
        decoded_sentence += ' ' + sampled_char
      
      # Update the target sequence (of length 1).
      target_seq = np.zeros((1,1))
      target_seq[0, 0] = sampled_token_index

      # Update states
      states_value = [h, c]
    
    
    return decoded_sentence


In [31]:
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_lines[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: Go .
Decoded sentence:  Idź .
-
Input sentence: Hi .
Decoded sentence:  Cześć .
-
Input sentence: Run !
Decoded sentence:  Uciekaj !
-
Input sentence: Run .
Decoded sentence:  Biegnij .
-
Input sentence: Run .
Decoded sentence:  Biegnij .
-
Input sentence: Who ?
Decoded sentence:  Kto ?
-
Input sentence: Wow !
Decoded sentence:  Łał !
-
Input sentence: Wow !
Decoded sentence:  Łał !
-
Input sentence: Help !
Decoded sentence:  Pomocy !
-
Input sentence: Jump .
Decoded sentence:  Skok .
-
Input sentence: Stop !
Decoded sentence:  Zatrzymaj się !
-
Input sentence: Stop !
Decoded sentence:  Zatrzymaj się !
-
Input sentence: Begin .
Decoded sentence:  Zaczynajcie .
-
Input sentence: Begin .
Decoded sentence:  Zaczynajcie .
-
Input sentence: Hello !
Decoded sentence:  Cześć .
-
Input sentence: Hurry !
Decoded sentence:  Pośpiesz się !
-
Input sentence: I see .
Decoded sentence:  Rozumiem .
-
Input sentence: I see .
Decoded sentence:  Rozumiem .
-
Input sentence: I try .
Dec

In [0]:
from google.colab import files

model.save('model_p2.h5')
encoder_model.save('encoder_model_p2.h5')

files.download('model_p2.h5') 
files.download('encoder_model_p2.h5') 