In [None]:
%tensorflow_version 2.x
import tensorflow as tf
# Abort early unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import pickle
import gzip

In [None]:
# Read the first 30000 rows of the tab-separated training file and pull out
# the two sentence columns.
database = pd.read_csv('train.tsv', sep='\t', nrows=30000)
training_input, training_output = database['sentence1'], database['sentence2']

# Integer codes named for the padding / start tokens, and the default
# sequence lengths.
PADDING_CHAR_CODE, START_CHAR_CODE = 0, 1
DEFAULT_INPUT_LENGTH = DEFAULT_OUTPUT_LENGTH = 20

print(training_input.shape, training_output.shape)

(30000,) (30000,)


In [None]:
import numpy as np

class Embeddings():
    """Load pre-trained word vectors from a text file and align them to a tokenizer.

    The file at ``path`` is read line by line; each line is a token followed by
    its vector components, separated by single spaces.

    Parameters
    ----------
    path : str
        Location of the embeddings text file.
    vector_dimension : int
        Number of columns in the matrix built by ``create_embedding_matrix``.
    """

    def __init__(self, path, vector_dimension):
        self.path = path
        self.vector_dimension = vector_dimension

    @staticmethod
    def get_coefs(word, *arr):
        """Return ``(word, vector)`` with the components converted to float32."""
        return word, np.asarray(arr, dtype='float32')

    def get_embedding_index(self):
        """Parse the embeddings file into a ``{word: float32 vector}`` dict.

        The file is read as UTF-8 (undecodable bytes are dropped) so parsing
        does not depend on the platform's default encoding, and it is closed
        as soon as parsing finishes. Blank lines are skipped. When a word
        appears more than once, the last occurrence wins.
        """
        embeddings_index = {}
        with open(self.path, encoding='utf-8', errors='ignore') as embeddings_file:
            for line in embeddings_file:
                # Drop the trailing newline so it never ends up glued to the
                # last vector component.
                stripped = line.rstrip()
                if not stripped:
                    continue
                word, vector = self.get_coefs(*stripped.split(" "))
                embeddings_index[word] = vector
        return embeddings_index

    def create_embedding_matrix(self, tokenizer, max_features):
        """Build a ``(max_features + 1, vector_dimension)`` embedding matrix.

        Row ``i`` holds the vector of the word whose ``tokenizer.word_index``
        value is ``i``. Rows stay zero for words absent from the embeddings
        file; words with an index above ``max_features`` are ignored.

        Parameters
        ----------
        tokenizer
            Any object exposing a ``word_index`` mapping of word -> int index.
        max_features : int
            Largest word index that receives a row.

        Raises
        ------
        ValueError
            If a stored vector's length differs from ``vector_dimension``
            (raised by NumPy on assignment) instead of silently leaving the
            row at zero.
        """
        model_embed = self.get_embedding_index()

        embedding_matrix = np.zeros((max_features + 1, self.vector_dimension))
        for word, index in tokenizer.word_index.items():
            # Skip rather than stop: this does not rely on word_index being
            # ordered by ascending index.
            if index > max_features:
                continue
            vector = model_embed.get(word)
            # Only a missing word is tolerated; other errors must surface.
            if vector is not None:
                embedding_matrix[index] = vector
        return embedding_matrix

In [None]:
# Preprocess data
import re
def preprocess_sentence(sentence):
    """Normalise a raw sentence before tokenisation.

    Steps, in order: lowercase and trim; surround each of ``? . ! ,`` with
    spaces; replace hyphens with spaces; collapse every run of spaces and/or
    double quotes into a single space; trim again.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    str
        The normalised sentence.
    """
    ret = sentence.lower().strip()
    # Both arguments are raw strings: in a plain string literal "\1" is the
    # control character U+0001, not a reference to the captured group.
    ret = re.sub(r"([?.!,])", r" \1 ", ret)
    # Hyphens are replaced before collapsing whitespace so that "a - b" does
    # not leave a run of spaces behind.
    ret = re.sub("-", " ", ret)
    ret = re.sub('[" "]+', " ", ret)
    return ret.strip()

# Run every sentence on both sides of the pairs through preprocess_sentence.
training_input = [preprocess_sentence(sentence) for sentence in training_input]
training_output = [preprocess_sentence(sentence) for sentence in training_output]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary sizes are fixed at 22000 rather than taken from the tokenizers'
# full word counts.
original_vocab_size = clone_vocab_size = 22000
largest_vocab_size = max(original_vocab_size, clone_vocab_size)

# Sentences to tokens: fit one tokenizer per side of the sentence pairs.
original_tokenizer = Tokenizer(num_words=original_vocab_size)
original_tokenizer.fit_on_texts(training_input)
clone_tokenizer = Tokenizer(num_words=clone_vocab_size)
clone_tokenizer.fit_on_texts(training_output)

print(len(original_tokenizer.word_counts), largest_vocab_size, original_vocab_size, clone_vocab_size)

54015 22000 22000 22000


In [None]:
# Word embedding: one matrix per tokenizer, both built from the same
# 200-dimensional vector file.
embedding = Embeddings(path='glove.6B.200d.txt', vector_dimension=200)

original_embedding_dim = clone_embedding_dim = 200
original_embedding_matrix = embedding.create_embedding_matrix(original_tokenizer, largest_vocab_size)
clone_embedding_matrix = embedding.create_embedding_matrix(clone_tokenizer, largest_vocab_size)

In [None]:
# Get index to word from word to index dict
def inverse_mapping(f):
    """Return a mapping of the same class as ``f`` with keys and values swapped.

    When several keys share a value, the key seen last during iteration wins.
    """
    swapped_pairs = ((value, key) for key, value in f.items())
    return f.__class__(swapped_pairs)

# Build encoding dictionaries (word -> id) and their inverses (id -> word).
original_encoding = dict(original_tokenizer.word_index)
original_decoding = inverse_mapping(original_encoding)
clone_encoding = dict(clone_tokenizer.word_index)
clone_decoding = inverse_mapping(clone_encoding)

# Transform the data: sentences -> lists of token ids.
encoded_training_input = original_tokenizer.texts_to_sequences(training_input)
encoded_training_output = clone_tokenizer.texts_to_sequences(training_output)

# Encoder input: source-side ids padded/truncated to a fixed length.
training_encoder_input = pad_sequences(encoded_training_input, maxlen=DEFAULT_INPUT_LENGTH)

# Decoder target: target-side ids padded/truncated to a fixed length.
encoded_training_output = pad_sequences(encoded_training_output, maxlen=DEFAULT_OUTPUT_LENGTH)

# Decoder input: the target sequence shifted one step to the right, with
# START_CHAR_CODE in the first position. It has to hold ids from
# clone_tokenizer because create_model embeds decoder inputs with
# clone_embedding_matrix; source-side ids would look up unrelated rows.
# NOTE(review): Keras Tokenizer ids start at 1, so START_CHAR_CODE shares its
# id with the most frequent target word - confirm whether a dedicated start
# token is wanted. Any inference code must build decoder inputs the same way.
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:, :-1]
training_decoder_input[:, 0] = START_CHAR_CODE


(30000, 20)


In [None]:
# One hot encoding in chunks: the padded targets are split into `num_chunks`
# equal slices and each slice is one-hot encoded and written to
# pickle/<i>.pkl (gzip-compressed). output_gen reads files 1..30, so exactly
# that many are written; any remainder rows beyond size * num_chunks are not
# written.
num_chunks = 30
size = len(encoded_training_output) // num_chunks

# Build the identity lookup once; indexing it with an id array yields the
# one-hot rows for those ids.
one_hot_lookup = np.eye(clone_vocab_size + 1)
for i in range(1, num_chunks + 1):
    with gzip.open(f'pickle/{i}.pkl', 'wb') as f:
        array = one_hot_lookup[encoded_training_output[size * (i - 1):size * i]]
        pickle.dump(array, f, pickle.HIGHEST_PROTOCOL)
# The lookup is a large square matrix; release it once the chunks are on disk.
del one_hot_lookup

In [None]:
def output_gen():
    """Yield the unpickled contents of pickle/1.pkl through pickle/30.pkl, in order.

    Each file is opened with gzip and stays open until the consumer asks for
    the next item.
    """
    chunk_number = 1
    while chunk_number <= 30:
        with gzip.open(f'pickle/{chunk_number}.pkl', 'rb') as chunk_file:
            yield pickle.load(chunk_file)
        chunk_number += 1

In [None]:
print(size)
def model_generator():
    """Endlessly yield single-sample training batches.

    Every item is ``([encoder_batch, decoder_batch], target_batch)``; each
    array is one row with a leading axis of length 1 added. Targets come from
    the chunks produced by ``output_gen``; once all chunks are consumed the
    cycle starts over.
    """
    while True:
        for chunk_index, chunk in enumerate(output_gen()):
            for row in range(size):
                # Position of this row in the full (unchunked) input arrays.
                sample = size * chunk_index + row
                encoder_batch = np.expand_dims(training_encoder_input[sample], axis=0)
                decoder_batch = np.expand_dims(training_decoder_input[sample], axis=0)
                target_batch = np.expand_dims(chunk[row], axis=0)
                yield ([encoder_batch, decoder_batch], target_batch)

1000


In [None]:
def create_model(
        input_length=20,
        output_length=20):
    """Build and compile the LSTM encoder-decoder.

    The encoder embeds its input with ``original_embedding_matrix`` (frozen)
    and runs a 64-unit LSTM whose final states initialise the decoder LSTM.
    The decoder embeds its input with ``clone_embedding_matrix`` (frozen) and
    emits, for every time step, ``clone_vocab_size + 1`` unnormalised scores
    (logits).

    Parameters
    ----------
    input_length : int
        Number of time steps of the encoder input.
    output_length : int
        Number of time steps of the decoder input (and of the output).

    Returns
    -------
    tf.keras.Model
        Compiled model taking ``[encoder_input, decoder_input]``.
    """
    encoder_input = tf.keras.Input(shape=(input_length,))
    decoder_input = tf.keras.Input(shape=(output_length,))

    encoder = tf.keras.layers.Embedding(
        original_embedding_matrix.shape[0],
        original_embedding_dim,
        weights=[original_embedding_matrix],
        trainable=False)(encoder_input)
    # Only the final hidden and cell states are passed on to the decoder.
    encoder, h_encoder, u_encoder = tf.keras.layers.LSTM(64, return_state=True)(encoder)

    decoder = tf.keras.layers.Embedding(
        clone_embedding_matrix.shape[0],
        clone_embedding_dim,
        weights=[clone_embedding_matrix],
        trainable=False)(decoder_input)
    decoder = tf.keras.layers.LSTM(64, return_sequences=True)(decoder, initial_state=[h_encoder, u_encoder])
    decoder = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(clone_vocab_size+1))(decoder)

    model = tf.keras.Model(inputs=[encoder_input, decoder_input], outputs=[decoder])
    # Each time step is a multi-class choice over the vocabulary with one-hot
    # targets, so train with categorical cross-entropy on the logits rather
    # than mean squared error. The output layer itself stays linear.
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'])

    return model

model = create_model()

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

# batch_size is not passed: the generator already yields complete batches.
# NOTE(review): model_generator yields one sample per step, so this
# steps_per_epoch covers shape[0]//256 samples per epoch, not the whole
# training set - confirm the intended epoch size.
model.fit(model_generator(),
          epochs=20,
          steps_per_epoch=training_decoder_input.shape[0]//256,
          verbose=1)

model.save('model')

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 200)      4400200     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 200)      4400200     input_2[0][0]                    
_______________________________________________________________________________________