In [1]:
import tensorflow as tf
import numpy as np
import nltk  # For tokenize
from tqdm import tqdm  # Progress bar
import pickle  # Saving the data
import os
import random
from opensubsdata import OpensubsData

### init

In [2]:
model = None

corpus = 'opensubs'
corpus_dir = '/Users/fatu/dev/github/fatu/chatbot-tutorial/data/opensubs'

word2id = {}
id2word = {}
idCount = {}
embedding_size = 64
softmaxSamples = 0

pad_token = -1  # Padding
go_token = -1  # Start of sequence
eos_token = -1  # End of sequence
unknown_token = -1  # Word dropped from vocabulary

training_samples = []


vocabulary_size = len(word2id)


learning_rate = 0.002

full_sample_path = '/Users/fatu/dev/github/fatu/chatbot-tutorial/data/samples/dataset-opensubs.pkl'
summary_name = '/Users/fatu/dev/github/fatu/chatbot-tutorial/model'

In [3]:
def getWordId(word, create=True):
    """Get the id of the word (and add it to the dictionary if not existing). If the word does not exist and
    create is set to False, the function will return the unknownToken value
    Args:
        word (str): word to add
        create (Bool): if True and the word does not exist already, the world will be added
    Return:
        int: the id of the word created
    """
    # Should we Keep only words with more than one occurrence ?

    word = word.lower()  # Ignore case

    # At inference, we simply look up for the word
    if not create:
        wordId = word2id.get(word, unknown_token)
    # Get the id if the word already exist
    elif word in word2id:
        wordId = word2id[word]
        idCount[wordId] += 1
    # If not, we create a new entry
    else:
        wordId = len(word2id)
        word2id[word] = wordId
        id2word[wordId] = word
        idCount[wordId] = 1

    return wordId

def extractText(line):
    """Extract the words from a sample lines
    Args:
        line (str): a line containing the text to extract
    Return:
        list<list<int>>: the list of sentences of word ids of the sentence
    """
    sentences = []  # List[List[str]]

    # Extract sentences
    sentencesToken = nltk.sent_tokenize(line)

    # We add sentence by sentence until we reach the maximum length
    for i in range(len(sentencesToken)):
        tokens = nltk.word_tokenize(sentencesToken[i])

        tempWords = []
        for token in tokens:
            tempWords.append(getWordId(token))  # Create the vocabulary and the training sentences

        sentences.append(tempWords)

    return sentences

def saveDataset(filename):
    """Save samples to file
    Args:
        filename (str): pickle filename
    """

    with open(os.path.join(filename), 'wb') as handle:
        data = {  # Warning: If adding something here, also modifying loadDataset
            'word2id': word2id,
            'id2word': id2word,
            'idCount': idCount,
            'training_samples': training_samples
        }
        pickle.dump(data, handle, -1)  # Using the highest protocol available

def loadDataset(filename):
    """Load samples from file
    Args:
        filename (str): pickle filename
    """
    dataset_path = os.path.join(filename)
    print('Loading dataset from {}'.format(dataset_path))
    with open(dataset_path, 'rb') as handle:
        data = pickle.load(handle)  # Warning: If adding something here, also modifying saveDataset
        word2id = data['word2id']
        id2word = data['id2word']
        idCount = data.get('idCount', None)
        training_samples = data['training_samples']

        padToken = word2id['<pad>']
        goToken = word2id['<go>']
        eosToken = word2id['<eos>']
        unknownToken = word2id['<unknown>']  # Restore special words
    return word2id, id2word, idCount, training_samples, padToken, goToken, eosToken, unknownToken
        
def tqdm_wrap(iterable, *args, **kwargs):
    """Forward an iterable eventually wrapped around a tqdm decorator
    The iterable is only wrapped if the iterable contains enough elements
    Args:
        iterable (list): An iterable object which define the __len__ method
        *args, **kwargs: the tqdm parameters
    Return:
        iter: The iterable eventually decorated
    """
    if len(iterable) > 100:
        return tqdm(iterable, *args, **kwargs)
    return iterable

### Load  corpus data

In [54]:
print('Constructing dataset...')
optional = ''

datasetExist = os.path.isfile(full_sample_path)
if not datasetExist:
    # Corpus creation
    corpusData = OpensubsData(corpus_dir)
    # createFullCorpus(corpusData.getConversations())
    padToken = getWordId('<pad>')  # Padding (Warning: first things to add > id=0 !!)
    goToken = getWordId('<go>')  # Start of sequence
    eosToken = getWordId('<eos>')  # End of sequence
    unknownToken = getWordId('<unknown>')  # Word dropped from vocabulary

    # Preprocessing data

    for conversation in tqdm(corpusData.getConversations(), desc='Extract conversations'):
        """Extract the sample lines from the conversations
        Args:
            conversation (Obj): a conversation object containing the lines to extract
        """
        step = 1
        # Iterate over all the lines of the conversation
        for i in tqdm_wrap(
            range(0, len(conversation['lines']) - 1, step),  # We ignore the last line (no answer for it)
            desc='Conversation',
            leave=False
        ):
            inputLine  = conversation['lines'][i]
            targetLine = conversation['lines'][i+1]

            inputWords  = extractText(inputLine['text'])
            targetWords = extractText(targetLine['text'])

            if inputWords and targetWords:  # Filter wrong samples (if one of the list is empty)
                training_samples.append([inputWords, targetWords])
    
    print('Filtering words (vocabSize = {} and wordCount > {})...'.format(
    vocabulary_size,
    1 #filterVocab
    ))
    
    saveDataset(full_sample_path)
else:
    word2id, id2word, idCount, training_samples, pad_token, go_token, eos_token, unknown_token = loadDataset(full_sample_path)

print('Loaded {}: {} words, {} QA'.format(corpus, len(word2id), len(training_samples)))


# filterFromFull()  # Extract the sub vocabulary for the given maxLength and filterVocab

Constructing dataset...
Loading dataset from /Users/fatu/dev/github/fatu/chatbot-tutorial/data/samples/dataset-opensubs.pkl
Loaded opensubs: 115763 words, 1618483 QA


### Prepare the model

In [82]:
# with tf.device(self.getDevice()):
#     model = Model(self.args, self.textData)
print(len(training_samples))

1618483


### Create the computational graph

In [83]:
tf.reset_default_graph()

### create RNN cell

In [84]:
class ProjectionOp:
    """ Single layer perceptron
    Project input tensor on the output dimension
    """
    def __init__(self, shape, scope=None, dtype=None):
        """
        Args:
            shape: a tuple (input dim, output dim)
            scope (str): encapsulate variables
            dtype: the weights type
        """
        assert len(shape) == 2

        self.scope = scope

        # Projection on the keyboard
        with tf.variable_scope('weights_' + self.scope):
            self.W_t = tf.get_variable(
                'weights',
                shape,
                # initializer=tf.truncated_normal_initializer()  # TODO: Tune value (fct of input size: 1/sqrt(input_dim))
                dtype=dtype
            )
            self.b = tf.get_variable(
                'bias',
                shape[0],
                initializer=tf.constant_initializer(),
                dtype=dtype
            )
            self.W = tf.transpose(self.W_t)

    def getWeights(self):
        """ Convenience method for some tf arguments
        """
        return self.W, self.b

    def __call__(self, X):
        """ Project the output of the decoder into the vocabulary space
        Args:
            X (tf.Tensor): input value
        """
        with tf.name_scope(self.scope):
            return tf.matmul(X, self.W) + self.b

In [85]:
units = 512
numLayers = 2
dropout = 0.9
max_length_Deco = 12
max_length_enco = 10

# Placeholders
encoder_inputs  = None
decoder_inputs  = None  # Same that decoderTarget plus the <go>
decoder_targets = None
decoder_weights = None  # Adjust the learning to the target sentence size


In [86]:
#from tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl import MultiRNNCell
def create_rnn_cell():
    enco_deco_cell = tf.contrib.rnn.BasicLSTMCell(units)
    enco_deco_cell = tf.contrib.rnn.DropoutWrapper(
        enco_deco_cell,
        input_keep_prob=1.0,
        output_keep_prob=dropout
    )
    return enco_deco_cell
enco_deco_cell = tf.contrib.rnn.MultiRNNCell([create_rnn_cell() for _ in range(numLayers)])

In [87]:
with tf.name_scope('placeholder_encoder'):
    encoder_inputs  = [tf.placeholder(tf.int32,   [None, ]) for _ in range(max_length_enco)]

with tf.name_scope('placeholder_decoder'):
    decoder_inputs  = [tf.placeholder(tf.int32,   [None, ], name='inputs') for _ in range(max_length_Deco)]
    decoder_targets = [tf.placeholder(tf.int32,   [None, ], name='targets') for _ in range(max_length_Deco)]
    decoder_weights = [tf.placeholder(tf.float32, [None, ], name='weights') for _ in range(max_length_Deco)]

### Define the network

In [88]:
#tf.reset_default_graph()
#if 0 < self.args.softmaxSamples < vocabulary_size:
dtype = tf.float32

outputProjection = ProjectionOp(
    (vocabulary_size, units), scope='softmax_projection', dtype=dtype
)
###
print(encoder_inputs)
print(embedding_size)
print(enco_deco_cell)
print(enco_deco_cell.state_size)
###

decoderOutputs, states = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
            encoder_inputs,
            decoder_inputs,
            enco_deco_cell,
            40004, #len(word2id),
            40004, #len(word2id),
            embedding_size=embedding_size, 
            output_projection=outputProjection.getWeights() if outputProjection else None,
            feed_previous=False
        )

[<tf.Tensor 'placeholder_encoder/Placeholder:0' shape=(?,) dtype=int32>, <tf.Tensor 'placeholder_encoder/Placeholder_1:0' shape=(?,) dtype=int32>, <tf.Tensor 'placeholder_encoder/Placeholder_2:0' shape=(?,) dtype=int32>, <tf.Tensor 'placeholder_encoder/Placeholder_3:0' shape=(?,) dtype=int32>, <tf.Tensor 'placeholder_encoder/Placeholder_4:0' shape=(?,) dtype=int32>, <tf.Tensor 'placeholder_encoder/Placeholder_5:0' shape=(?,) dtype=int32>, <tf.Tensor 'placeholder_encoder/Placeholder_6:0' shape=(?,) dtype=int32>, <tf.Tensor 'placeholder_encoder/Placeholder_7:0' shape=(?,) dtype=int32>, <tf.Tensor 'placeholder_encoder/Placeholder_8:0' shape=(?,) dtype=int32>, <tf.Tensor 'placeholder_encoder/Placeholder_9:0' shape=(?,) dtype=int32>]
64
<tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl.MultiRNNCell object at 0x163670d50>
(LSTMStateTuple(c=512, h=512), LSTMStateTuple(c=512, h=512))


ValueError: Shapes (512, 0) and [None, 40004] are incompatible

### Loss Function ###

In [8]:
loss_function = tf.contrib.legacy_seq2seq.sequence_loss(
                decoderOutputs,
                decoder_targets,
                decoder_weights,
                vocabulary_size,
                softmax_loss_function= sampled_softmax if outputProjection else None  # If None, use default SoftMax
            )
            tf.summary.scalar('loss', loss_function)  # Keep track of the cost

IndentationError: unexpected indent (<ipython-input-8-f32f56a2a3b0>, line 8)

In [9]:
# Initialize the optimizer
opt = tf.train.AdamOptimizer(
    learning_rate=learning_rate,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-08
)
opt_op = opt.minimize(loss_function)

NameError: name 'learning_rate' is not defined

In [None]:
# Saver/summaries
writer = tf.summary.FileWriter(summary_name
saver = tf.train.Saver(max_to_keep=200)

### Load Model