# Unilever encoder decoder RNN model

## Model

<img src="./images/architecture2.png">

In [1]:
# ---- Imports -----
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import pandas as pd, numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device:', device)

device: cpu


## Reading data for the encoder
Get the data of Brand/territory variance matrix by month.
Not all brands are shipped to all territories. Therefore, filtering on a specific brand may return only some of the territories. That is why it is important to get the full list of territories and assign a zero to the ones that are missing.

In [2]:
# Import dataframes from pickle file (saved previously).
# NOTE(review): presumably this helper unpickles the file; unpickling can execute
# arbitrary code, so only load files from a trusted source.
from helper_save_load import load_from_pickle
df_a, df_f, df_v = load_from_pickle("dataframes_Dollars.pickle")
# The cells visible below only use df_v, so the other two frames are released.
del df_a, df_f

### Grouping territories

In [None]:
# List of every territory label present in df_v, in the order produced by groupby.
# The grouping key is passed as a scalar, not a one-element list: pandas has deprecated
# scalar keys in `.groups` for length-1 list groupers (they become 1-tuples), which
# would silently turn every territory into a tuple.
territories = list(df_v.groupby('Territory').groups)
print(territories)
print(len(territories),' territories')

In [4]:
# Single-row template (index label 0) holding 0.0 for every territory column.
empty_df = pd.DataFrame(0.0, index=[0], columns=territories)

### Retrieving variance vector from brand and month
This function retrieves, from the A/F dataset, the variance by territory for a given month and brand. Multi-brand commentaries are not supported; only `Brand_1` is considered in this study. The order of territories is the same as in the `territories` vector. If no data is available, a zero vector is returned.

In [5]:
#return pivot table for the required month in Millions of $
def get_pivot_month_Territory_by_brand(month, brand, flatten=1):
    """Return the variance of one brand for one month, per territory, in millions.

    Reads the module-level ``df_v`` (rows with 'Brand', 'Territory' and one column
    per month) and ``empty_df`` (one 0.0 column per territory).

    Parameters
    ----------
    month : str
        Name of the month column of ``df_v`` to read (e.g. 'Jan_2018').
    brand : str
        Value of the 'Brand' column to select.
    flatten : int, default 1
        If 1, return a 1-D numpy array; otherwise return a one-row DataFrame.

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        One value per column of ``empty_df``, in that column order. Territories
        without data for the brand are 0. If the brand has no rows at all, an
        all-zero result is returned.
    """
    brand_rows = df_v[df_v['Brand'] == brand]

    if brand_rows.empty:
        # No data for this brand: hand back a copy so callers can never mutate
        # the shared template.
        result = empty_df.copy()
    else:
        # Only the requested month is summed; the other columns are never read.
        grouped = brand_rows.groupby(['Brand', 'Territory'])[[month]].sum()
        result = pd.pivot_table(grouped, values=[month], index=['Brand'],
                                columns=['Territory'], aggfunc='sum', fill_value=0) / 1e6
        result.columns = result.columns.droplevel()  # drop month level as there is only one month
        # Align on the full territory list (DataFrame.append no longer exists in
        # pandas >= 2.0); reindex also pins the column order to empty_df's.
        result = result.reindex(columns=empty_df.columns, fill_value=0.0).fillna(0)

    if flatten == 1:
        result = result.values.flatten()
    return result


# Example: flattened variance vector for one brand and one month.
get_pivot_month_Territory_by_brand('Jan_2018', '05-AXE SA Brand', 1)

array([ 2.027350e-03,  1.380900e-03,  0.000000e+00,  0.000000e+00,
       -9.378000e-05,  1.371011e-02, -6.985500e-04, -1.274860e-03,
        0.000000e+00,  0.000000e+00,  2.840290e-03,  1.669580e-03,
       -5.781450e-03,  3.096440e-03, -1.241803e-02,  0.000000e+00,
        1.315100e-04, -8.686940e-03,  0.000000e+00, -5.194400e-04,
        2.118400e-04,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  5.971960e-03,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00, -2.078000e-04])

## Preparing data for the decoder
Get commentaries and dictionary from file

In [None]:
# Import the commentaries dataframe and the vocabulary objects from pickle file (saved previously).
# NOTE(review): load_from_pickle is already imported in an earlier cell; this import is redundant.
from helper_save_load import load_from_pickle
dfc, vocab, word_to_ix, ix_to_word = load_from_pickle("commentaries.pickle")
display(dfc.head(2))
# Quick look-up in both directions: word -> index and index -> word.
print('index of word lcl:', word_to_ix['lcl'])
print('word at index 0:', ix_to_word[0])

In [8]:
# print('Comparing results columns and territories vector ...')
# for index, row in dfc.iterrows():
#     vector = get_pivot_month_Territory_by_brand(row['Month_f'], row['Brand_1'], 0)
# #     display(vector)
#     diff = [i for i, j in zip(vector.columns.tolist(), territories) if i != j]
#     if len(diff) != 0: 
#         print('Differences found !!!!!')
#         print(row['Month_f'], '**', row['Comment_w'], '**', row['Brand_1'])        
#         print(diff)
# print('All columns were parsed, the differences should be shown by the loop if there are any!')

In [None]:
# Replace NoComment with StartOfSentence + EndOfSentence.
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# selection dfc['Comment_w'] mutates an intermediate object and is not guaranteed to
# update dfc itself (it does not under pandas Copy-on-Write).
dfc['Comment_w'] = dfc['Comment_w'].replace('[NOC]', '[SOS] [EOS]')
commentaries = dfc['Comment_w']
commentaries[0:5]

In [279]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

### Tokenizing words (RNN2 inputs and outputs)

In [287]:
# fit a tokenizer
def create_tokenizer(lines):
    """Return a Keras ``Tokenizer`` fitted on ``lines``.

    ``filters=''`` turns off the tokenizer's character filtering, so tokens
    keep their punctuation (e.g. the brackets of '[SOS]' / '[EOS]').
    """
    fitted = Tokenizer(filters='')
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest number of whitespace-separated words in any line.

    Returns 0 when ``lines`` is empty instead of raising ``ValueError``.
    """
    return max((len(line.split()) for line in lines), default=0)

In [None]:
# prepare tokenizer for commentaries
# Fit the tokenizer on all commentaries, then display its word -> index mapping.
tokenizer = create_tokenizer(commentaries)
tokenizer.word_index

In [289]:
# Calculate vocabulary size: one more than the number of known words, leaving
# index 0 free (0 is used as the padding value when sequences are padded below).
vocab_size = len(tokenizer.word_index) + 1
# Calculate maximum length (in words) of the commentaries
com_length = max_length(commentaries)

print('Vocabulary size (vocab_size):', vocab_size)
print('Max length of commentary (com_length):', com_length)
print('Number of commentaries :', len(commentaries))

Vocabulary size (vocab_size): 678
Max length of commentary (com_length): 127
Number of commentaries : 1093


In [383]:
# Keep only non-empty commentaries (drops the bare '[SOS] [EOS]' placeholder).
# NOTE(review): after this line `commentaries` is a plain list whose positions no longer
# correspond to the rows of dfc; per-row data built from dfc must apply the same filter.
# vocab_size and com_length above were computed before this filtering.
commentaries = [c for c in commentaries if c != '[SOS] [EOS]']

In [384]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` with ``tokenizer`` and pad them to ``length``.

    Padding uses 0 values and is applied at the end of each sequence
    (``padding='post'``).
    """
    return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length, padding='post')

#Tokenizing all comments



trainX = encode_sequences(tokenizer, com_length, commentaries)
print('trainX=\n', trainX)

# Shift tokenized words by 1 so that trainY[:, t] is the word that follows trainX[:, t]
# (next-word target for RNN2). The whole tail trainX[:, 1:] is copied: the previous
# slice bounds stopped one column early and dropped the final token of any commentary
# that fills all com_length positions. The last column of trainY stays 0 (padding).
trainY = np.zeros((len(commentaries), com_length), dtype='int')
trainY[:, :-1] = trainX[:, 1:]
print('\ntrainY=\n', trainY)

trainX=
 [[  1  18  19 ...   0   0   0]
 [  1  18  19 ...   0   0   0]
 [  1 130  68 ...   0   0   0]
 ...
 [  1   4  38 ...   0   0   0]
 [  1   3  71 ...   0   0   0]
 [  1   3  71 ...   0   0   0]]

trainY=
 [[ 18  19 388 ...   0   0   0]
 [ 18  19   8 ...   0   0   0]
 [130  68   2 ...   0   0   0]
 ...
 [  4  38  22 ...   0   0   0]
 [  3  71   6 ...   0   0   0]
 [  3  71   6 ...   0   0   0]]


### One-hot encoding RNN2 outputs

In [385]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every token of a 2-D array of token sequences.

    Each row of ``sequences`` is passed to ``to_categorical`` with
    ``num_classes=vocab_size``; the stacked result is reshaped to
    (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot = array([to_categorical(tokens, num_classes=vocab_size) for tokens in sequences])
    n_sequences, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_sequences, n_steps, vocab_size)

# NOTE(review): trainY is overwritten with its own one-hot encoding, so this cell is not
# idempotent: re-running it without first rebuilding the integer trainY would encode
# the already one-hot array again.
trainY = encode_output(trainY, vocab_size)
trainY

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [386]:
# One-hot encode the inputs into trainX_oh.
# NOTE(review): trainX_oh is not referenced by any other cell visible in this notebook.
trainX_oh = encode_output(trainX, vocab_size)
trainX_oh

array([[[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

### Preparing training data for encoder

In [None]:
# Build one variance vector per non-empty commentary.
# `commentaries` / `trainX` only hold the non-empty commentaries, so dfc is filtered with
# the same predicate before iterating. Zipping against the unfiltered dfc would pair
# every commentary with one of the first len(commentaries) rows of dfc instead of its own row.
dfc_nonempty = dfc[dfc['Comment_w'] != '[SOS] [EOS]']
assert len(dfc_nonempty) == len(commentaries), 'dfc rows and commentaries are out of sync'

var_trainX = []
for i, (index, row) in enumerate(dfc_nonempty.iterrows()):
    vector = get_pivot_month_Territory_by_brand(row['Month_f'], row['Brand_1'], 0)
    var_trainX.append(vector.values.tolist()[0])   # single row -> flat list of territory values
    if i<5:
        # Show the first few samples to check the pairing by eye.
        print(i, '**', index, '**', row['Month_f'], '**', row['Comment_w'], '**', row['Brand_1'])  
        print(trainX[i])
        display(vector)  

var_trainX = np.asarray(var_trainX)
print(var_trainX)

## 2. Decoder (RNN2)

The decoder receives the variance vector concatenated with the embedding vector of the current word, and is trained to predict the next word of the commentary of month i related to brand k.

**It also makes sense to classify the commentaries into classes, such as: over delivery, driven by territory, orders phased, ...**

<img src="./images/decoder-arch.png">

In [141]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.models import Model
from keras.layers import LSTM, GRU
from keras.layers import Input
from keras.layers import concatenate
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [387]:
# Embedding size
embed_size = 200
# Number of units of the LSTM layers
hidden_size = 256
# Number of words in vocabulary (input and output share the same vocabulary)
src_vocab = vocab_size
tar_vocab = src_vocab
# Max length of input/output sentence
src_timesteps = com_length #max(len(line.split()) for line in dfc['Comment_w'])
tar_timesteps = src_timesteps
# Length of variance vector (one entry per territory column of empty_df)
varv_length = len(empty_df.columns)
# Number of commentaries
num_comments = len(commentaries)

# Overview of the parameters calculated from dataset
print('Embedding size: embed_size =', embed_size)
print('Size of LSTM: hidden_size =', hidden_size)
print('Commentaries vocabulary length: src_vocab =', src_vocab)
print('Commentaries length (output): src_timesteps =', src_timesteps)
print('Variance vector length: varv_length =', varv_length)
print('Number of commentaries: num_comments =', num_comments)

Embedding size: embed_size = 200
Size of LSTM: hidden_size = 256
Commentaries vocabulary length: src_vocab = 678
Commentaries length (output): src_timesteps = 127
Variance vector length: varv_length = 34
Number of commentaries: num_comments = 275


encoder_input_data = (comment num, variance pos, variance value) => dimension (number of comments, variance vector length, 1)

decoder_input_data = (comment num, word pos, word token) => dimension (number of comments, comment length, 1)

decoder_target_data = (comment num, word pos, one-hot encoded word vector) => dimension (number of comments, comment length, vocabulary size); words are shifted by 1

In [226]:
# Encoder: reads the variance vector as a sequence and summarises it in its LSTM states.
encoder_inputs = Input(shape=(None, 1))   #we feed encoder with one variance by timestep
encoder = LSTM(hidden_size, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, 1))  #one feature per timestep (the cells below feed the integer token, not a one-hot vector)
# Embedding of the tokens is currently disabled:
# comment_Embedding = Embedding(src_vocab, embed_size, input_length=1, mask_zero=True)(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(hidden_size, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Per-timestep probability distribution over the vocabulary.
decoder_dense = Dense(src_vocab, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile only; the training itself is run in a later cell.
model.compile(optimizer='adam', loss='categorical_crossentropy')      #rmsprop
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           (None, None, 1)      0                                            
__________________________________________________________________________________________________
input_24 (InputLayer)           (None, None, 1)      0                                            
__________________________________________________________________________________________________
lstm_15 (LSTM)                  [(None, 256), (None, 264192      input_23[0][0]                   
__________________________________________________________________________________________________
lstm_16 (LSTM)                  [(None, None, 256),  264192      input_24[0][0]                   
                                                                 lstm_15[0][1]                    
          

#### Reshaping input/output vectors for LSTM

inputs are 3-dim with the following format: (samples, time steps, features)
- **Samples**. One sequence is one sample. A batch is comprised of one or more samples.
- **Time Steps**. One time step is one point of observation in the sample.
- **Features**. One feature is one observation at a time step.



**RNN1** :
- **samples**: number of comments: `num_comments` 
- **time steps**: number of territories: `varv_length`
- **Features**: one element per territory: `1`

In [301]:
# (samples, time steps, features) = (num_comments, varv_length, 1)
var_trainXr = var_trainX.reshape(num_comments, varv_length, 1)
# var_trainXr[0]

**RNN2** :
- **samples**: number of comments: `num_comments` 
- **time steps**: max length of a commentary: `com_length`
- **Features**: token of each time step: `1`

In [299]:
# (samples, time steps, features) = (num_comments, com_length, 1): one integer token per time step
trainXr = trainX.reshape(num_comments, com_length, 1)
# print(trainXr[0:2])

In [258]:
batch_size = 40
epochs = 10

# Keep only the weights of the epoch with the lowest validation loss.
filename = 'unilever.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# 20% of the samples are held out for validation.
# NOTE(review): `model` must be the encoder-decoder model here; a later cell rebinds the
# name `model` to a different network, so this cell depends on execution order.
model.fit([var_trainXr, trainXr], trainY, batch_size=batch_size, epochs=epochs, validation_split=0.2, 
          callbacks=[checkpoint], verbose=1)


Train on 874 samples, validate on 219 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.23075, saving model to unilever.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.23075 to 0.23025, saving model to unilever.h5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.23025
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.23025
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.23025
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.23025
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.23025
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.23025
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.23025
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.23025


<keras.callbacks.History at 0x16a7a3b30f0>

## Sampling model

1) Encode input and retrieve initial decoder state

2) Run one step of decoder with this initial state and a "start of sequence" token as target. Output will be the next target token.

3) Repeat with the current target token and current states

In [259]:
# Define sampling models (they reuse the trained layers, so no further training is needed)
# Encoder model: variance sequence -> [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)   #(input,output)

# Decoder model: (current token, previous states) -> (word probabilities, new states)
decoder_state_input_h = Input(shape=(hidden_size,))
decoder_state_input_c = Input(shape=(hidden_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [260]:
# encoder_model.predict(input_seq)

In [261]:
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_23 (InputLayer)        (None, None, 1)           0         
_________________________________________________________________
lstm_15 (LSTM)               [(None, 256), (None, 256) 264192    
Total params: 264,192
Trainable params: 264,192
Non-trainable params: 0
_________________________________________________________________


In [262]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_24 (InputLayer)           (None, None, 1)      0                                            
__________________________________________________________________________________________________
input_27 (InputLayer)           (None, 256)          0                                            
__________________________________________________________________________________________________
input_28 (InputLayer)           (None, 256)          0                                            
__________________________________________________________________________________________________
lstm_16 (LSTM)                  [(None, None, 256),  264192      input_24[0][0]                   
                                                                 input_27[0][0]                   
          

In [263]:
# Map token index -> word by inverting tokenizer.word_index explicitly, instead of
# assuming that the dictionary's iteration order matches the index order.
reverse_input_word_index = {index: word for word, index in tokenizer.word_index.items()}
# Index 0 is the padding value; it decodes to an empty string.
reverse_input_word_index[0] = ''

In [265]:
for c in range(2,3):
    print('c=',c)

    input_seq = var_trainXr[c:c+1]    #To have 3 dim. One variance vector
    print('Original comment: ', commentaries[c])
    comment = commentaries[c].split()

    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence.
    target_seq = np.zeros((1, 1, 1))
    # Populate the first token of the target sequence with the start token.
    target_seq[0, 0, 0] = tokenizer.word_index['[sos]']
    print('target_seq =', target_seq)

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    # Position of the next ground-truth word to feed. The start token has just been
    # fed, so it is skipped when the commentary begins with it.
    i = 1 if comment and comment[0].lower() == '[sos]' else 0
    decoded_sentence = []

    while not stop_condition:
        # The new states get their own names so the loop variable `c` is not overwritten.
        output_tokens, state_h, state_c = decoder_model.predict([target_seq] + states_value)

        # Sample a token (greedy: most probable word)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        sampled_word = reverse_input_word_index[sampled_token_index]
        decoded_sentence.append(sampled_word)

        # Exit condition: either hit max length
        # or find stop token.
        if (sampled_word == '[eos]' or len(decoded_sentence) > src_timesteps):
            stop_condition = True

        # Update the target sequence (of length 1): feed the next word of the original
        # commentary while there is one, then fall back to the sampled word.
        target_seq = np.zeros((1, 1, 1))
        if i<len(comment):
            target_seq[0, 0, 0] = tokenizer.word_index[comment[i].lower()]
            i = i + 1
        else:
            target_seq[0, 0, 0] = sampled_token_index

        # Update states
        states_value = [state_h, state_c]

    print('Result =', ' '.join(decoded_sentence))


c= 2
Original comment:  [NOC]
target_seq = [[[2.]]]
Result =                                                                                                                                


In [None]:
# NOTE(review): `encoder_input_data`, `decode_sequence` and `input_texts` are not defined
# in any cell visible in this notebook, so this cell raises NameError as it stands.
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

## Model without encoder +embedding (without variance inputs)

In [398]:
##################  Second version without variance vector, only training on commentaries with RNN2 #################
# NOTE(review): this cell rebinds embed_size, decoder_inputs, decoder_lstm, decoder_dense
# and model, replacing the encoder-decoder objects defined earlier.
embed_size = 100
# Decoder input: a sequence of integer tokens (no encoder states are used in this version).
decoder_inputs = Input(shape=(None,))    #src_timesteps #we feed the decoder with tokenized word

# Token -> dense vector; mask_zero=True makes index 0 (padding) a masked value.
word_Embedding = Embedding(src_vocab, embed_size,  mask_zero=True)  #input_length=src_timesteps,
embded_out = word_Embedding(decoder_inputs)

# The LSTM returns the full output sequence (one output per timestep);
# its internal states are not returned in this version.
decoder_lstm = LSTM(hidden_size, return_sequences=True)      #, return_sequences=True, return_state=True)
decoder_outputs = decoder_lstm(embded_out)  #.reshape(-1,embed_size)     #(decoder_inputs)

# Per-timestep probability distribution over the vocabulary.
decoder_dense = Dense(src_vocab, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that maps the token sequence to the next-word distribution
# at every timestep.
model = Model(decoder_inputs, decoder_outputs)

# Compile only; the training itself is run in the next cell.
model.compile(optimizer='adam', loss='categorical_crossentropy')      #rmsprop
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_51 (InputLayer)        (None, None)              0         
_________________________________________________________________
embedding_21 (Embedding)     (None, None, 100)         67800     
_________________________________________________________________
lstm_33 (LSTM)               (None, None, 256)         365568    
_________________________________________________________________
dense_21 (Dense)             (None, None, 678)         174246    
Total params: 607,614
Trainable params: 607,614
Non-trainable params: 0
_________________________________________________________________
None


In [409]:
################## Training model without variance #################################
batch_size = 20
epochs = 10

# NOTE(review): `filename` is only used by the checkpoint callback, which is disabled below.
filename = 'unilever_WVOH.h5'
# checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Inputs are the integer token sequences, targets the one-hot shifted sequences;
# 10% of the samples are held out for validation.
model.fit(trainX, trainY, batch_size=batch_size, epochs=epochs, validation_split=0.1, 
           verbose=1)    #callbacks=[checkpoint],

Train on 247 samples, validate on 28 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x16a1bacd748>

In [410]:
############## Sampling model without encoder + Embedding #################
# Chains the same embedding, LSTM and dense layers again on the same input.
# No decoder state is exposed as an input or output of this model.
decoder_outputs = word_Embedding(decoder_inputs)
decoder_outputs = decoder_lstm(decoder_outputs)
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(decoder_inputs, decoder_outputs)

In [None]:
######### Sampling with model without encoder + Embedding ########
topn = 5   # number of most probable next words reported at each step

for c in range(1,50):
    comment = commentaries[c].split()
    # At least three tokens are needed because comment[2] is used as the seed word.
    if len(comment) < 3 or comment[1] == '[EOS]': continue

    print('\nc=',c)
    print('Original comment: ', commentaries[c])

    # Seed the generation with the third token of the commentary.
    seed_word = comment[2].lower()
    fed_tokens = [tokenizer.word_index[seed_word]]
    print('seed word:', seed_word)

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    i = 3   # position of the next original word to feed (comment[2] was the seed)
    decoded_sentence = []

    while not stop_condition:
        # decoder_model has no state inputs, so the whole prefix is fed at every
        # step and only the prediction for the last position is read.
        target_seq = np.array([fed_tokens], dtype='int32')
        output_tokens = decoder_model.predict(target_seq)
        next_word_probs = output_tokens[0, -1, :]
        print('max probability =',np.max(next_word_probs))

        # topn most probable next words with their probabilities (softmax outputs
        # are already probabilities, so no exponentiation is needed).
        top_indices = np.argsort(next_word_probs)[::-1][:topn]
        top_words = [(reverse_input_word_index[int(ix)], float(next_word_probs[ix])) for ix in top_indices]
        print('top words =', top_words)

        # Sample a token (greedy: most probable word)
        sampled_token_index = int(np.argmax(next_word_probs))
        print('token of max =', sampled_token_index)

        sampled_word = reverse_input_word_index[sampled_token_index]
        decoded_sentence.append(sampled_word)

        # Exit condition: either hit max length
        # or find stop token.
        if (sampled_word == '[eos]' or len(decoded_sentence) > src_timesteps):
            stop_condition = True

        # Extend the prefix: feed the next word of the original commentary while
        # there is one, then fall back to the sampled word.
        if i<len(comment):
            fed_tokens.append(tokenizer.word_index[comment[i].lower()])
            i = i + 1
        else:
            fed_tokens.append(sampled_token_index)

    print('Result =', ' '.join(decoded_sentence))


In [79]:
# #inputs
# variance_input = Input(shape=(varv_length,), name='variance_input')
# word_input = Input(shape=(src_timesteps,), name='word_input')

# #Defining embedding layer: keras.layers.Embedding(input_dim, output_dim)
# comment_Embedding = Embedding(src_vocab, embed_size, input_length=src_timesteps, mask_zero=True)(word_input)

# #merge word embeddings and variance vector
# # merged = concatenate([variance_rep, comment_Embedding])

# #Creating dense layer for LSM initialization by variance input
# dense_var = Dense(hidden_size, activation='relu')(variance_input)

# #Defining LSTM
# # decoder = LSTM(units=hidden_size, return_sequences=True, initial_state=dense_var)(comment_Embedding)

# decoder_lstm = LSTM(hidden_size)
# decoder_outputs = decoder_lstm(comment_Embedding)   #, initial_state=dense_var

# decoder_dense = Dense(src_vocab, activation='softmax')(decoder_outputs)

# model = Model(inputs=[word_input], outputs=[decoder_dense])

# print(model.summary())

In [81]:
# #variance inputs
# variance_input = Input(shape=(None, varv_length), name='variance_input')
# word_input = Input(shape=(None, src_vocab), name='word_input')

# #Defining embedding layer: keras.layers.Embedding(input_dim, output_dim)
# comment_Embedding = Embedding(src_vocab, embed_size, input_length=src_timesteps, mask_zero=True)(word_input)

# #merge word embeddings and variance vector
# merged = Concatenate([variance_inputs, comment_Embedding])

# decoder = LSTM(units=hidden_size, input_shape=(varv_length + embed_size,), return_sequences=True)

# # encoder_outputs, state_h, state_c = decoder(merged)


# model = Sequential()
# model.add(merged)
# model.add(decoder)
# model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
# model.compile(optimizer='adam', loss='categorical_crossentropy')

# model = Model(inputs=[first_input, second_input, third_input], outputs=merge_two)

# print(model.summary())

In [82]:
# # define NMT model
# def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
#     model = Sequential()
#     #Defining embedding layer: keras.layers.Embedding(input_dim, output_dim)
#     model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
#     # Defining Encoder LSTM
#     model.add(LSTM(n_units))    
#     #Defining Decoder LSTM    
#     model.add(RepeatVector(tar_timesteps))
#     model.add(LSTM(n_units, return_sequences=True))   
#     model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
#     return model

# model = define_model(fre_vocab_size, eng_vocab_size, fre_length, eng_length, 256)




#     first = Sequential()
#     first.add(Dense(1, input_doi=(2,), activation='sigmoid'))

#     second = Sequential()
#     second.add(Dense(1, input_shape=(1,), activation='sigmoid'))

#     third = Sequential()
#     # of course you must provide the input to result with will be your x3
#     third.add(Dense(1, input_shape=(1,), activation='sigmoid'))
    
    
#     merged = Concatenate([first, second])
    


# def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, hidden_size):
#     model = Sequential()
#     model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
#     model.add(LSTM(hidden_size, return_sequences=True))
#     model.add(LSTM(hidden_size, return_sequences=True))
#     if use_dropout:
#         model.add(Dropout(0.5))
#     model.add(TimeDistributed(Dense(vocabulary)))
#     model.add(Activation('softmax'))


In [83]:
# # define model
# model = define_model(fre_vocab_size, eng_vocab_size, fre_length, eng_length, 256)
# model.compile(optimizer='adam', loss='categorical_crossentropy')

# # summarize defined model
# print(model.summary())

In [78]:
#Old pytorch code
#----------------


# class DecoderRNN(nn.Module):
#     def __init__(self, hidden_size, output_size):
#         super(DecoderRNN, self).__init__()
#         self.hidden_size = hidden_size

#         self.embedding = nn.Embedding(output_size, hidden_size)  #input and input sizes are identical
#         self.gru = nn.GRU(hidden_size, hidden_size)
#         self.out = nn.Linear(hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, input, hidden):
#         output = self.embedding(input).view(1, 1, -1)
#         output = F.relu(output)
#         output, hidden = self.gru(output, hidden)
#         output = self.softmax(self.out(output[0]))
#         return output, hidden

#     def initHidden(self):
#         return torch.zeros(1, 1, self.hidden_size, device=device)

In [81]:
# NOTE(review): the only DecoderRNN definition visible in this notebook is commented out
# in the previous cell, so on a fresh kernel this line raises NameError.
decoder = DecoderRNN(300, 1)
print(decoder)

DecoderRNN(
  (embedding): Embedding(1, 300)
  (gru): GRU(300, 300)
  (out): Linear(in_features=300, out_features=1, bias=True)
  (softmax): LogSoftmax()
)


## Global Architecture 

In [90]:
# Define component sizes
CONTEXT_SIZE_1 = 300
CONTEXT_SIZE_2 = 300
EMBEDDING_DIM = 30


# NOTE(review): EncoderRNN and VAR_MONTH_DATA_SIZE are not defined in any visible cell,
# and DecoderRNN only exists as commented-out code; this cell cannot run as it stands.
encoder = EncoderRNN(VAR_MONTH_DATA_SIZE, CONTEXT_SIZE_1)
decoder = DecoderRNN(CONTEXT_SIZE_2, 1)

# One plain SGD optimizer per network, sharing the same learning rate.
learning_rate=0.01
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

criterion = nn.NLLLoss()



In [91]:
# Prepare one training iteration: initial encoder hidden state and cleared gradients.
# NOTE(review): relies on `encoder` and both optimizers created in the previous cell.
encoder_hidden = encoder.initHidden()
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()


In [None]:
# Training loop: re-run this cell as many times as needed to train further.
# NOTE(review): this cell is a verbatim copy of the training cell that follows
# the NGramLanguageModeler definition, and it appears *before* the cell that
# creates `model`, `loss_function`, `optimizer` and `losses`. On a fresh kernel
# run top-to-bottom it would hit undefined names -- confirm whether this copy
# should be kept. `trigrams` and `word_to_ix` are also defined outside this cell.
for epoch in range(60):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    # Keep one summed-loss value per epoch so progress can be inspected later.
    losses.append(total_loss)
    print('epoch %d : Total loss=%.3f' % (epoch, total_loss))
#print(losses)  # The loss decreased every iteration over the training data!

In [None]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The context word indices are embedded, the embeddings are concatenated
    into a single row vector, passed through one ReLU hidden layer and
    projected to log-probabilities over the vocabulary.

    Args:
        vocab_size: number of rows of the embedding matrix and size of the
            output distribution.
        embedding_dim: width of each word embedding.
        context_size: number of context word indices fed to ``forward``.
        hidden_dim: width of the hidden layer (defaults to 128, the value
            that was previously hard-coded).
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        super(NGramLanguageModeler, self).__init__()
        # Embedding matrix: each row is the embedding of one word.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Concatenated context embeddings -> hidden layer.
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        # Hidden layer -> one score per vocabulary entry.
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of shape (1, vocab_size) for one context.

        ``inputs`` holds the context word indices; their embeddings are
        flattened into a single row, so exactly one context is processed
        per call.
        """
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


# Per-epoch total losses are appended to this list by the training loop.
losses = []
# Negative log-likelihood loss; pairs with the log_softmax output of the model.
loss_function = nn.NLLLoss()
# NOTE(review): `vocab` and `CONTEXT_SIZE` are not defined in this part of the
# notebook (the "Global Architecture" cell only defines CONTEXT_SIZE_1 and
# CONTEXT_SIZE_2) -- confirm they exist before running on a fresh kernel.
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

optimizer = optim.SGD(model.parameters(), lr=0.01)   #before 0.001

In [None]:
# Training loop: re-run this cell as many times as needed to train further.
# Each run performs 60 more epochs on the current `model` and appends 60 more
# entries to `losses`. `trigrams` and `word_to_ix` are defined outside this cell.
for epoch in range(60):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    # Keep one summed-loss value per epoch so progress can be inspected later.
    losses.append(total_loss)
    print('epoch %d : Total loss=%.3f' % (epoch, total_loss))
#print(losses)  # The loss decreased every iteration over the training data!

In [75]:
import time
import math

def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'.

    Both parts are rendered with ``%d``, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a job started at ``since``.

    ``since`` is a ``time.time()`` timestamp and ``percent`` the fraction of
    the work already done; the total duration is extrapolated linearly as
    elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    The input is fed to the encoder one element at a time; the final encoder
    hidden state seeds the decoder. With probability ``teacher_forcing_ratio``
    the decoder is fed the ground-truth target at every step, otherwise it is
    fed its own top-1 prediction and decoding stops once ``EOS_token`` is
    predicted. The decoder is called with (input, hidden, encoder_outputs) and
    must return three values.

    Returns the accumulated loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # One row per input position; rows beyond input_length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Decide once per call whether the whole target sequence is teacher-forced.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
        else:
            # Without teacher forcing: use its own prediction as the next input
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train encoder and decoder for ``n_iters`` randomly drawn pairs.

    Fresh SGD optimizers and an NLL criterion are created, ``n_iters`` pairs
    are sampled up front from ``pairs``, and ``train`` is called once per pair.
    The average loss is printed every ``print_every`` iterations and recorded
    every ``plot_every`` iterations; the recorded averages are passed to
    ``showPlot`` at the end.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    # Steps are numbered from 1 so the modulo checks fire on full windows.
    for step, training_pair in enumerate(training_pairs, 1):
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

Plotting results
----------------

Plotting is done with matplotlib, using the array of loss values
``plot_losses`` saved while training.


In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot ``points`` as a line on a new figure with y ticks every 0.2.

    Args:
        points: sequence of values to plot (``trainIters`` passes its
            averaged losses).
    """
    # plt.subplots() already creates the figure; a separate plt.figure() call
    # beforehand would leave an extra, empty figure open on every invocation.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedy-decode ``sentence`` and return (decoded words, attention rows).

    The sentence is converted with ``tensorFromSentence(input_lang, ...)`` and
    run through the encoder; the decoder then produces at most ``max_length``
    tokens, always taking the top-1 prediction. Decoding stops after
    ``EOS_token``, which is reported as '<EOS>'. The returned attention tensor
    holds one row per decoding step that was executed.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        n_inputs = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        for pos in range(n_inputs):
            step_output, encoder_hidden = encoder(input_tensor[pos], encoder_hidden)
            encoder_outputs[pos] += step_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            predicted = topi.item()

            if predicted == EOS_token:
                decoded_words.append('<EOS>')
                break

            decoded_words.append(output_lang.index2word[predicted])
            # Feed the prediction back in as the next decoder input.
            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly chosen pairs next to the model's decoded output.

    For each sample the first element of the pair is printed after '>', the
    second after '=', and the decoded words (joined by spaces) after '<'.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        decoded, _attentions = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(decoded))
        print('')

## Train and evaluate

In [None]:
# Build an encoder and an attention decoder that share the same hidden size,
# move both to `device`, then train them.
# NOTE(review): AttnDecoderRNN, input_lang and output_lang are not defined in
# this part of the notebook -- confirm they exist before running this cell.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# 75000 training iterations, printing the average loss every 5000.
trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

In [2]:
# Scratch example: strip numbered tags of the form <[N> and </[N> from a line.
line='this is a paragraph with<[1> in between</[1> and then there are cases ... where the<[99> number ranges from 1-100</[99>. and there are many other lines in the txt files with<[3> such tags </[3>'
import re
# "<", an optional "/", a literal "[", one or more digits, then ">".
line = re.sub(r"</?\[\d+>", "", line)
line

'this is a paragraph with in between and then there are cases ... where the number ranges from 1-100. and there are many other lines in the txt files with such tags '

In [None]:
import re
# Scratch example: replace numeric figures in a free-text comment by placeholders.
comment = 'baselines driven by improving pos l4 +3%'
# Percentages such as "3%" or "12.5%" become "[%]".
aa = r"[0-9]+(\.[0-9]+)?\%"
comment = re.sub(aa, "[%]", comment)
# Negative dollar amounts in millions such as "-$1.23M" become "[-]".
# The decimal point is escaped: an unescaped "." matches any character, so
# strings like "-$1x23M" would have been replaced as well.
neg_dollar_pattern = r"\-\$[0-9]\.[0-9][0-9]M\b"
comment = re.sub(neg_dollar_pattern, "[-]", comment)
comment

In [None]:
# NOTE(review): scratch regex kept as a plain string, written JavaScript-style
# (wrapped in "/.../"). Because this is a non-raw Python string literal, "\b"
# here is a backspace character rather than a word boundary, and the unescaped
# "$" would act as an end-of-string anchor if the text were compiled with `re`.
"/(^|\W)$[0-9]+(\.[0-9][0-9])?\b/"

## Updated version of Keras

In [135]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Sizes of the toy encoder-decoder model.
num_encoder_tokens = 20   # feature width of each encoder time step
latent_dim = 200          # number of LSTM units in both encoder and decoder

num_decoder_tokens = 30   # feature width of each decoder time step / output

# The former placeholder assignments `encoder_inputs = 1` and
# `decoder_outputs = 150` were removed: both names were overwritten by Keras
# tensors below before ever being read.
# NOTE(review): this cell rebinds `encoder` and `model`, which earlier cells
# bind to the PyTorch EncoderRNN / NGramLanguageModeler objects.

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None, 20)     0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None, 30)     0                                            
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, 200), (None, 176800      input_5[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 200),  184800      input_6[0][0]                    
                                                                 lstm_5[0][1]                     
          

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 100)    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 50)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 200), (None, 240800      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 200),  200800      input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          