# Unilever encoder decoder RNN model

## Model

<img src="./images/architecture2.png">

In [3]:
# ---- Imports -----
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import pandas as pd, numpy as np

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device:', device)

device: cpu


## Reading data for the encoder
Get the data of Brand/territory variance matrix by month.
Not all brands are shipped to all territories. Therefore, filtering on a specific brand may return only some of the territories. That is why it is important to take the full list of territories and assign a zero to those that are missing.

In [9]:
#Import dataframes from pickle file (saved previously)
# NOTE(review): `load_from_pickle` is a project helper that is not shown here; if it
# unpickles the file, only load pickles coming from a trusted source — confirm.
from helper_save_load import load_from_pickle
# The file yields three dataframes. Only `df_v` is kept; the cells below filter and
# group it on its 'Brand' and 'Territory' columns.
df_a, df_f, df_v = load_from_pickle("dataframes_Dollars.pickle")
# Drop the two dataframes that are not used in this notebook
del df_a, df_f

### Grouping territories

In [None]:
# Distinct territory labels found in df_v, in the order pandas reports the group keys.
# Grouping by the scalar column name (rather than a one-element list) keeps the keys
# plain labels; list groupers are being moved to tuple keys in newer pandas versions.
territories = list(df_v.groupby('Territory').groups)
print(territories)
print(len(territories),' territories')

In [11]:
# One-row, all-zero frame with one column per territory; it serves as the alignment
# template and as the zero result in get_pivot_month_Territory_by_brand.
empty_df = pd.DataFrame(0.0, index=[0], columns=territories)

### Retrieving variance vector from brand and month
This function retrieves the variance by territory for a given month and brand from the A/F dataset. Multiple brands are not supported; only `Brand_1` is considered in this study. The order of territories is the same as in the `territories` vector. If no data is available, a zero vector is returned.

In [12]:
#return pivot table for the required month in Millions of $
def get_pivot_month_Territory_by_brand(month, brand, flatten=1):
    """Return the per-territory totals of one brand for one month, scaled by 1e-6.

    Reads the module-level ``df_v`` (columns 'Brand', 'Territory' and one column
    per month) and ``empty_df`` (one zero-filled column per known territory).

    Parameters
    ----------
    month : column label of ``df_v`` holding the values to sum (e.g. 'Jan_2018').
    brand : value of the 'Brand' column to select.
    flatten : when equal to 1 (default) a flat NumPy array is returned,
        otherwise a one-row DataFrame.

    Returns
    -------
    The summed values divided by 1e6, laid out on the columns of ``empty_df``;
    territories without data for the brand are 0. If the brand has no rows at
    all, an all-zero result is returned (a copy, so callers cannot modify the
    shared ``empty_df``).
    """
    brand_rows = df_v[df_v['Brand'] == brand]
    if brand_rows.empty:
        # No data for this brand: hand back zeros without touching the shared template
        result = empty_df.copy()
    else:
        # Sum the requested month per territory; only that month column is aggregated
        result = pd.pivot_table(brand_rows, values=month, index='Brand',
                                columns='Territory', aggfunc='sum', fill_value=0) / 1e6
        # Align on the full territory list; DataFrame.append (used previously for
        # this alignment) no longer exists in pandas >= 2.0
        result = result.reindex(columns=empty_df.columns, fill_value=0.0)
    if flatten == 1:
        result = result.values.flatten()
    return result


# Example call: flattened per-territory vector for one brand and one month
get_pivot_month_Territory_by_brand('Jan_2018', '05-AXE SA Brand', 1)

array([ 2.027350e-03,  1.380900e-03,  0.000000e+00,  0.000000e+00,
       -9.378000e-05,  1.371011e-02, -6.985500e-04, -1.274860e-03,
        0.000000e+00,  0.000000e+00,  2.840290e-03,  1.669580e-03,
       -5.781450e-03,  3.096440e-03, -1.241803e-02,  0.000000e+00,
        1.315100e-04, -8.686940e-03,  0.000000e+00, -5.194400e-04,
        2.118400e-04,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  5.971960e-03,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00, -2.078000e-04])

## Reading data for the decoder
Get commentaries and dictionary from file

In [None]:
#Import dataframes from pickle file (saved previously)
from helper_save_load import load_from_pickle
# The pickle provides the commentary dataframe, the vocabulary and the two
# word <-> index lookup tables.
# NOTE(review): the exact container types are not visible here — confirm in helper_save_load.
dfc, vocab, word_to_ix, ix_to_word = load_from_pickle("commentaries.pickle")
# `display` is not imported in this file; it is expected to come from the
# IPython/Jupyter session, so this cell will not run as a plain script.
display(dfc.head(2))
# Quick sanity check of both lookup directions
print('index of word lcl:', word_to_ix['lcl'])
print('word at index 0:', ix_to_word[0])

In [14]:
# Sanity check: for every commentary row, the columns of the (unflattened) variance
# frame must match the `territories` vector, position by position.
print('Comparing results columns and territories vector ...')
for _, row in dfc.iterrows():
    vector = get_pivot_month_Territory_by_brand(row['Month_f'], row['Brand_1'], 0)
    columns = vector.columns.tolist()
    diff = [i for i, j in zip(columns, territories) if i != j]
    # zip() stops at the shorter sequence, so a missing or surplus column would go
    # unnoticed; report whatever lies beyond the common length as a difference too.
    common = min(len(columns), len(territories))
    diff += columns[common:] + list(territories[common:])
    if diff:
        print('Differences found !!!!!')
        print(row['Month_f'], '**', row['Comment_w'], '**', row['Brand_1'])
        print(diff)
print('All columns were parsed, the differences should be shown by the loop if there are any!')

Comparing results columns and territories vector ...
All columns were parsed, the difference should be shown by the loop if there are any!


## Encoder (RNN1)
Basic structure from: <a href="http://localhost:8888/notebooks/Desktop/Jupyter/pyTorch/seq2seq_translation_tutorial/seq2seq_translation_tutorial.ipynb"> seq2seq_translation</a>

The input of the Encoder RNN is the matrix brands/territories variance for month i. The commentaries of the same month are applied to the decoder. Basically, the encoder creates a context vector that represents the relevant data in the input matrix.
The input matrix should be flattened.

The encoder may not be necessary, as the variance-by-territory vector has only 34 dimensions and can be fed into RNN2 directly.

<img src="./images/encoder-arch.png">



In [50]:
# class EncoderRNN(nn.Module):
#     def __init__(self, input_size, hidden_size):
#         super(EncoderRNN, self).__init__()
#         self.hidden_size = hidden_size
        
#         self.linear = nn.Linear(input_size, hidden_size)
#         self.gru = nn.GRU(hidden_size, hidden_size)

# #         self.embedding = nn.Embedding(input_size, hidden_size)
# #         self.gru = nn.GRU(hidden_size, hidden_size)


#     def forward(self, input, hidden):
# #         embedded = self.embedding(input).view(1, 1, -1)
# #         output = embedded
#         output = self.linear(input) 
#         output, hidden = self.gru(output, hidden)
#         return output, hidden

#     def initHidden(self):
#         return torch.zeros(1, 1, self.hidden_size, device=device)

In [127]:
# encoder = EncoderRNN(VAR_MONTH_DATA_SIZE, 300)
# print(encoder)

## Decoder (RNN2)

The decoder receives the variance vector concatenated with the embedding vector of the current word, and is trained to predict the next word of the commentary for month i and brand k.

**It also makes sense to classify the commentaries into classes, such as: over delivery, driven by territory, orders phased, ...**

<img src="./images/decoder-arch v1.png">

In [15]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [None]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) Keras encoder-decoder model.

    src_vocab / tar_vocab: sizes passed to the input Embedding and to the
    output softmax layer. src_timesteps / tar_timesteps: input length of the
    Embedding and number of repetitions of the encoder output. n_units: width
    of the Embedding and of both LSTM layers.
    """
    # Encoder: masked embedding followed by an LSTM that returns its last output
    encoder_layers = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
    ]
    # Decoder: repeat the encoder output for every target step, run a second
    # LSTM over it and apply a softmax over the target vocabulary at each step
    decoder_layers = [
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(encoder_layers + decoder_layers)



In [None]:
# define model
# NOTE(review): fre_vocab_size, eng_vocab_size, fre_length and eng_length are not
# defined in the visible part of the notebook; they must exist before this cell runs.
model = define_model(fre_vocab_size, eng_vocab_size, fre_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [78]:
#Old pytorch code
#----------------


# class DecoderRNN(nn.Module):
#     def __init__(self, hidden_size, output_size):
#         super(DecoderRNN, self).__init__()
#         self.hidden_size = hidden_size

#         self.embedding = nn.Embedding(output_size, hidden_size)  #input and input sizes are identical
#         self.gru = nn.GRU(hidden_size, hidden_size)
#         self.out = nn.Linear(hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, input, hidden):
#         output = self.embedding(input).view(1, 1, -1)
#         output = F.relu(output)
#         output, hidden = self.gru(output, hidden)
#         output = self.softmax(self.out(output[0]))
#         return output, hidden

#     def initHidden(self):
#         return torch.zeros(1, 1, self.hidden_size, device=device)

In [81]:
# Instantiate the decoder with hidden size 300 and output size 1.
# NOTE(review): the only DecoderRNN definition visible in this file is commented
# out ("Old pytorch code"), so this cell relies on the class still being defined
# in the running session.
decoder = DecoderRNN(300, 1)
print(decoder)

DecoderRNN(
  (embedding): Embedding(1, 300)
  (gru): GRU(300, 300)
  (out): Linear(in_features=300, out_features=1, bias=True)
  (softmax): LogSoftmax()
)


## Global Architecture 

In [90]:
#define components sizes
CONTEXT_SIZE_1 = 300   # hidden size passed to the encoder
CONTEXT_SIZE_2 = 300   # hidden size passed to the decoder
EMBEDDING_DIM = 30     # embedding width, used when building NGramLanguageModeler


# NOTE(review): EncoderRNN and DecoderRNN only appear as commented-out code in this
# file, and VAR_MONTH_DATA_SIZE is not defined in the visible part — confirm they
# exist in the session before running this cell.
encoder = EncoderRNN(VAR_MONTH_DATA_SIZE, CONTEXT_SIZE_1)
decoder = DecoderRNN(CONTEXT_SIZE_2, 1)

# One plain SGD optimizer per network, sharing the same learning rate
learning_rate=0.01
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

# Negative log-likelihood loss; it expects log-probabilities as model output
criterion = nn.NLLLoss()



In [91]:
#iteration here
# Start of a single training iteration: fresh encoder hidden state and cleared
# gradients on both optimizers (torch accumulates gradients between backward calls).
encoder_hidden = encoder.initHidden()
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()


In [None]:
#Training: this cell can be re-run as many times as needed to train further
# NOTE(review): in file order this cell comes before the cell that creates `losses`,
# `loss_function`, `model` (NGramLanguageModeler) and `optimizer`, and `trigrams` is
# not defined in the visible part of the notebook — it only works once those exist.
for epoch in range(60):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    # Keep one summed loss per epoch
    losses.append(total_loss)
    print('epoch %d : Total loss=%.3f' % (epoch, total_loss))
#print(losses)  # The loss decreased every iteration over the training data!

In [None]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The embeddings of the context words are concatenated, passed through one
    ReLU hidden layer of 128 units and projected onto the vocabulary; the
    forward pass returns log-probabilities over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        hidden_units = 128
        # Lookup table with one embedding row per vocabulary word
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Concatenated context embeddings -> hidden layer
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_units)
        # Hidden layer -> one score per vocabulary word
        self.linear2 = nn.Linear(hidden_units, vocab_size)

    def forward(self, inputs):
        # Look up the context embeddings and flatten them into a single row
        context_vector = self.embeddings(inputs).view((1, -1))
        hidden = F.relu(self.linear1(context_vector))
        scores = self.linear2(hidden)
        return F.log_softmax(scores, dim=1)


# Per-epoch loss history filled by the training loop
losses = []
# Negative log-likelihood loss, matching the log_softmax output of the model
loss_function = nn.NLLLoss()
# NOTE(review): `CONTEXT_SIZE` is not defined in the visible part of the notebook
# (only CONTEXT_SIZE_1 / CONTEXT_SIZE_2 are) — confirm which value is intended.
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

optimizer = optim.SGD(model.parameters(), lr=0.01)   #before 0.001

In [None]:
# Training cell: re-run it as often as needed to keep training the same model
for epoch in range(60):
    total_loss = 0
    for context, target in trigrams:
        # Encode the context words and the target word as index tensors
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)

        # torch accumulates gradients, so clear the ones left by the previous sample
        model.zero_grad()

        # Forward pass (log-probabilities over the next word), then the loss
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, target_idx)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # .item() extracts the Python number from the 1-element loss tensor
        total_loss += loss.item()
    losses.append(total_loss)
    print('epoch %d : Total loss=%.3f' % (epoch, total_loss))
#print(losses)  # The loss decreased every iteration over the training data!

In [None]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    The input is fed to the encoder element by element; the decoder then starts
    from SOS_token and the final encoder hidden state. With probability
    `teacher_forcing_ratio` the target token is fed back as next decoder input
    (teacher forcing); otherwise the decoder's own top prediction is used and
    decoding stops once EOS_token is predicted.

    Returns the summed loss divided by the target length.
    """
    hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    n_inputs = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Encoder pass: keep every step output for the decoder
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for pos in range(n_inputs):
        step_output, hidden = encoder(input_tensor[pos], hidden)
        encoder_outputs[pos] = step_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    # Decide once per sample whether the targets or the predictions are fed back
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, hidden, _ = decoder(decoder_input, hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: the ground-truth token is the next input
            decoder_input = target_tensor[di]
            continue

        # Otherwise feed back the most likely token, detached from the graph
        _, best = decoder_output.topk(1)
        decoder_input = best.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train `encoder` and `decoder` for `n_iters` randomly drawn training pairs.

    Each iteration draws one pair from the module-level `pairs`, converts it with
    `tensorsFromPair` and performs one `train` step using SGD optimizers and an
    NLL loss created here. The average loss is printed every `print_every`
    iterations and recorded every `plot_every` iterations; the recorded averages
    are passed to `showPlot` at the end. Nothing is returned.
    """
    # Local import: `time` is not among the notebook's top-level imports
    import time

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Sample (with replacement) all training pairs up front
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    # `step` is 1-based so the modulo tests fire after exactly N iterations
    # (named `step` rather than `iter` to avoid shadowing the builtin)
    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

Plotting results
----------------

Plotting is done with matplotlib, using the array of loss values
``plot_losses`` saved while training.


In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot the sequence of loss values `points` on a new figure.

    The y axis gets a major tick every 0.2. Nothing is returned.
    """
    # plt.subplots() already creates the figure; the former extra plt.figure()
    # call left a second, empty figure open on every invocation.
    _, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedy-decode `sentence` with the trained encoder/decoder pair.

    The sentence is converted with `tensorFromSentence(input_lang, ...)`, run
    through the encoder, and the decoder is then unrolled for at most
    `max_length` steps, always feeding back its top prediction, until
    EOS_token is produced.

    Returns the list of decoded words (ending with '<EOS>' when the end token
    was reached) and the attention rows of the steps that were executed.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        n_inputs = input_tensor.size()[0]
        hidden = encoder.initHidden()

        # Encoder pass: collect one output row per input element
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        for pos in range(n_inputs):
            step_output, hidden = encoder(input_tensor[pos], hidden)
            encoder_outputs[pos] += step_output[0, 0]

        # The decoder starts from SOS and the final encoder hidden state
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoded_words = []
        attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, hidden, step_attention = decoder(
                decoder_input, hidden, encoder_outputs)
            attentions[di] = step_attention.data
            _, topi = decoder_output.data.topk(1)
            token = topi.item()
            if token == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token])

            # Feed the predicted token back in as the next input
            decoder_input = topi.squeeze().detach()

        return decoded_words, attentions[:di + 1]

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` randomly chosen pairs together with the model's output.

    For every sample the source ('>'), the reference ('=') and the decoded
    sentence ('<') are printed, followed by an empty line.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        # The attention matrix returned by evaluate is not used here
        decoded, _attn = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(decoded))
        print('')

## Train and evaluate

In [None]:
# Build both networks with the same hidden size and move them to the selected device.
# NOTE(review): EncoderRNN is only present as commented-out code in this file, and
# AttnDecoderRNN, input_lang and output_lang are not defined in the visible part —
# confirm they exist in the session before running this cell.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# 75000 training iterations, printing the running average loss every 5000
trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

In [2]:
import re

# Sample text containing numbered opening/closing tags such as <[1> ... </[1>
line='this is a paragraph with<[1> in between</[1> and then there are cases ... where the<[99> number ranges from 1-100</[99>. and there are many other lines in the txt files with<[3> such tags </[3>'
# Remove every opening or closing numbered tag, leaving the enclosed text in place
tag_pattern = re.compile(r"</?\[\d+>")
line = tag_pattern.sub("", line)
line

'this is a paragraph with in between and then there are cases ... where the number ranges from 1-100. and there are many other lines in the txt files with such tags '

In [None]:
import re

comment = 'baselines driven by improving pos l4 +3%'
# Percentages such as "3%" or "1.5%" are replaced by the placeholder "[%]"
aa = r"[0-9]+(\.[0-9]+)?\%"
comment = re.sub(aa, "[%]", comment)
# Negative dollar amounts of the form -$d.ddM are replaced by "[-]". The decimal
# point is escaped so it only matches a literal "."; the unescaped "." used before
# matched any character (e.g. "-$1x23M").
neg_dollar = r"\-\$[0-9]\.[0-9][0-9]M\b"
comment = re.sub(neg_dollar, "[-]", comment)
comment

In [None]:
"/(^|\W)$[0-9]+(\.[0-9][0-9])?\b/"

## Updated version of Keras

In [135]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Sizes of the toy encoder-decoder model
num_encoder_tokens = 20   # feature size of each encoder time step
latent_dim = 200          # state size shared by the encoder and decoder LSTMs
num_decoder_tokens = 30   # feature size of each decoder time step and softmax width
# The former placeholders `encoder_inputs = 1` and `decoder_outputs = 150` were
# removed: both names were rebound below before ever being read.


# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Per-step softmax over the decoder tokens
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None, 20)     0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None, 30)     0                                            
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, 200), (None, 176800      input_5[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 200),  184800      input_6[0][0]                    
                                                                 lstm_5[0][1]                     
          

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 100)    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 50)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 200), (None, 240800      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 200),  200800      input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          