# Commentaries file pre-processing

## Preparing the data

### Pre-processing commentaries

In [2]:
# Imports
import numpy as np
import pandas as pd
from helper_save_load import load_excel_spreadsheet

In [None]:
# Load Forecast and Actual sheets
dfc = load_excel_spreadsheet('./datasets/Commentaries dataset v2.xlsm', 'All')

In [None]:
dfc.head(2)

#### Cleaning commentaries
- Removing punctuation, indesirable words or sentences and copy new comment to a new column


In [5]:
#Taking a sample of commentaries
df_sample = dfc     #.loc[0:100,:]
NotNull = df_sample['Commentaries'].notna()
df_comments = df_sample[NotNull].loc[:,'Commentaries']
df_comments.head(5)

2     Driven  by JC/Sobeys/Metro total $300K
6                      Driven by SDM - $100K
19                 Driven by SDM/LCL + $500K
24                             Brand Growth 
31                      Driven by WM - $160K
Name: Commentaries, dtype: object

In [6]:
#Defining codes
SOS = '[SOS]'
EOS = '[EOS]'
MILLIONS_NUM = '[#M]'
THOUSAND_NUM = '[#K]'
NUMBER = '[#]'
PERCENT = '[%]'

In [7]:
#Adding space after +/-, deleting (,),: , deleting 'please see comments above', 
import re
from nltk.stem import PorterStemmer

ps = PorterStemmer()

#Replace patterns of numbers by tokens
resubs = [(r"(\$)?[0-9](\.[0-9]+)?M\b", MILLIONS_NUM),
         (r"(\$)?[0-9]+[Kk]\b", THOUSAND_NUM),
         (r"[0-9]+(\.[0-9]+)?\%", PERCENT),
         (r"\$[0-9]+(\.[0-9]+)?\b", NUMBER)]

raplacements = [('+ [#','+[#'), ('- [#','-[#'),
                ('+','+'),
                ('-','-'),
                ('(',' '),
                (')',' '),
                (':',' '),
                (',',' '),
                ('.',' '),
                (';',''),
                ('\n',' '),
                ('0ml ','0 ml '), ('3ml ','3 ml '), ('4ml ','4 ml '), ('5ml ','5 ml '),
                ('0g ','0 g '), ('3g ','3 g '), ('5g ','5 g '), ('6g ','6 g '),
                ('liption','lipton'),
                ('please see comments above', ''),
                ('please see comments below', ''),
                ('Please see comments above',''),
                ('Please see comments below',''),
               ]

for i, comment in zip(df_comments.index,df_comments):
    for r in resubs:
        comment = re.sub(r[0],r[1], comment)
    comment = comment.lower()
    for r in raplacements:
        comment = comment.replace(r[0],r[1])
    result = re.split(' |/|:', comment)
    result2 = [ps.stem(m) for m in result if m!='']
    df_sample.loc[i,'Comment_w'] = SOS + ' ' + ' '.join(result2) + ' ' + EOS

In [8]:
df_all_com = df_sample[NotNull].loc[:,'Comment_w']

In [None]:
[comment for comment in df_all_com if 'cancel' in comment]

In [10]:
com_corpus = ' '.join(df_all_com.ravel()).split()

In [11]:
# com_corpus

In [None]:
df_sample[NotNull].loc[0:100,['Commentaries', 'Comment_w']]

In [None]:
dfc.head(3)

### Creating dictionnary

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x25cd1553f30>

In [15]:
#Create vocabulary and dictionary
vocab = set(com_corpus)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}
print('Size of vocab :', len(vocab))
print('Size of dictionary :', len(word_to_ix))

# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
#trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
#            for i in range(len(test_sentence) - 2)]

Size of vocab : 934
Size of dictionary : 934


In [None]:
for v in sorted(vocab):
    print (v)

In [17]:
#Save dictionnary and commentaries dataframe to pickle file
from helper_save_load import save_to_pickle
save_to_pickle("commentaries.pickle", (dfc, vocab, word_to_ix, ix_to_word))

### Creating word embedding using pytorch

In [None]:
#Create tri-grams from all comments [word1,word2,target]
trigrams = []
for comment in df_all_com:
    comment_list = comment.split()
    tri = [([comment_list[i], comment_list[i + 1]], comment_list[i + 2])
            for i in range(len(comment_list) - 2)]
    for tr in tri:
        trigrams.append(tr)
# print the first 3, just so you can see what they look like
print(df_all_com[df_all_com.index[0]])
print('\nExamples of trigrams for training:')
for tri in trigrams[:4]:
    print(tri)

In [19]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 200

class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim) #Embedding matrix: each line is the embedding of one word
        self.linear1 = nn.Linear(context_size * embedding_dim, 128) #Parameter matrix embedding and hidden layer
        self.linear2 = nn.Linear(128, vocab_size)  #Parameter matrix between hidden layer and output

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))  #get embedding from Embedding matrix
        out = F.relu(self.linear1(embeds))  #
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


losses = []
loss_function = nn.NLLLoss()   #Negative log likelihood loss (multiclass output with softmax)
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.01)  #stochastic gradient descent   #before 0.001

In [20]:
#Training, you can re-run this function as much time as needed to train more
for epoch in range(20):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    print('epoch %d : Total loss=%.3f' % (epoch, total_loss))
#print(losses)  # The loss decreased every iteration over the training data!

epoch 0 : Total loss=40003.434
epoch 1 : Total loss=31193.424
epoch 2 : Total loss=26502.132
epoch 3 : Total loss=22896.835
epoch 4 : Total loss=19967.092
epoch 5 : Total loss=17768.143
epoch 6 : Total loss=16317.012
epoch 7 : Total loss=15371.235
epoch 8 : Total loss=14787.410
epoch 9 : Total loss=14327.685
epoch 10 : Total loss=13964.084
epoch 11 : Total loss=13641.631
epoch 12 : Total loss=13354.252
epoch 13 : Total loss=13152.566
epoch 14 : Total loss=12964.661
epoch 15 : Total loss=12752.363
epoch 16 : Total loss=12594.136
epoch 17 : Total loss=12455.578
epoch 18 : Total loss=12286.438
epoch 19 : Total loss=12178.475


In [21]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure()
plt.plot(losses);

In [22]:
context, target = trigrams[133]
print('Context:', context)
print('Target:', target)

#Predict next word using model
context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
log_probs = model(context_idxs)

#Get the word that has max probability and display it
result = log_probs.data.numpy().tolist()[0]   #Convert tensor to list
indexmax = result.index(np.max(result)) #get index of the greatest probable word
print('Prediction:', ix_to_word[indexmax])

Context: ['[SOS]', 'driven']
Target: by
Prediction: by


#### Useful function to predict next word, calculate distance between words

In [23]:
def predict_next(context):
    #Predict next word using model
    context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
    log_probs = model(context_idxs)

    #Get the word that has max probability and display it
    result = log_probs.data.numpy().tolist()[0]   #Convert tensor to list
    indexmax = result.index(np.max(result)) #get index of the greatest probable word
    return (ix_to_word[indexmax])

def embedding_word(word):
    word_idx = torch.tensor([word_to_ix[word]], dtype=torch.long)
    return(model.embeddings(word_idx).data.numpy().tolist()[0])

def distance_words_pytorch(word1,word2):    #between -1 and 1
    from scipy import spatial
    return(1-spatial.distance.cosine(embedding_word(word1), embedding_word(word2)))

def predict_next_multi(context, topn):
    #Predict next word using model
    context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
    log_probs = model(context_idxs)

    #Get the word that has max probability and display it
    result = log_probs.data.numpy().tolist()[0]   #Convert tensor to list
    result_s = sorted(result, reverse=True)
    
    mydict = [(ix_to_word[result.index(p)], np.exp(p)) for p in result_s[0:topn]]
    return (mydict)

In [None]:
predict_next_multi(['by', 'half'], 10)

In [None]:
predict_next_multi(['driven', 'by'], 10)

In [None]:
predict_next_multi(['by', 'lcl'], 10)

In [27]:
predict_next(['[SOS]','driven'])

'by'

In [28]:
predict_next(['expert','segment'])

'sh'

In [38]:
#Basic commentary generation from distance between words
word1 = SOS
word2 = 'half'
word=''
sentence = [word2]
i=0
while (word!= EOS) & (i<20):
    word = predict_next([word1, word2])
    sentence.append(word)
    word1 = word2
    word2 = word
    i += 1
    
print(' '.join(sentence[0:-1]))

half volum phase out of june +[#m] multipl account


In [31]:
def closet_words_pytorch(target, topn):   #Does not work well based on the embedding distances
    import operator
    mydict = [(m, distance_words_pytorch(target,m)) for m in list(vocab)]
    sorted_tuples = sorted(mydict, key=operator.itemgetter(1), reverse=True)
    return(sorted_tuples[1:topn])

In [34]:
closet_words_pytorch('driven',10) 

[('led', 0.20262270399837568),
 ('-[%]', 0.19083462441432775),
 ('bit', 0.18448048814438656),
 ('ly', 0.1811901528649147),
 ('packag', 0.1801820887018959),
 ('neg', 0.17552524775024658),
 ('cd', 0.1713322477884729),
 ('deliv', 0.16235459395322738),
 ('start', 0.16057906795451005)]

In [35]:
predict_next_multi([SOS, '+[#m]'],10) 

[('masterbrand', 0.2487331639088039),
 ('costco', 0.14434062808406664),
 ('pjc', 0.12098206119506873),
 ('sdm', 0.106232972504898),
 ('wm', 0.08852266565205603),
 ('wmt', 0.07560365918176581),
 ('lcl', 0.04569877335763406),
 ('multipl', 0.028043257946946846),
 ('distress', 0.01699105929699022),
 ('pw', 0.013071440110739773)]

In [62]:
distance_words_pytorch('sobey','lcl')

0.22757552606022324

In [63]:
word = 'doh'
word_to_ix[word]
word_idx = torch.tensor([word_to_ix[word]], dtype=torch.long)
model.embeddings(word_idx)

tensor([[ 0.3756,  0.0396,  0.1490,  0.1667, -0.8133,  0.0488,  0.7215, -0.6928,
         -1.5910, -1.1907, -0.0978, -0.0095, -0.9462, -1.2306, -0.1153, -0.1944,
         -0.4340,  0.1308, -0.8175,  0.4913]], grad_fn=<EmbeddingBackward>)

#### Save/Load model

In [64]:
#Save model
torch.save(model.state_dict(), './embedding_model_params')

In [65]:
# Load model
#The model class should be defined
model.load_state_dict(torch.load('./embedding_model_params'))
model.eval()

NGramLanguageModeler(
  (embeddings): Embedding(932, 20)
  (linear1): Linear(in_features=40, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=932, bias=True)
)

### Apply word2vec for embedding

In [None]:
#Using word2vec
import gensim
from gensim.models import Word2Vec

documents = [m.split() for m in df_all_com.tolist()]
documents[0:2]

In [67]:
# build vocabulary and train model with word2vec
w2v_model = gensim.models.Word2Vec(documents, size=200, window=3, min_count=1, workers=4)
w2v_model.train(documents, total_examples=len(documents), epochs=500) 
#w2v_model.wv.vocab   #displaying vocabulary

(2697213, 4148000)

In [68]:
print(w2v_model.wv.most_similar('driven', topn=10))

[('offset', 0.3579862117767334), ('hair', 0.33692508935928345), ('femal', 0.31099462509155273), ('tea', 0.3058239221572876), ('equal', 0.30187147855758667), ('tre', 0.28853273391723633), ('habl', 0.2797410190105438), ('rr', 0.26362720131874084), ('bw', 0.25601649284362793), ('cca', 0.25533267855644226)]


  if np.issubdtype(vec.dtype, np.int):


In [71]:
positive1 = 'driven'
negative1 = 'by'

positive2 = 'wm'

print(w2v_model.wv.most_similar([positive1, positive2], [negative1], topn=1))

[('sdm', 0.24068258702754974)]


In [74]:
#plot words similar to a given word
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
import pandas as pd

from scipy import spatial
def distance_words(word1,word2):    #between -1 and 1
    return(1-spatial.distance.cosine(w2v_model.wv[word1], w2v_model.wv[word2]))

def get_distance_words(words, target):
    words = words.split()
    distances=[]
    for word in words:
        distance = distance_words(target,word)
        distances.append((word,distance))
    distances = sorted(distances, key=lambda tup: tup[1], reverse=True)
    return(distances)

def plot_distance_words_1D(words, highlight_word):
    words = words + " " + highlight_word
    
    distances = get_distance_words(words, highlight_word)
    words = [word for word,i in distances]
    ds = [item[1] for item in distances]
    
    output_notebook()
    df = pd.DataFrame(columns=['x', 'y', 'word'])
    
    ds = [1-d for d in ds]
    angles = [i* 2 * np.math.pi / len(distances) for i in range(len(distances))]
    x_values = [d * np.math.cos(angle) for d,angle in zip(ds,angles)]
    y_values = [d * np.math.sin(angle) for d,angle in zip(ds,angles)]
    x_values[0] = 0
    
    df['x'], df['y'], df['word'] = x_values, y_values, words

    source = ColumnDataSource(ColumnDataSource.from_df(df))
    labels = LabelSet(x="x", y="y", text="word",
                      text_font_size="10pt", text_color="#555555",
                      x_offset=13, y_offset=-6,
                      source=source, text_align='left')

    plot = figure(plot_width=900, plot_height=600)
    alphas = [1-d if d<1 else 0 for d in ds]

    for (index, row), alpha in zip(df.iterrows(),alphas):
        plot.line([0, row['x']], [0, row['y']], line_width=1, line_color="green", alpha=alpha)
        
    plot.circle("x", "y", size=10, source=source, line_color="black", fill_alpha=0.9)
    plot.add_layout(labels)

    source1 = ColumnDataSource(ColumnDataSource.from_df(df[df['word']==highlight_word]))
    plot.circle("x", "y", size=20, source=source1, fill_color='red', line_color="red", fill_alpha=1)    
    
    show(plot, notebook_handle=True)

def plot_close_word_1D(target, topn=10):
    words = w2v_model.wv.most_similar(target, topn=topn)
    words = [word for word,i in words]
    words = ' '.join(words)
    plot_distance_words_1D(words, target)
    
plot_close_word_1D("driven", 20)
plot_distance_words_1D("lcl sobey rotat by build overdeliveri", "driven")