In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm

import time
import math

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pickle


# Device used by the notebook when allocating tensors and placing the models:
# the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Introduction

Keyword extraction is usually done with traditional scoring methods (e.g. TF-IDF). However, these methods tend to perform poorly on short texts. Here we overcome this difficulty with a supervised learning method (i.e. we show the model what the desired output looks like and let it learn the patterns). Using a sequence-to-sequence neural network, which consists of two RNNs, one serving as the encoder and one as the decoder, we achieved good results when tested on a Chinese e-commerce platform.

However, one difficulty we have to overcome is finding good targets for the model to learn from. Luckily, if you are working at an e-commerce company, you already have natural targets for your model: the words customers search for. By pairing a product's description with the words a customer searched for before clicking on that product, we can produce reasonable training data for the model.

Reference: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

# Functions for preparing data

Getting a list of words from a sentence. This function is particularly useful for e-commerce sites that use Chinese as their main language. Note that one can modify this part according to one's needs.

In [None]:
def get_word_list(s1):
    """Split a sentence into a list of tokens.

    The text is lower-cased first. Every CJK ideograph in the range
    U+4E00..U+9FA5 becomes a token of its own, every run of ASCII letters,
    digits, ``_`` and ``-`` becomes one token, and every run of other
    characters (spaces, punctuation, ...) is kept as a token as well.

    Args:
        s1: The sentence to tokenize.

    Returns:
        The non-empty tokens, in the order they appear in ``s1``.
    """
    # The capturing group makes ``split`` keep the separators in its result.
    splitter = re.compile('([\u4e00-\u9fa5]|[^a-zA-Z0-9_-]+)')
    ideograph = re.compile(r"([\u4e00-\u9fa5])")

    tokens = []
    for piece in splitter.split(s1.lower()):
        # A separator run that starts with a non-ideograph (e.g. a space) can
        # swallow the ideographs that follow it, so every piece is split again
        # on single ideographs. ``split`` always returns a list, never None.
        tokens.extend(ideograph.split(piece))

    return [token for token in tokens if len(token) != 0]


The `Lang` helper class maps each word to an integer index (its position in a one-hot vocabulary) and back, and counts how often each word occurs.

In [None]:
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary that assigns an integer index to every word it is shown.

    Attributes are kept as plain dictionaries:
      * ``word2index``: word -> index
      * ``word2count``: word -> number of times the word was added
      * ``index2word``: index -> word (pre-filled with the SOS/EOS markers)
      * ``n_words``: number of entries, including SOS and EOS
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        # SOS and EOS already occupy the first two indexes.
        self.n_words = len(self.index2word)

    def addWord(self, word):
        """Register one occurrence of ``word``, indexing it on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

    def addSentence(self, sentence):
        """Tokenize ``sentence`` with ``get_word_list`` and add every token."""
        for token in get_word_list(sentence):
            self.addWord(token)

In [None]:
def getPairs(version_num,reverse=False):
    """Read the tab-separated training file for one data version.

    The file ``training_data_<version_num>.csv`` is read from the current
    working directory as UTF-8. Each non-empty line is split on tab
    characters into a list of fields.

    Args:
        version_num: Value inserted into the file name via ``str()``.
        reverse: When true, the order of the fields of every line is reversed.

    Returns:
        A list with one list of fields per non-empty line.
    """
    print("Reading lines...")

    # Read the file and split into lines; the ``with`` block guarantees the
    # file handle is closed again.
    path = 'training_data_'+str(version_num)+'.csv'
    with open(path, encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into its fields. Blank lines (including the single
    # empty "line" of an empty file) carry no pair and are skipped.
    pairs = [line.split('\t') for line in lines if line]

    # Optionally swap the direction of every pair.
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]

    return pairs

In [None]:
MAX_IN_LENGTH = 100


def filterPair(p):
    """Return True when both sides of pair ``p`` have fewer than MAX_IN_LENGTH tokens."""
    if len(get_word_list(p[0])) >= MAX_IN_LENGTH:
        return False
    return len(get_word_list(p[1])) < MAX_IN_LENGTH


def filterPairs(pairs):
    """Return the pairs accepted by ``filterPair``, in their original order."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate):
            kept.append(candidate)
    return kept

In [None]:
def prepareData(version_num,reverse=False): #reverse=False means description to keywords
    """Load the training pairs and build the vocabulary from them.

    Args:
        version_num: Data version forwarded to ``getPairs``.
        reverse: Forwarded to ``getPairs``; swaps the direction of every pair.

    Returns:
        A ``(lang, pairs)`` tuple: the ``Lang`` vocabulary built from both
        sides of every kept pair, and the pairs that passed ``filterPairs``.
    """
    pairs = getPairs(version_num,reverse=reverse)
    print("Read %s sentence pairs" % len(pairs))
    lang1 = Lang("My_Lang")

    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    # The context manager closes the progress bar even if counting fails.
    with tqdm(total=len(pairs),position=0, leave=True) as pbar:
        for pair in pairs:
            # Both sides share a single vocabulary.
            lang1.addSentence(pair[0])
            lang1.addSentence(pair[1])
            pbar.update()

    print("Counted words:")
    print(lang1.n_words)
    return lang1, pairs

# Model

### Encoder

An encoder for encoding the product description. We use a GRU here for simplicity; one can change it to an LSTM for a more complex model.

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes the input sequence one token at a time.

    Args:
        input_size: Number of rows of the embedding table (vocabulary size).
        hidden_size: Size of both the embeddings and the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode a single token.

        Args:
            input: Index tensor holding one token id.
            hidden: Previous hidden state of shape ``(1, 1, hidden_size)``.

        Returns:
            ``(output, hidden)`` as returned by the GRU, each of shape
            ``(1, 1, hidden_size)``.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) as expected by the GRU.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, 1, hidden_size)``.

        The state is allocated with the device and dtype of the module's own
        parameters, so it always matches the module regardless of any
        notebook-level ``device`` variable.
        """
        return self.embedding.weight.new_zeros(1, 1, self.hidden_size)

### Decoder

A decoder for producing the predicted output keywords. Here we use an attention mechanism to improve performance.  
Reference: https://arxiv.org/abs/1706.03762

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder positions.

    Args:
        hidden_size: Size of the embeddings and of the GRU hidden state.
        output_size: Number of output classes (vocabulary size).
        dropout_p: Dropout probability applied to the embedded input token.
        max_length: Number of encoder positions the attention layer scores;
            ``encoder_outputs`` passed to ``forward`` must have this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_IN_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Scores one weight per encoder position from [embedded ; hidden].
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Merges [embedded ; attention context] back to hidden_size.
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Args:
            input: Index tensor holding the previous output token id.
            hidden: Previous hidden state of shape ``(1, 1, hidden_size)``.
            encoder_outputs: Tensor of shape ``(max_length, hidden_size)``.

        Returns:
            ``(output, hidden, attn_weights)`` where ``output`` holds the
            log-probabilities of shape ``(1, output_size)``, ``hidden`` is the
            new GRU state and ``attn_weights`` has shape ``(1, max_length)``.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention weights over the encoder positions (softmax over dim 1).
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of the encoder outputs: (1, 1, L) x (1, L, H) -> (1, 1, H).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)

        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, 1, hidden_size)``.

        The state is allocated with the device and dtype of the module's own
        parameters, so it always matches the module regardless of any
        notebook-level ``device`` variable.
        """
        return self.embedding.weight.new_zeros(1, 1, self.hidden_size)

# Training

### Helper functions for visualizing progress

In [None]:
def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: Start timestamp, as returned by ``time.time()``.
        percent: Fraction of the work completed so far (must be non-zero).

    Returns:
        ``'<elapsed> (- <remaining>)'``, both parts formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the fraction already done.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
plt.switch_backend('agg')
def showPlot(points):
    %matplotlib inline
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

### Preparing Training Data

In [None]:
def indexesFromSentence(lang, sentence):
    """Return the vocabulary index of every token of ``sentence``.

    Raises ``KeyError`` for a token that is missing from ``lang.word2index``.
    """
    indexes = []
    for token in get_word_list(sentence):
        indexes.append(lang.word2index[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Convert ``sentence`` to a ``(n_tokens + 1, 1)`` long tensor ending in EOS."""
    token_ids = indexesFromSentence(lang, sentence)
    token_ids.append(EOS_token)
    as_tensor = torch.tensor(token_ids, dtype=torch.long, device=device)
    # One token id per row.
    return as_tensor.view(-1, 1)


def tensorsFromPair(pair):
    """Return ``(input_tensor, target_tensor)`` for ``pair``.

    Both sides are indexed with the notebook-global vocabulary ``lang1``.
    """
    source = tensorFromSentence(lang1, pair[0])
    target = tensorFromSentence(lang1, pair[1])
    return (source, target)

### Train functions

In [None]:
teacher_forcing_ratio = 0.7


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_IN_LENGTH):
    """Run one optimization step on a single (input, target) pair.

    The input is encoded token by token, then the decoder is unrolled over the
    target. With probability ``teacher_forcing_ratio`` the ground-truth token
    is fed as the next decoder input (teacher forcing); otherwise the
    decoder's own top prediction is fed back and decoding stops once it
    predicts EOS.

    Args:
        input_tensor: Token ids of the input, one per row.
        target_tensor: Token ids of the target, one per row.
        encoder: Encoder module providing ``initHidden`` and ``hidden_size``.
        decoder: Attention decoder module.
        encoder_optimizer: Optimizer for the encoder parameters.
        decoder_optimizer: Optimizer for the decoder parameters.
        criterion: Loss applied to each decoder output and target token.
        max_length: Number of rows of the encoder output buffer; the input
            must not be longer than this.

    Returns:
        The summed loss of this step divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Rows past input_length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    # The decoder starts from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input.
            decoder_input = target_tensor[di]
        else:
            # Without teacher forcing: use its own prediction as the next input.
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on ``n_iters`` randomly drawn pairs, one pair per step.

    Pairs are sampled (with replacement) from the notebook-global ``pairs``.
    Every ``print_every`` steps the average loss and timing are printed; every
    ``plot_every`` steps the average loss is appended to the notebook-global
    ``plot_losses`` list.

    Args:
        encoder: Encoder module to train.
        decoder: Decoder module to train.
        n_iters: Number of training steps.
        print_every: Interval, in steps, between progress lines.
        plot_every: Interval, in steps, between recorded plot points.
        learning_rate: Learning rate of both SGD optimizers.
    """
    global plot_losses

    started_at = time.time()

    running_print_loss = 0
    running_plot_loss = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # Draw and tensorize all training samples up front.
    training_pairs = []
    for _ in range(n_iters):
        training_pairs.append(tensorsFromPair(random.choice(pairs)))

    criterion = nn.NLLLoss()
    print("Start training...")
    for step, sample in enumerate(training_pairs, start=1):
        step_loss = train(sample[0], sample[1], encoder,
                          decoder, encoder_optimizer, decoder_optimizer, criterion)
        running_print_loss += step_loss
        running_plot_loss += step_loss

        if step % print_every == 0:
            average_print_loss = running_print_loss / print_every
            running_print_loss = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                         step, progress * 100, average_print_loss))

        if step % plot_every == 0:
            plot_losses.append(running_plot_loss / plot_every)
            running_plot_loss = 0

    #showPlot(plot_losses)

### Evaluate function

In this function, we produce three predicted keywords for each description by feeding the second and third most probable first words into the model as the start of the second and third keywords, respectively.

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_IN_LENGTH):
    """Predict up to three keyword sequences for ``sentence``.

    The sentence is encoded once. The decoder's first step yields the most
    probable first tokens (three, or fewer for a tiny vocabulary); each of
    them seeds its own greedy decoding run, starting from the same hidden
    state.

    Both modules are switched to eval mode for the duration of the call, so
    dropout is disabled and the predictions are deterministic; their previous
    training modes are restored afterwards.

    Args:
        encoder: Trained encoder module.
        decoder: Trained attention decoder module.
        sentence: Text to extract keywords from; it is indexed with the
            notebook-global vocabulary ``lang1``.
        max_length: Number of encoder positions and cap on the decoded length.

    Returns:
        ``(decoded_words_all, probability_all)``: two aligned lists with one
        entry per candidate. Each entry of the first is the list of decoded
        words (ending in ``'<EOS>'`` when EOS was predicted); each entry of
        the second holds the matching per-step probabilities, rounded to two
        decimals and formatted as strings.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(lang1, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoded_words_all = []
            probability_all = []

            decoder_input = torch.tensor([[SOS_token]], device=device)

            # First decoder step: shared by all candidates.
            decoder_output, decoder_hidden_first, _ = decoder(decoder_input, encoder_hidden, encoder_outputs)
            n_candidates = min(3, decoder_output.size(1))
            topv_first, topi_first = decoder_output.data.topk(n_candidates)

            topv_first = topv_first[0]
            topi_first = topi_first[0]
            for i in range(n_candidates):
                decoded_words = []
                # The decoder emits log-probabilities; exp() turns them back
                # into probabilities.
                probability = [str(round(math.exp(topv_first[i].item()),2))]

                first_index = topi_first[i].item()
                if first_index == EOS_token:
                    # An immediate EOS is still a candidate: record it instead
                    # of dropping it, so both result lists stay aligned.
                    decoded_words.append('<EOS>')
                else:
                    decoded_words.append(lang1.index2word[first_index])

                    # Every candidate continues from the same first-step state.
                    decoder_hidden = torch.clone(decoder_hidden_first)
                    decoder_input = topi_first[i].squeeze().detach()
                    for di in range(max_length-1):
                        decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

                        topv, topi = decoder_output.data.topk(1)
                        probability.append(str(round(math.exp(topv.item()),2)))

                        if topi.item() == EOS_token:
                            decoded_words.append('<EOS>')
                            break
                        decoded_words.append(lang1.index2word[topi.item()])

                        decoder_input = topi.squeeze().detach()

                decoded_words_all.append(decoded_words)
                probability_all.append(probability)

            return decoded_words_all, probability_all
    finally:
        # Restore the modes so that training can be resumed afterwards.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print input, target and predictions for ``n`` randomly chosen pairs.

    Samples are drawn from the notebook-global ``pairs`` list.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('Input : ', sample[0])
        print('Target : ', sample[1])
        predicted_words, predicted_probabilities = evaluate(encoder, decoder, sample[0])
        print(predicted_words)
        print(predicted_probabilities)

# Start Running

### Load and save prepared-data

Typically you would want to do this just once, and use the saved prepared-data in the future.

In [None]:
# Build the vocabulary and the filtered training pairs from
# training_data_v5.csv; reverse=True swaps the two sides of every line.
lang1,pairs = prepareData("v5", reverse=True)

# Uncomment to cache the prepared data on disk. These are real comments
# rather than a string literal, so the cell does not echo them as its output.
# with open('lang.pkl', 'wb') as output:
#     pickle.dump(lang1, output, pickle.HIGHEST_PROTOCOL)
#
# with open('trimmed.pkl', 'wb') as output:
#     pickle.dump(pairs, output, pickle.HIGHEST_PROTOCOL)


### Create encoder and decoder instances

In [None]:
# Size shared by the embeddings and the GRU hidden state of both networks.
hidden_size = 256
# Encoder and decoder both use the vocabulary size of lang1.
encoder1 = EncoderRNN(lang1.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, lang1.n_words, dropout_p=0.3).to(device)
# Loss history that trainIters appends to; re-running this cell resets it
# together with the freshly created models.
plot_losses = []

### Start training

Since the training data is sampled randomly, one can stop the training to evaluate and then re-run this block to continue training.

In [None]:

# Train on 7500 randomly drawn pairs: print the average loss every 500 steps
# and record a point in plot_losses every 100 steps.
trainIters(encoder1, attn_decoder1, 7500, print_every=500,plot_every=100)


In [None]:
# Plot the average losses recorded so far by trainIters.
showPlot(plot_losses)

### Save model using pickle

In [None]:

# Both modules are pickled one after the other into the same file, so they
# have to be read back with two consecutive pickle.load calls in the same
# order (encoder first, then decoder).
# NOTE(review): unpickling needs the EncoderRNN / AttnDecoderRNN class
# definitions to be available, and pickle.load must only be used on trusted
# files.
with open('model.pkl', 'wb') as output:
    
    pickle.dump(encoder1, output, pickle.HIGHEST_PROTOCOL)
    
    pickle.dump(attn_decoder1, output, pickle.HIGHEST_PROTOCOL)

    


### Evaluate

In [None]:
# Print input, target and predicted keywords for 10 (the default) random pairs.
evaluateRandomly(encoder1, attn_decoder1)