In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import activations
from tensorflow.keras.layers import Layer, Input, Embedding, LSTM, Dense, Attention
from tensorflow.keras.models import Model

In [2]:
reviews = pd.read_csv("Reviews.csv")

In [3]:
reviews.shape

(568454, 10)

In [4]:
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
reviews.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [6]:
# Remove null values and unneeded features
reviews = reviews.dropna()
reviews = reviews.drop(['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Score','Time'],1)
reviews = reviews.reset_index(drop=True)

In [7]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [8]:
def clean_text(text, remove_stopwords = True):
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [9]:
# Clean the summaries and texts
clean_summaries = []
for summary in reviews.Summary[0:5000]:
    clean_summaries.append(clean_text(summary))
print("Summaries are complete.")

clean_texts = []
for text in reviews.Text[0:5000]:
    clean_texts.append(clean_text(text))
print("Texts are complete.")

Summaries are complete.
Texts are complete.


In [10]:
# Inspect the cleaned summaries and texts to ensure they have been cleaned well
for i in range(5):
    print("Clean Review #",i+1)
    print(clean_summaries[i])
    print(clean_texts[i])
    print()

Clean Review # 1
good quality dog food
bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better

Clean Review # 2
advertised
product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted sure error vendor intended represent product jumbo

Clean Review # 3
delight says
confection around centuries light pillowy citrus gelatin nuts case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat familiar story c lewis lion witch wardrobe treat seduces edmund selling brother sisters witch

Clean Review # 4
cough medicine
looking secret ingredient robitussin believe found got addition root beer extract ordered good made cherry soda flavor medicinal

Clean Review # 5
great taffy
great taffy great price wide assortment yummy taffy delivery quick taffy lover deal



In [11]:
vocab_words = []
def count_words( text):
    for sentence in text:
        for word in sentence.split():
            if word not in vocab_words:
                vocab_words.append(word)

In [12]:
count_words(clean_summaries)
count_words(clean_texts)

In [13]:
vocab_words

['good',
 'quality',
 'dog',
 'food',
 'advertised',
 'delight',
 'says',
 'cough',
 'medicine',
 'great',
 'taffy',
 'nice',
 'expensive',
 'brands',
 'wonderful',
 'tasty',
 'yay',
 'barley',
 'healthy',
 'best',
 'hot',
 'sauce',
 'world',
 'cats',
 'love',
 'diet',
 'better',
 'regular',
 'fans',
 'new',
 'fresh',
 'greasy',
 'strawberry',
 'twizzlers',
 'yummy',
 'lots',
 'expect',
 'poor',
 'taste',
 'sweet',
 'candy',
 'home',
 'delivered',
 'twizlers',
 'always',
 'delicious',
 'product',
 'please',
 'sell',
 'mexico',
 'nasty',
 'flavor',
 'bargain',
 'price',
 'machine',
 'instant',
 'oatmeals',
 'irish',
 'oatmeal',
 'hurry',
 'satisfying',
 'gluten',
 'free',
 'way',
 'start',
 'day',
 'wife',
 'favorite',
 'breakfast',
 'would',
 'buy',
 'mcanns',
 'tastes',
 'lovers',
 'convenience',
 'hearty',
 'mushy',
 'next',
 'time',
 'order',
 'variety',
 'pack',
 'stuff',
 'like',
 'came',
 'back',
 'go',
 'nuts',
 'ass',
 'kickin',
 'peanuts',
 'roasts',
 'smooth',
 'brew',
 'gues

In [14]:
special_words = ["<PAD>", "<UNK>", "<GO>", "<EOS>"]
vocab_words = special_words + vocab_words
vocab2id = {word: i for i, word in enumerate(vocab_words)}
id2vocab = {i: word for i, word in enumerate(vocab_words)}

In [15]:
vocab2id 

{'<PAD>': 0,
 '<UNK>': 1,
 '<GO>': 2,
 '<EOS>': 3,
 'good': 4,
 'quality': 5,
 'dog': 6,
 'food': 7,
 'advertised': 8,
 'delight': 9,
 'says': 10,
 'cough': 11,
 'medicine': 12,
 'great': 13,
 'taffy': 14,
 'nice': 15,
 'expensive': 16,
 'brands': 17,
 'wonderful': 18,
 'tasty': 19,
 'yay': 20,
 'barley': 21,
 'healthy': 22,
 'best': 23,
 'hot': 24,
 'sauce': 25,
 'world': 26,
 'cats': 27,
 'love': 28,
 'diet': 29,
 'better': 30,
 'regular': 31,
 'fans': 32,
 'new': 33,
 'fresh': 34,
 'greasy': 35,
 'strawberry': 36,
 'twizzlers': 37,
 'yummy': 38,
 'lots': 39,
 'expect': 40,
 'poor': 41,
 'taste': 42,
 'sweet': 43,
 'candy': 44,
 'home': 45,
 'delivered': 46,
 'twizlers': 47,
 'always': 48,
 'delicious': 49,
 'product': 50,
 'please': 51,
 'sell': 52,
 'mexico': 53,
 'nasty': 54,
 'flavor': 55,
 'bargain': 56,
 'price': 57,
 'machine': 58,
 'instant': 59,
 'oatmeals': 60,
 'irish': 61,
 'oatmeal': 62,
 'hurry': 63,
 'satisfying': 64,
 'gluten': 65,
 'free': 66,
 'way': 67,
 'start': 6

In [16]:
source_data  = []
target_data = []
for w in clean_texts :
    words = w.strip().split()
    source_data.append(words)

In [17]:
for w in clean_summaries :
    words = w.strip().split()
    target_data.append(words)


In [18]:
target_data

[['good', 'quality', 'dog', 'food'],
 ['advertised'],
 ['delight', 'says'],
 ['cough', 'medicine'],
 ['great', 'taffy'],
 ['nice', 'taffy'],
 ['great', 'good', 'expensive', 'brands'],
 ['wonderful', 'tasty', 'taffy'],
 ['yay', 'barley'],
 ['healthy', 'dog', 'food'],
 ['best', 'hot', 'sauce', 'world'],
 ['cats', 'love', 'diet', 'food', 'better', 'regular', 'food'],
 ['cats', 'fans', 'new', 'food'],
 ['fresh', 'greasy'],
 ['strawberry', 'twizzlers', 'yummy'],
 ['lots', 'twizzlers', 'expect'],
 ['poor', 'taste'],
 ['love'],
 ['great', 'sweet', 'candy'],
 ['home', 'delivered', 'twizlers'],
 ['always', 'fresh'],
 ['twizzlers'],
 ['delicious', 'product'],
 ['twizzlers'],
 ['please', 'sell', 'mexico'],
 ['twizzlers', 'strawberry'],
 ['nasty', 'flavor'],
 ['great', 'bargain', 'price'],
 ['yummy'],
 ['best', 'hot', 'sauce', 'world'],
 ['great', 'machine'],
 ['taste'],
 ['best', 'instant', 'oatmeals'],
 ['good', 'instant'],
 ['great', 'irish', 'oatmeal', 'hurry'],
 ['satisfying'],
 ['love', 'glu

In [19]:
def process_data_index(datas,vocab2id) :
    data_indexs = []
    for words in datas:
        line_index = [vocab2id[w] if w in vocab2id else vocab2id["<UNK>"] for w in words]
        data_indexs.append(line_index)
    return data_indexs

In [20]:
target_data_ids = process_data_index(target_data, vocab2id)
source_data_ids = process_data_index(source_data, vocab2id)

In [21]:
print("vocab test: ", [id2vocab[i] for i in range(11)])
print("source test: ", source_data[2])
print("source index: ", source_data_ids[2])
print("target test: ", target_data[2])
print("target index: ", target_data_ids[2])

vocab test:  ['<PAD>', '<UNK>', '<GO>', '<EOS>', 'good', 'quality', 'dog', 'food', 'advertised', 'delight', 'says']
source test:  ['confection', 'around', 'centuries', 'light', 'pillowy', 'citrus', 'gelatin', 'nuts', 'case', 'filberts', 'cut', 'tiny', 'squares', 'liberally', 'coated', 'powdered', 'sugar', 'tiny', 'mouthful', 'heaven', 'chewy', 'flavorful', 'highly', 'recommend', 'yummy', 'treat', 'familiar', 'story', 'c', 'lewis', 'lion', 'witch', 'wardrobe', 'treat', 'seduces', 'edmund', 'selling', 'brother', 'sisters', 'witch']
source index:  [3054, 532, 3055, 282, 3056, 3057, 3058, 91, 318, 3059, 3027, 1200, 3060, 3061, 3062, 2102, 384, 1200, 1723, 904, 1967, 952, 498, 1832, 38, 484, 3063, 3064, 674, 3065, 3066, 3067, 3068, 484, 3069, 3070, 3071, 3072, 3073, 3067]
target test:  ['delight', 'says']
target index:  [9, 10]


In [22]:
def process_input_data(source_data_ids, target_indexs, vocab2id):
    source_inputs = []
    decoder_inputs, decoder_outputs = [], []
    for source, target in zip(source_data_ids, target_indexs):
        source_inputs.append([vocab2id["<GO>"]] + source + [vocab2id["<EOS>"]])
        decoder_inputs.append([vocab2id["<GO>"]] + target)
        decoder_outputs.append(target + [vocab2id["<EOS>"]])
    return source_inputs, decoder_inputs, decoder_outputs

source_input_ids, target_input_ids, target_output_ids = process_input_data(source_data_ids, target_data_ids, vocab2id)

In [23]:

print("encoder inputs: ", source_input_ids[1])
print("decoder inputs: ", target_input_ids[:2])
print("decoder outputs: ", target_output_ids[:2])

encoder inputs:  [2, 50, 287, 3049, 3050, 650, 94, 94, 1248, 349, 222, 641, 411, 3051, 1212, 3052, 3053, 50, 3050, 3]
decoder inputs:  [[2, 4, 5, 6, 7], [2, 8]]
decoder outputs:  [[4, 5, 6, 7, 3], [8, 3]]


In [25]:
maxlen = 10
source_input_ids = keras.preprocessing.sequence.pad_sequences(source_input_ids, padding='post', maxlen=20)
target_input_ids = keras.preprocessing.sequence.pad_sequences(target_input_ids, padding='post',  maxlen=maxlen)
target_output_ids = keras.preprocessing.sequence.pad_sequences(target_output_ids, padding='post',  maxlen=maxlen)
print(source_input_ids[:4])
print(target_input_ids[:10])
print(target_output_ids[:10])



[[   6    7 2072  778    4    5   50  213   87 3046 1734  836 1073   30
  3047 2973 3048   50   30    3]
 [   2   50  287 3049 3050  650   94   94 1248  349  222  641  411 3051
  1212 3052 3053   50 3050    3]
 [ 952  498 1832   38  484 3063 3064  674 3065 3066 3067 3068  484 3069
  3070 3071 3072 3073 3067    3]
 [   2  319  808 1263 3074  705  778  794 2101 3075 1316 1383 1338    4
   743 1176  960   55 1294    3]]
[[ 2  4  5  6  7  0  0  0  0  0]
 [ 2  8  0  0  0  0  0  0  0  0]
 [ 2  9 10  0  0  0  0  0  0  0]
 [ 2 11 12  0  0  0  0  0  0  0]
 [ 2 13 14  0  0  0  0  0  0  0]
 [ 2 15 14  0  0  0  0  0  0  0]
 [ 2 13  4 16 17  0  0  0  0  0]
 [ 2 18 19 14  0  0  0  0  0  0]
 [ 2 20 21  0  0  0  0  0  0  0]
 [ 2 22  6  7  0  0  0  0  0  0]]
[[ 4  5  6  7  3  0  0  0  0  0]
 [ 8  3  0  0  0  0  0  0  0  0]
 [ 9 10  3  0  0  0  0  0  0  0]
 [11 12  3  0  0  0  0  0  0  0]
 [13 14  3  0  0  0  0  0  0  0]
 [15 14  3  0  0  0  0  0  0  0]
 [13  4 16 17  3  0  0  0  0  0]
 [18 19 14  3  0 

In [26]:
class Encoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super(Encoder, self).__init__()
        # Embedding Layer
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        # Encode LSTM Layer
        self.encoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True, name="encode_lstm")
        
    def call(self, inputs):
        encoder_embed = self.embedding(inputs)
        encoder_outputs, state_h, state_c = self.encoder_lstm(encoder_embed)
        return encoder_outputs, state_h, state_c

In [27]:
class Decoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super(Decoder, self).__init__()
        # Embedding Layer
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        # Decode LSTM Layer
        self.decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True, name="decode_lstm")
        # Attention Layer
        self.attention = Attention()
    
    def call(self, enc_outputs, dec_inputs, states_inputs):
        decoder_embed = self.embedding(dec_inputs)
        dec_outputs, dec_state_h, dec_state_c = self.decoder_lstm(decoder_embed, initial_state=states_inputs)
        attention_output = self.attention([dec_outputs, enc_outputs])
        
        return attention_output, dec_state_h, dec_state_c

In [28]:
def Seq2Seq(maxlen, embedding_dim, hidden_units, vocab_size):
    """
    seq2seq model
    """
    # Input Layer
    encoder_inputs = Input(shape=(maxlen,), name="encode_input")
    decoder_inputs = Input(shape=(None,), name="decode_input")
    # Encoder Layer
    encoder = Encoder(vocab_size, embedding_dim, hidden_units)
    enc_outputs, enc_state_h, enc_state_c = encoder(encoder_inputs)
    dec_states_inputs = [enc_state_h, enc_state_c]
    # Decoder Layer
    decoder = Decoder(vocab_size, embedding_dim, hidden_units)
    attention_output, dec_state_h, dec_state_c = decoder(enc_outputs, decoder_inputs, dec_states_inputs)
    # Dense Layer
    dense_outputs = Dense(vocab_size, activation='softmax', name="dense")(attention_output)
    # seq2seq model
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=dense_outputs)
    
    return model

In [29]:
K.clear_session()

maxlen = 10
embedding_dim = 50
hidden_units = 128
vocab_size = len(vocab2id)

model = Seq2Seq(maxlen, embedding_dim, hidden_units, vocab_size)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encode_input (InputLayer)       [(None, 10)]         0                                            
__________________________________________________________________________________________________
encoder (Encoder)               ((None, 10, 128), (N 810898      encode_input[0][0]               
__________________________________________________________________________________________________
decode_input (InputLayer)       [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder (Decoder)               ((None, None, 128),  810898      encoder[0][0]                    
______________________________________________________________________________________________

In [None]:
epochs = 49
batch_size = 32
val_rate = 0.2

loss_fn = keras.losses.SparseCategoricalCrossentropy()
model.compile(loss=loss_fn, optimizer='adam')
model.fit([source_input_ids, target_input_ids], target_output_ids, 
          batch_size=batch_size, epochs=epochs, validation_split=val_rate)

Epoch 1/49
Epoch 2/49
Epoch 3/49
  2/125 [..............................] - ETA: 18s - loss: 7.7018

In [48]:
model.save_weights("data/seq2seq_attention_weights.h5")

In [62]:
def encoder_infer(model):
    encoder_model = Model(inputs=model.get_layer('encoder').input, 
                        outputs=model.get_layer('encoder').output)
    return encoder_model

encoder_model = encoder_infer(model)
print(encoder_model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encode_input (InputLayer)    [(None, 10)]              0         
_________________________________________________________________
encoder (Encoder)            ((None, 10, 128), (None,  403998    
Total params: 403,998
Trainable params: 403,998
Non-trainable params: 0
_________________________________________________________________
None


In [63]:
def decoder_infer(model, encoder_model):
    encoder_output = encoder_model.get_layer('encoder').output[0]
    maxlen, hidden_units = encoder_output.shape[1:]
    
    dec_input = model.get_layer('decode_input').input
    enc_output = Input(shape=(maxlen, hidden_units), name='enc_output')
    dec_input_state_h = Input(shape=(hidden_units,), name='input_state_h')
    dec_input_state_c = Input(shape=(hidden_units,), name='input_state_c')
    dec_input_states = [dec_input_state_h, dec_input_state_c]

    decoder = model.get_layer('decoder')
    dec_outputs, out_state_h, out_state_c = decoder(enc_output, dec_input, dec_input_states)
    dec_output_states = [out_state_h, out_state_c]

    decoder_dense = model.get_layer('dense')
    dense_output = decoder_dense(dec_outputs)

    decoder_model = Model(inputs=[enc_output, dec_input, dec_input_states], 
                          outputs=[dense_output]+dec_output_states)
    return decoder_model

decoder_model = decoder_infer(model, encoder_model)
print(decoder_model.summary())

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
enc_output (InputLayer)         [(None, 10, 128)]    0                                            
__________________________________________________________________________________________________
decode_input (InputLayer)       [(None, None)]       0                                            
__________________________________________________________________________________________________
input_state_h (InputLayer)      [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_state_c (InputLayer)      [(None, 128)]        0                                            
____________________________________________________________________________________________

In [64]:
import numpy as np

maxlen = 10

def infer_predict(input_text, encoder_model, decoder_model):
    text_words = input_text.split()[:maxlen]
    input_id = [vocab2id[w] if w in vocab2id else vocab2id["<UNK>"] for w in text_words]
    input_id = [vocab2id["<GO>"]] + input_id + [vocab2id["<EOS>"]]
    if len(input_id) < maxlen:
        input_id = input_id + [vocab2id["<PAD>"]] * (maxlen-len(input_id))

    input_source = np.array([input_id])
    input_target = np.array([vocab2id["<GO>"]])
    
    # 编码器encoder预测输出
    enc_outputs, enc_state_h, enc_state_c = encoder_model.predict([input_source])
    dec_inputs = input_target
    dec_states_inputs = [enc_state_h, enc_state_c]

    result_id = []
    result_text = []
    for i in range(maxlen):
        # 解码器decoder预测输出
        dense_outputs, dec_state_h, dec_state_c = decoder_model.predict([enc_outputs, dec_inputs]+dec_states_inputs)
        pred_id = np.argmax(dense_outputs[0][0])
        result_id.append(pred_id)
        result_text.append(id2vocab[pred_id])
        if id2vocab[pred_id] == "<EOS>":
            break
        dec_inputs = np.array([[pred_id]])
        dec_states_inputs = [dec_state_h, dec_state_c]
    return result_id, result_text

In [66]:
input_text = "bought several vitali "
result_id, result_text = infer_predict(input_text, encoder_model, decoder_model)

print("Input: ", input_text)
print("Output: ", result_text, result_id)

Input:  bought several vitali 
Output:  ['<EOS>'] [3]
