# Predicting a response to customers' questions and comments

In [38]:
import re
import random
import time

import keras
import pandas as pd
import sklearn
import nltk
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import casual_tokenize

### Model parameters

In [2]:
MAX_VOCAB_SIZE = 2**13
MAX_MESSAGE_LEN = 30
UNK = 0
PAD = 1
START = 2
EMBEDDING_SIZE = 100
CONTEXT_SIZE = 100
BATCH_SIZE = 1
DROPOUT = 0.2
LEARNING_RATE=0.005

### Loading Data and Reshaping

In [3]:
tweets = pd.read_csv('data/twcs.csv')
tweets.head()
# in_response_to_tweet_id - this tweet is responding to a previous tweet with ID in the column
# response_tweet_id - this tweet is responded by a later with ID in the column
# inbound - the direction of customers to brands/companies (True)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [16]:
tweets.dtypes

tweet_id                     int64
author_id                   object
inbound                       bool
created_at                  object
text                        object
response_tweet_id           object
in_response_to_tweet_id    float64
dtype: object

In [4]:
# find the initial question/comment to the companies
first_inbound = tweets[pd.isnull(tweets.in_response_to_tweet_id) & tweets.inbound]

# join initial question with its response
inbounds_and_outbounds = pd.merge(first_inbound, tweets, left_on='tweet_id', 
                                  right_on='in_response_to_tweet_id').sample(frac=1)
inbounds_and_outbounds = inbounds_and_outbounds[inbounds_and_outbounds.inbound_y ^ True]

In [5]:
inbounds_and_outbounds.shape

(794299, 14)

amazon_firstround = inbounds_and_outbounds[inbounds_and_outbounds['text_x'].str.contains("@AmazonHelp")]

### Tokenizing and building vocab

In [172]:
# remove @ID tags
tags = re.compile('(\W@|^@)([a-zA-Z0-9_]+)')
x_text = inbounds_and_outbounds.text_x.apply(lambda txt: tags.sub('', txt).strip())
y_text = inbounds_and_outbounds.text_y.apply(lambda txt: tags.sub('', txt).strip())

In [182]:
np.save("data/x_text", x_text)
np.save("data/y_text", y_text)

In [178]:
x_text.apply(lambda text: len(text.split())).max()

111

In [73]:
y_text.apply(lambda text: len(text.split())).max()

56

In [179]:
# create a dictionary matching a unique number with a word
count_vec = CountVectorizer(tokenizer=casual_tokenize, max_features=MAX_VOCAB_SIZE - 3)
count_vec.fit(x_text + y_text)
analyzer = count_vec.build_analyzer()
vocab = {k: v + 3 for k, v in count_vec.vocabulary_.items()}
reverse_vocab = {v: k for k, v in vocab.items()}

In [174]:
MAX_VOCAB_SIZE - 3

8189

In [180]:
reverse_vocab

{3011: 'fyi',
 2297: 'do',
 4932: 'not',
 1322: 'buy',
 50: '#hulu',
 2908: 'for',
 6768: 'streaming',
 4339: 'live',
 7329: 'tv',
 121: '.',
 3662: 'i',
 3267: 'have',
 3841: 'internet',
 713: 'and',
 1382: 'cannot',
 1779: 'consistently',
 6767: 'stream',
 7774: 'without',
 1286: 'buffering',
 881: 'at',
 200: '1gb',
 4905: 'no',
 3: '!',
 7659: "we'd",
 1046: 'be',
 3244: 'happy',
 7178: 'to',
 3316: 'help',
 5113: 'out',
 7718: 'which',
 2158: 'device',
 815: 'are',
 7886: 'you',
 7486: 'using',
 412: '?',
 2309: 'does',
 7084: 'this',
 3238: 'happen',
 7770: 'with',
 421: 'a',
 6640: 'specific',
 1497: 'channel',
 3065: 'get',
 7052: 'that',
 2321: 'don',
 7985: '’',
 6907: 't',
 6630: 'spare',
 4447: 'lying',
 824: 'around',
 118: ',',
 1317: 'but',
 4844: 'need',
 1094: 'better',
 5376: 'plan',
 7032: 'than',
 2073: 'delaying',
 7472: 'us',
 186: '15',
 3443: 'hours',
 602: 'airport',
 6975: 'team',
 7058: 'their',
 1089: 'best',
 5110: 'our',
 1948: 'customers',
 7104: 'through

In [163]:
# creating sentence of equal length padded with 1 if length is shorter than 30
def to_word_idx(sentence):
    # from word to indices
    full_length = [vocab.get(tok, UNK) for tok in analyzer(sentence)] + [PAD] * MAX_MESSAGE_LEN
    return full_length[:MAX_MESSAGE_LEN]

def from_word_idx(word_idxs):
    # from indices to word
    return ' '.join(reverse_vocab[idx] for idx in word_idxs if idx > PAD).strip()

In [181]:
x = pd.np.vstack(x_text.apply(to_word_idx).values)
y = pd.np.vstack(y_text.apply(to_word_idx).values)

In [185]:
!pwd

/Users/QIAN/.Trash/Project_NLP


In [193]:
np.save("/Users/QIAN/Documents/GitHub/Project_NLP/data/x", x)
np.save("/Users/QIAN/Documents/GitHub/Project_NLP/data/y", y)

In [184]:
x[0]

array([3011, 2297, 4932, 1322,   50, 2908, 6768, 4339, 7329,  121, 3662,
       3267,    0, 3841,  713, 1382, 1779, 6767, 7774, 1286,  881,  200,
       2345,    1,    1,    1,    1,    1,    1,    1])

In [187]:
# Use simple random split of data - may cause class imbalance!
all_idx = list(range(len(x)))
train_idx = set(random.sample(all_idx, int(0.8 * len(all_idx))))
test_idx = {idx for idx in all_idx if idx not in train_idx}

train_x = x[list(train_idx)]
test_x = x[list(test_idx)]
train_y = y[list(train_idx)]
test_y = y[list(test_idx)]

assert train_x.shape == train_y.shape
assert test_x.shape == test_y.shape

print(f'Training data of shape {train_x.shape} and test data of shape {test_x.shape}.')

Training data of shape (635439, 30) and test data of shape (158860, 30).


In [196]:
np.save("/Users/QIAN/Documents/GitHub/Project_NLP/data/train_x", train_x)
np.save("/Users/QIAN/Documents/GitHub/Project_NLP/data/test_x", test_x)
np.save("/Users/QIAN/Documents/GitHub/Project_NLP/data/train_y", train_y)
np.save("/Users/QIAN/Documents/GitHub/Project_NLP/data/test_y", test_y)

train_x = np.load('data/train_x.npy')
train_y = np.load('data/train_y.npy')
test_x = np.load('data/test_x.npy')
test_y = np.load('data/test_y.npy')

In [84]:
train_y[0]

array([3427, 5254,    3, 4013, 7200, 7565, 7776, 7950, 5590, 3881,  108,
       7945, 3353, 3889, 1251, 7457, 7108, 3100, 7834, 5563, 7604, 5063,
       7950, 6183,  422,  429, 3947,    1,    1,    1])

### Model Creation

In [85]:
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Dense, Input, GRU, Dropout, Embedding, RepeatVector, concatenate, \
    TimeDistributed
from keras.utils import np_utils

In [86]:
def create_model():
    shared_embedding = Embedding(
        output_dim=EMBEDDING_SIZE,
        input_dim=MAX_VOCAB_SIZE,
        input_length=MAX_MESSAGE_LEN,
        name='embedding',
    )
    
    # ENCODER
    
    encoder_input = Input(
        shape=(MAX_MESSAGE_LEN,),
        dtype='int32',
        name='encoder_input',
    )
    
    embedded_input = shared_embedding(encoder_input)
    
    encoder_rnn = GRU(
        CONTEXT_SIZE,
        name='encoder',
        dropout=DROPOUT
    )
    
    context = RepeatVector(MAX_MESSAGE_LEN)(encoder_rnn(embedded_input))
    
    # DECODER
    
    last_word_input = Input(
        shape=(MAX_MESSAGE_LEN, ),
        dtype='int32',
        name='last_word_input',
    )
    
    embedded_last_word = shared_embedding(last_word_input)
    decoder_input = concatenate([embedded_last_word, context], axis=2)
    
    decoder_rnn = GRU(
        CONTEXT_SIZE,
        name='decoder',
        return_sequences=True,
        dropout=DROPOUT
    )
    
    decoder_output = decoder_rnn(decoder_input)
    
    next_word_dense = TimeDistributed(
        Dense(int(MAX_VOCAB_SIZE / 2), activation='relu'),
        name='next_word_dense',
    )(decoder_output)
    
    next_word = TimeDistributed(
        Dense(MAX_VOCAB_SIZE, activation='softmax'),
        name='next_word_softmax'
    )(next_word_dense)
    
    return Model(inputs=[encoder_input, last_word_input], outputs=[next_word])

s2s_model = create_model()
optimizer = Adam(lr=LEARNING_RATE, clipvalue=5.0)
s2s_model.compile(optimizer='adam', loss='categorical_crossentropy')

### Model training

In [87]:
def add_start_token(y_array):
    return np.hstack([
        START * np.ones((len(y_array), 1)),
        y_array[:, :-1],
    ])

def binarize_labels(labels):
    return np.array([np_utils.to_categorical(row, num_classes=MAX_VOCAB_SIZE)
                     for row in labels])

def respond_to(model, text):
    input_y = add_start_token(PAD * np.ones((1, MAX_MESSAGE_LEN)))
    idxs = np.array(to_word_idx(text)).reshape((1, MAX_MESSAGE_LEN))
    for position in range(MAX_MESSAGE_LEN - 1):
        prediction = model.predict([idxs, input_y]).argmax(axis=2)[0]
        input_y[:,position + 1] = prediction[position]
    return from_word_idx(model.predict([idxs, input_y]).argmax(axis=2)[0])

In [104]:
def train_mini_epoch(model, start_idx, end_idx):
    """ Batching seems necessary in Kaggle Jupyter Notebook environments, since
        `model.fit` seems to freeze on larger batches (somewhere 1k-10k).
    """
    b_train_y = binarize_labels(train_y[start_idx:end_idx])
    input_train_y = add_start_token(train_y[start_idx:end_idx])
    
    model.fit(
        [train_x[start_idx:end_idx], input_train_y], 
        b_train_y,
        epochs=1,
        batch_size=BATCH_SIZE,
    )
    
    rand_idx = random.sample(list(range(len(test_x))), SUB_BATCH_SIZE)
    print('Test results:', model.evaluate(
        [test_x[rand_idx], add_start_token(test_y[rand_idx])],
        binarize_labels(test_y[rand_idx])
    ))
    
    input_string = x_text.iloc[100]
    output_string = respond_to(model, input_string)
    print(f'> "{input_string}"\n< "{output_string}"')
    
    input_string = x_text.iloc[2500]
    output_string = respond_to(model, input_string)
    print(f'> "{input_string}"\n< "{output_string}"')

In [166]:
training_time_limit = 2*60 * 60  # seconds (notebooks terminate after 1 hour)
start_time = time.time()
stop_after = start_time + training_time_limit
SUB_BATCH_SIZE = 200

class TimesUpInterrupt(Exception):
    pass

try:
    for epoch in range(20):
        print(f'Training in epoch {epoch}...')
        for start_idx in range(0, len(train_x), SUB_BATCH_SIZE):
            train_mini_epoch(s2s_model, start_idx, start_idx + SUB_BATCH_SIZE)
            if time.time() > stop_after:
                raise TimesUpInterrupt
except KeyboardInterrupt:
    print("Halting training from keyboard interrupt.")
except TimesUpInterrupt:
    print(f"Halting after {time.time() - start_time} seconds spent training.")


Training in epoch 0...
Epoch 1/1
Test results: 3.3665725326538087
> "absolutely appalled that so far I’ve waited two days for two parcels to be delivered via #primedelivery - where are my items?!"
< "i'm sorry to hear this ! please contact us here : ^ sa"
> "why can’t i get into my amazon account? It says my password is incorrect and i know it’s correct i changed it 3 times now"
< "i'm sorry to hear this ! please contact us here : ^ sa"
Epoch 1/1
Test results: 3.2131352710723875
> "absolutely appalled that so far I’ve waited two days for two parcels to be delivered via #primedelivery - where are my items?!"
< "i'm sorry for the unexpected charge ! please reach out to us here : ^ wt"
> "why can’t i get into my amazon account? It says my password is incorrect and i know it’s correct i changed it 3 times now"
< "i'm sorry for the unexpected charge ! please don't provide your order details , we consider it personal information . our twitter page is visible to public . ^ mj"
Epoch 1/1
Test 

> "why can’t i get into my amazon account? It says my password is incorrect and i know it’s correct i changed it 3 times now"
< "i'm sorry for the delay . we'd like to help ! ^ ha"
Epoch 1/1
Test results: 2.744751009941101
> "absolutely appalled that so far I’ve waited two days for two parcels to be delivered via #primedelivery - where are my items?!"
< "i'm sorry for the trouble with your experience . please reach out to us via phone or chat here : https://t.co/vlvfjr4nn9 and we'll look into this right away . ^"
> "why can’t i get into my amazon account? It says my password is incorrect and i know it’s correct i changed it 3 times now"
< "i'm sorry for the trouble with your experience . we'd like to help ! without providing any account info , can you tell us more about what's going on ?"
Epoch 1/1
Test results: 2.752375202178955
> "absolutely appalled that so far I’ve waited two days for two parcels to be delivered via #primedelivery - where are my items?!"
< "i'm sorry for the delay 

> "why can’t i get into my amazon account? It says my password is incorrect and i know it’s correct i changed it 3 times now"
< "i'm sorry for the trouble with your order . please reach us here : ^ ac"
Epoch 1/1
Test results: 2.7252392387390136
> "absolutely appalled that so far I’ve waited two days for two parcels to be delivered via #primedelivery - where are my items?!"
< "i'm sorry for the trouble ! we'd like to help . please reach out to us here : https://t.co/vlvfjr4nn9 and we'll check . ^ gk"
> "why can’t i get into my amazon account? It says my password is incorrect and i know it’s correct i changed it 3 times now"
< "i'm sorry for the trouble ! we'd like to help . please reach out to us here : https://t.co/vlvfjr4nn9 and we'll get in touch with you . ^ gk"
Epoch 1/1
Test results: 2.655237817764282
> "absolutely appalled that so far I’ve waited two days for two parcels to be delivered via #primedelivery - where are my items?!"
< "i'm sorry for the trouble ! we'd like to help ! 

In [169]:
s2s_model.save('s2s_model.h5')  # creates a HDF5 file 'my_model.h5'
#del model  # deletes the existing model

#s2s_model = load_model('s2s_model.h5')

In [167]:
respond_to(s2s_model, x_text.iloc[100])

"i'm sorry for the delay ! please contact us here : https://t.co/vlvfjr4nn9 and we'll help you . ^ sq"

In [168]:
respond_to(s2s_model, x_text.iloc[2500])

"i'm sorry for the delay ! please contact us here : https://t.co/vlvfjr4nn9 and we'll help you . ^ sq"