In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon
import numpy as np
import re
import itertools
from collections import Counter
from sklearn import preprocessing
import pandas as pd
import os
import pickle
import boto3
from sagemaker import get_execution_role

# Look up the SageMaker execution role and name the bucket that holds the model artifacts.
role = get_execution_role()
bucket = 'sagemaker-nomadiq-data'

# Download the pickled vocabulary/label metadata and the saved network
# parameters, storing each one locally under its S3 object key.
s3 = boto3.resource('s3')
for artifact in ('vocab_labels.pickle', 'testnet.params'):
    s3.Bucket(bucket).download_file(artifact, artifact)

# All NDArrays and the network live on the CPU.
ctx = mx.cpu()

# Loading saved parameters for forward pass
We redefine our architecture and load parameters with Gluon's load_parameters method. Now we're ready to make predictions!

In [2]:
# Restore the objects stored in vocab_labels.pickle (a 6-tuple).
# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# file must only ever come from a trusted bucket.
with open(r'vocab_labels.pickle', 'rb') as f:
    (vocabulary2,
     vocab_size2,
     sentence_size2,
     labelencoder2,
     unique_labels2,
     test_accuracy_list2) = pickle.load(f)

In [20]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    The markup tags b, br, p, ul and li (opening and closing) are removed
    first; afterwards every character outside letters, digits and ( ) , ! ? ' `
    becomes a space, contraction suffixes and punctuation are split off into
    separate tokens, runs of whitespace are collapsed and the text is
    lower-cased.

    Args:
        string: raw text to clean.

    Returns:
        The cleaned, lower-cased text with tokens separated by single spaces.
    """
    # Tags must be removed before the character filter: that filter turns
    # "<", ">" and "/" into spaces, after which a pattern such as "<b>" can
    # never match and the tag name would survive as a stray token.
    string = re.sub(r"</?(?:br|b|p|ul|li)>", " ", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Detach contraction suffixes so they become tokens of their own.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)

    # Surround punctuation with spaces. The backslash in front of the brackets
    # and the question mark is what the original substitutions emitted; it is
    # kept so the tokens produced for untagged text do not change.
    for mark, spaced in ((",", " , "), ("!", " ! "),
                         ("(", " \\( "), (")", " \\) "), ("?", " \\? ")):
        string = string.replace(mark, spaced)

    # Collapse whitespace last so the gaps left by removed tags are squeezed too.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_data_and_labels(string):
    """
    Cleans and tokenises a collection of raw texts.

    Every element of `string` is passed through clean_str and the cleaned
    text is then split on single spaces.

    Returns a list with one token list per input element.
    """
    return [clean_str(raw_text).split(" ") for raw_text in string]

def pad_sentences_oos(sentences, padding_word="</s>", sequence_length=None):
    """
    Pads or truncates every tokenised sentence to one fixed length.

    Shorter sentences are filled up at the end with `padding_word`; longer
    sentences are cut off after `sequence_length` tokens so that every row has
    the same width (without the cut the rows would be ragged and could not be
    stacked into a rectangular array).

    Args:
        sentences: list of token lists.
        padding_word: token appended to short sentences.
        sequence_length: target length; defaults to the module-level
            `sentence_size2` when not given.

    Returns:
        A new list of token lists, each exactly `sequence_length` long.
    """
    if sequence_length is None:
        sequence_length = sentence_size2

    padded_sentences = []
    for sentence in sentences:
        clipped = sentence[:sequence_length]
        num_padding = sequence_length - len(clipped)
        padded_sentences.append(clipped + [padding_word] * num_padding)
    return padded_sentences

def fit_input_data(sentences, vocabulary):
    """
    Maps tokenised sentences to rows of vocabulary indices.

    Args:
        sentences: list of token lists (all of the same length if a
            rectangular array is wanted).
        vocabulary: mapping from token to integer index (e.g. a dict).

    Returns:
        A numpy array with one row of indices per sentence. Tokens missing
        from `vocabulary` are encoded as 0.
    """
    # Look tokens up in the mapping that was passed in rather than in a
    # module-level global, and only treat "token not found" as unknown
    # instead of hiding every possible error behind a bare except.
    return np.array([[vocabulary.get(word, 0) for word in sentence]
                     for sentence in sentences])

print("Loads and preprocessed data for the dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary.")
# Load and preprocess data
# A sample query is defined here so the cell runs on a fresh kernel; before,
# `sentences` only existed if the interactive input cell had been run first.
input_sent = ['surfing beach is sunny and beautiful today']
sentences = load_data_and_labels(input_sent)
sentences_padded = pad_sentences_oos(sentences)
# vocabulary, vocabulary_inv = build_vocab(sentences_padded)
x = fit_input_data(sentences_padded, vocabulary2)

print('data shape:', x.shape)

Loads and preprocessed data for the dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary.
data shape: (1, 461)


# Get recommendations based on string input

In [16]:
# Prompt for one free-form description; it is wrapped in a list because the
# preprocessing helpers iterate over a collection of texts.
text_string = [input("What type of vacation are you interested in? (Free-form entry): ")]

# Take input and make prediction

print("Loads and preprocessed data for the dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary.")
# Clean and tokenise the text, pad the token lists, then encode them as vocabulary indices.
sentences = load_data_and_labels(text_string)
sentences_padded = pad_sentences_oos(sentences)
# vocabulary, vocabulary_inv = build_vocab(sentences_padded)
x = fit_input_data(sentences_padded, vocabulary2)

print('data shape:', x.shape)

# Hyper-parameters of the network; net2 is meant to be a copy of the original
# architecture, so these have to agree with the saved parameter file.
batch_size = 50
embed_size = 300
num_fc = 256
num_filters = 50
filter_size = 3
vocab_size = vocab_size2
unique_labels = unique_labels2  # used as the width of the output layer

# New input data to predict on (order preserved, no shuffling)
prod_data = gluon.data.DataLoader(
    gluon.data.ArrayDataset(x),
    batch_size=batch_size,
    shuffle=False,
)

# Net2 is a copy of the original architecture
net2 = gluon.nn.Sequential()
with net2.name_scope():
    # Layers are created inside the name scope, in stacking order.
    layers = [
        gluon.nn.Embedding(vocab_size, embed_size),
        gluon.nn.Conv1D(channels=num_filters, kernel_size=filter_size, activation='relu'),
        gluon.nn.MaxPool1D(pool_size=2, strides=2),
        gluon.nn.Dropout(.5),
        gluon.nn.Flatten(),
        gluon.nn.Dense(num_fc, activation='relu'),
        gluon.nn.Dropout(.5),
        gluon.nn.Dense(unique_labels),
    ]
    for layer in layers:
        net2.add(layer)

print(net2)
# Loading the previously saved training parameters
net2.load_parameters(r'testnet.params', ctx=ctx)

# Forward pass: for every input row collect the ids of the `top_k` classes with
# the highest softmax probability (best first) and the highest probability itself.
top_k = 5
prediction_list = []   # one integer array of `top_k` class ids per input row
probabilities = []     # one NDArray per batch holding each row's highest probability
for data in prod_data:
    data = data.as_in_context(ctx)
    output = net2(data).softmax()
    # Use the NDArray's own reduction instead of relying on NumPy to
    # dispatch np.amax on an MXNet array.
    max_prob = output.max(axis=1)
    probabilities.append(max_prob)
    # The argsort result is reversed to put the best class first, as before.
    # Every row of the batch is ranked and the results are accumulated, so
    # earlier batches are no longer overwritten by later ones.
    ranked = nd.argsort(output, axis=1).asnumpy().astype("int")
    prediction_list.extend(ranked[:, ::-1][:, :top_k])

# Decode class ids to labels. inverse_transform is given a flat 1-D integer
# array per row, so the result is a flat array of labels regardless of how the
# installed scikit-learn version treats column-shaped input.
transformed_prediction = [labelencoder2.inverse_transform(class_ids) for class_ids in prediction_list]
# Transform to numpy float arrays (vs. ndarray) to apply to the input dataframe
prob2 = [prob.astype("float32").asnumpy() for prob in probabilities]

flat_predictions = [label for labels in transformed_prediction for label in labels]
flat_prob = [item for sublist in prob2 for item in sublist]
print("Here is your recommended destination: ")
for labels in transformed_prediction:
    # `rank`/`destination` are used as loop names so the input array `x` is
    # not overwritten by the printing loop.
    for rank, destination in enumerate(labels):
        print(str(rank)+":", destination)


What type of vacation are you interested in? (Free-form entry):  fishing 


Loads and preprocessed data for the dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary.
data shape: (1, 461)
Sequential(
  (0): Embedding(100710 -> 300, float32)
  (1): Conv1D(None -> 50, kernel_size=(3,), stride=(1,))
  (2): MaxPool1D(size=(2,), stride=(2,), padding=(0,), ceil_mode=False)
  (3): Dropout(p = 0.5, axes=())
  (4): Flatten
  (5): Dense(None -> 256, Activation(relu))
  (6): Dropout(p = 0.5, axes=())
  (7): Dense(None -> 200, linear)
)
Here is your recommended destination: 
0: Bangkok, Thailand
1: Havana, Cuba
2: London, United Kingdom
3: Los Angeles, California
4: Thailand


In [142]:
# Scratch check: first five entries of the argsort of row 0 of `output`
# (the softmax result left over from the last batch of the prediction cell).
# NOTE(review): the prediction cell reverses this ordering before taking five
# entries, so this un-reversed slice presumably shows the lowest-scoring
# classes rather than the recommendations — confirm intent or remove the cell.
nd.argsort(output,axis = 1)[0][:5]

# import numpy as np
# x = np.arange(10)
# print("Original array:")
# print(x)
# np.random.shuffle(x)
# n = 1
# print (x[np.argsort(x)[-n:]])


[  54.   53.  134.   69.  178.]
<NDArray 5 @cpu(0)>

In [None]:
# # Pandas dataframe that displays input data and predicted ITK ("prediction") and softmax probability of those predictions ("probability") 
# pd.set_option('display.max_colwidth', -1)

# df = pd.read_csv('/tmp/eider-user/userfile/'+username+'/itk_data_oos2.csv', encoding = 'latin-1')

# # Run the entire dataset and save results to S3
# df['prediction'] = flat_predictions
# df['probability'] = flat_prob
# df.to_csv('/tmp/eider-user/userfile/'+username+'/results.csv',encoding = 'latin-1')
# s3.Bucket('itk-model-data').upload_file('/tmp/eider-user/userfile/'+username+'/results.csv', 'results.csv')
 
# # Print a sample of predictions
# df[:50]

- Balance out all data (by doubling it)
- Tripadvisor data
- Reddit (open API)
- Yelp