# Data Loading

In [1]:
import pickle
import pandas as pd
import json

### Load clean, processed data

In [3]:
df = pd.read_pickle("df3")

In [4]:
df.shape

(111009, 10)

In [5]:
## array of strings, 1 x n 
descriptions = df["description"].tolist()

## array of ints, 1 x n
prices = df["price"].tolist()

## array of ints, 1 x n
points = df["points"].tolist()

# Sentiment Analysis Feature Transformation

In [None]:
import nltk
import numpy as np

In [None]:
## must download vader_lexicon for vader sentiment algorithm
nltk.download_shell()

### Sentiment analysis algorithms. Each takes a single string as input.

In [None]:
def createDict():
    sentiment_dictionary = {}
    for line in open('afinn_dict.txt'):
        word,score = line.split('\t')
        sentiment_dictionary[word] = int(score)
    return sentiment_dictionary
sentiment_dictionary = createDict()

def sentimentAfinn(sentence):
    '''
    AFINN is a dictionary of polarity scores [-5,5] by word.
    This algorithm sums the scores for each word in the sentence, then
    classifies the entire sentence based on the sum's sign.
    '''
    sentence_tokens = sentence.split(' ')
    score = 0
    for token in sentence_tokens:
        score += sentiment_dictionary.get(token,0)
    ## if we want raw integer, not scaled
    return score
    ## if we want it to be scaled between -1 and 1
    # return np.sign(score)

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentim_int_analyzer = SentimentIntensityAnalyzer()
def sentimentVader(sentence):    
    '''
    Sentiment object contains { 'pos', 'neg', 'neu', 'compound' } where pos+neg+neu=1, compound is [-1,1].
    This algorithm returns the 'pos', 'neu', 'neg' values
    '''
    ss = sentim_int_analyzer.polarity_scores(sentence)
    return [ss['pos'], ss['neu'], ss['neg']]

### Create sentiment feature vectors.

In [None]:
sentiments_sums = list(map(sentimentAfinn,descriptions))

In [None]:
sentiments_probs = list(map(sentimentVader,descriptions))

In [None]:
sentiments_negs = sentiment_probs[:,1].tolist()

### Saving the transformed feature vectors to file.

In [None]:
with open('sentiment_sums.json', 'w') as outfile:
    json.dump(sentiments_sums, outfile)

In [None]:
with open('sentiment_probabilities.json', 'w') as outfile:
    json.dump(sentiments_probs, outfile)

In [None]:
with open('sentiment_probabilities_negative.json', 'w') as outfile:
    json.dump(sentiments_negs, outfile)

### Graphical analysis.

In [None]:
import matplotlib.pyplot as plt

In [None]:
with open('sentiment_sums.json') as json_data:
    sentiment_sums = json.load(json_data)

In [None]:
with open('sentiment_probabilities.json') as json_data:
    sentiment_probs = json.load(json_data)

In [None]:
sentiment_probs = np.matrix(sentiment_probs)

In [None]:
# plotting sentiment sums against points
plt.plot(sentiment_sums, points, 'ro')
plt.show()

In [None]:
# plotting sentiment sums against prices
plt.plot(sentiment_sums, prices, 'ro')
plt.show()

In [None]:
# plotting positive probabilities against points
plt.plot(sentiment_probs[:,0], points, 'ro')
plt.show()

In [None]:
# plotting positive probabilities against price
plt.plot(sentiment_probs[:,0], prices, 'ro')
plt.show()

In [None]:
# plotting neutral probabilities against points
plt.plot(sentiment_probs[:,1], points, 'ro')
plt.show()

In [None]:
# plotting neutral probabilities against prices
plt.plot(sentiment_probs[:,1], prices, 'ro')
plt.show()

In [None]:
# plotting negative probabilities against points
plt.plot(sentiment_probs[:,2], points, 'ro')
plt.xlabel('probability that description has negative sentiment')
plt.ylabel('points')
plt.show()

In [None]:
# plotting negative probabilities against price
plt.plot(sentiment_probs[:,2], prices, 'ro')
plt.xlabel('probability that description has negative sentiment')
plt.ylabel('price')
plt.show()

# Neural Network

In [6]:
import numpy as np
import gensim
import nltk
import string

Using TensorFlow backend.


In [7]:
## must download stopwords for word embeddings
nltk.download_shell()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> stopwords
    Downloading package stopwords to C:\Users\cit-
        labs\AppData\Roaming\nltk_data...
      Unzipping corpora\stopwords.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [87]:
MAX_SEQUENCE_LENGTH = 20
embedding_depth = 300

### Word Embedding

In [9]:
# will remove stopwords from sentences before word embedding transformation
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')
stopwords.remove('not')
stopwords.remove('no')

In [10]:
tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [16]:
# mapping 3 billion words to embedded vectors, obtained from Google
embedding_dict = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [11]:
def normalize(word):
    """
    Returns a lower cased version of a word and removes
    punctuation.
    """
    return word.lower().strip(string.punctuation)
    
def tokenize(sentence):
    '''
    Converts string sentence into list of words. Strips punctuation,
    removes long words, makes lower case.
    '''
    tokens = []
    for token in tokenizer.tokenize(sentence):
        word = normalize(token)
        if word not in stopwords and len(word)<MAX_SEQUENCE_LENGTH:
            tokens.append(word)
    return tokens

def to_vector(word):
    """
    Convert a word to vector if we have a vector representation.
    """
    if word in embedding_dict:
        return embedding_dict[word]
    else:
        return np.zeros(300, dtype=float)

def sentence_to_vec(sentence):
    '''
    Converts sentence into a float matrix = list of word vectors.
    '''
    import numpy as np
    z = np.zeros([300,])
    sentence_matrix = [to_vector(token) for token in tokenize(sentence)]
    padding = MAX_SEQUENCE_LENGTH - len(sentence_matrix)
    if padding >= 0:
        for i in range(0,padding):
            sentence_matrix.append(z)
    else:
        sentence_matrix = sentence_matrix[:MAX_SEQUENCE_LENGTH]
    return np.array(sentence_matrix)

def sentences_to_vecs(sentences):
    '''
    Converts list of sentences into list of sentence embeddings, 
    each embedding is MAX_SEQUENCE_LENGTH x 300.
    '''
    training_data = [sentence_to_vec(sentence) for sentence in sentences]
    return np.array(training_data)

In [36]:
# map sentences from wine descriptions to word embedding vectors
embedded_sentences = sentences_to_vecs(descriptions)

In [38]:
# dimensions of each embedded sentences should be 20 x 300
print(embedded_sentences.shape)

(111009, 20, 300)


In [None]:
np.save('embedded.npy', embedded_sentences) 

In [6]:
embedded_sentences = np.load('embedded.npy')

### Train/Test split

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
train_x, test_x, train_y, test_y = train_test_split(embedded_sentences, prices, test_size=0.1, random_state=0)

### Train the Network

In [83]:
from keras.models import Sequential, Model
from keras.layers import Activation, Dot, Embedding, Conv1D, MaxPooling1D, Merge, Highway, LSTM, Dense, Dropout, Reshape, ActivityRegularization, Input
from keras.optimizers import SGD, RMSprop
from keras import regularizers
from keras.constraints import non_neg
from keras.initializers import TruncatedNormal

def descriptions_to_price(embedding_depth, max_seq_len):
    '''
    Returns a model for sentiment analysis algorithm weight learning.
    Model inputs: array of sentence vectors (dimensions: sentences x 22 x 300), 
                  array of sentiment guesses for each algorithm (dimensions: sentences x algorithms x 3)
          outputs: array of sentiment answers (dimensions: sentences x 3)
    '''
    # input layers
    main_input = Input(shape=(max_seq_len,embedding_depth), dtype='float32', name='main_input')
    
    lstm = LSTM(300)
    lstm_main = lstm(main_input)
    
    dense_nn = Dense(1, activation='linear')
    dense_nn_main = dense_nn(lstm_main)

        
    # main model
    main_model = Model(inputs=[main_input], outputs=[dense_nn_main])
    op = RMSprop(lr=0.00007)
    main_model.compile(optimizer=op, loss='mean_absolute_error')
    
    return main_model

In [84]:
#build/compile model
model = descriptions_to_price(embedding_depth, MAX_SEQUENCE_LENGTH)

#view model summary
print (model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 20, 300)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 301       
Total params: 721,501
Trainable params: 721,501
Non-trainable params: 0
_________________________________________________________________
None


In [101]:
# train model on train set
history = model.fit([train_x], train_y, batch_size=128, epochs=20, validation_split=0.15)

Train on 84921 samples, validate on 14987 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Predicting Price

In [126]:
# use the model to make predictions on unseen data
predicted_prices = model.predict([test_x])
preds = predicted_prices.astype(type('float', (float,), {}))
preds2 = list(preds.reshape(-1,))

In [110]:
with open('nn_true.json', 'w') as outfile:
    json.dump(test_y, outfile)

In [131]:
with open('nn_predictions.json', 'w') as outfile:
    json.dump(preds2, outfile)

In [133]:
score = model.evaluate(test_x, test_y, batch_size = 128)



In [134]:
print(score)

14.1787605293
