In [1]:
import numpy as np
import pandas as pd
import six
import tensorflow as tf
import random
import string
import time
import os
from tqdm import tqdm

from sklearn.model_selection import train_test_split

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model, Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Flatten, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
#import keras

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Read data
DATA_PATH = 'data/outputs/name_price_desc.csv'
df_wines = pd.read_csv(DATA_PATH, sep='|', low_memory=False)

# Clean pricing data
df_wines['price'] =  df_wines['price'].str.strip('[]')
df_wines = df_wines[df_wines['price'] != '']
df_wines = df_wines[~df_wines['price'].str.contains(' ')]
df_wines['price'] = df_wines['price'].astype(float)

print(df_wines.shape)
df_wines.head()

(15191, 3)


Unnamed: 0,name,price,description
0,Hall Napa Valley Cabernet Sauvignon 2013,54.99,"Dark garnet in color, the 2013 HALL Napa Valle..."
1,Rombauer Chardonnay 2017,36.99,Rombauer Vineyards was founded in 1982 by Koer...
2,Antinori Tignanello 2015,124.99,#24
3,Borne of Fire Cabernet Sauvignon 2016,19.99,"Like a phoenix rising from the ashes, we have ..."
4,Torbreck Woodcutters Shiraz 2017,21.99,This wine reflects the up and coming Shiraz vi...


In [126]:
NUM_WORDS = 2000

# create/fit tokenizer, convert to sequences and pad for model input
docs = df_wines['description'].astype(str)
t = Tokenizer(num_words=NUM_WORDS)
t.fit_on_texts(docs)
encoded_seq = t.texts_to_sequences(docs)
training_max_length = max([len(s) for s in encoded_seq])

# define vocabulary size (largest integer value)
vocab_size = len(t.word_index) + 1
print(vocab_size)

X = pad_sequences(encoded_seq, maxlen=training_max_length, padding='post')
y = df_wines['price']

# Split out the training/testing datasets
X_train,\
X_test,\
y_train,\
y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Print some statistics 
print("Sample count: ", len(encoded_seq))
print("Length of first sequence: ", len(encoded_seq[0]))
print("Max sequence length: ", training_max_length)
print("Sample: ", encoded_seq[0][:10])
print("X: ",X.shape)
print("y: ",y.shape)

21426
Total length:  15191
Length of first sequence:  49
Max sequence length:  297
Sample:  [48, 276, 6, 33, 1, 486, 98, 45, 42, 50]
X:  (15191, 297)
y:  (15191,)


In [127]:
LOSS_METRIC = 'mean_squared_error'

# Define model
model = Sequential()
model.add(Embedding(vocab_size, NUM_WORDS, input_length=training_max_length))
model.add(Conv1D(filters=16, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='linear'))
print(model.summary())

#d_model = Sequential()
#d_model.add(Embedding(vocab_size, 250, input_length=max_length))
#d_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
#d_model.add(Dense(64, kernel_initializer='normal',activation='relu'))
#d_model.add(Dense(32, kernel_initializer='normal',activation='relu'))
#d_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

checkpoint = ModelCheckpoint('data/models_weights/model_price.h5', 
                             monitor=LOSS_METRIC, 
                             verbose=0, 
                             save_best_only=True, 
                             mode='min')
early_stopping = EarlyStopping(monitor=LOSS_METRIC,
                               patience=3,
                               mode='min')
tboard = keras.callbacks.TensorBoard(log_dir='./Graph',
                                     histogram_freq=0, 
                                     write_graph=True, 
                                     write_images=True)
callbacks_list = [checkpoint,early_stopping,tboard]

# compile network
model.compile(loss=LOSS_METRIC, 
              optimizer='adam',
              metrics=['mse']
              )
# fit network
model.fit(X_train, 
          y_train, 
          epochs=500, 
          callbacks=callbacks_list,
          verbose=2)

# Save weights
model.save('data/models_weights/model_price.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 297, 2000)         42852000  
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 290, 16)           256016    
_________________________________________________________________
max_pooling1d_18 (MaxPooling (None, 145, 16)           0         
_________________________________________________________________
flatten_18 (Flatten)         (None, 2320)              0         
_________________________________________________________________
dense_53 (Dense)             (None, 100)               232100    
_________________________________________________________________
dense_54 (Dense)             (None, 1)                 101       
Total params: 43,340,217
Trainable params: 43,340,217
Non-trainable params: 0
________________________________________________________________

In [128]:
# Sample some predicted wines and prices
predictions = model.predict(X_test)
for i in range(1,4):
    print(t.sequences_to_texts(X_test)[i])
    print(predictions[i])
    print('\n')

when created the range they left howell mountain in a unique in napa valley on this eastern side of the valley one would expect to see the oak and found but the north air that across this makes howell mountain the and appellation in napa thus the and that the mountain help it blackberry minerality and dark chocolate the 2009 howell mountain cabernet sauvignon from the spectacular vineyard our source on howell mountain for
[109.22215]


brilliant ruby in color with a fresh fruity nose of red currants wild strawberries and a floral note the palate is packed with red fruit and hints of oak spice with velvety tannins
[27.36819]


the 2016 alta vineyard pinot a mix of and clones and was aged in french oak 50 new for 15 months before bottling this wine shows its high altitude location with a nose of bright berry fruit and ripe plums and a deeper color than what we typically see from the vineyard the texture of the alta is due to the time and ripening this vineyard the lush body and richness 

## Predict prices on fake wines

In [132]:
# Read in fake wine names and descriptions
df_fake_wines = pd.read_csv('data/outputs/DESC_v1_2.csv',
                            sep='|',
                            low_memory=False)

# Run the encodings as was done with training data
encoded_seq = t.texts_to_sequences(df_fake_wines['description'])
max_length = max([len(s) for s in encoded_seq])
fake_X = pad_sequences(encoded_seq, maxlen=training_max_length, padding='post')

print("Sample count: ", len(encoded_seq))
print("Length of first sequence: ", len(encoded_seq[0]))
print("Sample: ", encoded_seq[0][:10])

assert X_train.shape[1] == fake_X.shape[1]
fake_predictions = model.predict(fake_X)

# Save to DF and CSV
df_fake_wines['price'] = fake_predictions.astype(int)
df_fake_wines.to_csv('data/outputs/fakes.csv', sep='|')

Sample count:  1275
Length of first sequence:  62
Sample:  [4, 341, 6, 165, 44, 1, 53, 35, 380, 35]


In [133]:
df_fake_wines.head()

Unnamed: 0,name,description,price
0,Joseph Carr Reveliste Cinsault 2013,\n\nRaisage a trip back in time at the Frank F...,48
1,Carol Shelton Roche TBredi 2016,\nAromatics of this wine transporm nine expre...,42
2,Finca Bolgheri Pinot Grigio 2018,"This makes this opened scents, small whitehal...",71
3,Domaine de Cristict Chardonnay 2016,"\nDigest boasts an intensity, or gift W This...",104
4,Domaine Dujac Fils &amp; Pere Chambolle Rouge ...,"On the nose, aromas of grapefruit, lime and a...",22
