In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [0]:
# Install the latest version of TensorFlow
!pip install -q -U tensorflow==1.7.0

In [3]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
layers = keras.layers

#Using TF 1.7
print("You have tensorflow version", tf.__version__)

You have tensorflow version 1.7.0


In [0]:
# Grabbing the dataset  - Original source (https://www.kaggle.com/zynicide/wine-reviews/data)
URL = "https://storage.googleapis.com/sara-cloud-ml/wine_data.csv"
path = tf.keras.utils.get_file(URL.split('/')[-1], URL)

In [0]:
#reading data using pandas
data = pd.read_csv(path)

In [6]:
#Shuffle the data
data = data.sample(frac=1)


#print the first 5 rows
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
1296,1296,US,"Toffee, cinnamon apples and a hefty swirl of s...",,87,16.0,California,Santa Barbara County,Central Coast,Chardonnay,Byron
3942,3942,US,"A lovely hue, this is scented with crushed ros...",McCrone Vineyard,94,63.0,Oregon,Willamette Valley,,Pinot Noir,Ken Wright
36673,36673,US,The best of the 2008 Syrah lineup from Walter ...,C'est Syrah Beaux,92,38.0,Washington,Columbia Valley (WA),Columbia Valley,Syrah,Walter Dacon
121569,121569,US,Miraflores bring a Merlot-style balance to thi...,Herbert Vineyard,92,24.0,California,El Dorado,Sierra Foothills,Zinfandel,Miraflores
35119,35119,US,This has that distinctive diesel-fuel aroma of...,Camp 4 Vineyard,88,20.0,California,Santa Ynez Valley,Central Coast,Grenache Blanc,Tercero


In [0]:
#processing to limit thw number of wine varities in the dataset
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0], axis=1)

variety_threshold = 500 #Anything that occurs less then 500 will be removed
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(to_remove, np.nan, inplace = True)
data = data[pd.notnull(data['variety'])]

In [8]:
# Split data into train and test
train_size = int(len(data) * .8)
print("train Size : %d" % train_size)
print("test Size : %d" % (len(data) - train_size))

train Size : 95646
test Size : 23912


In [0]:
#train features
description_train  = data['description'][:train_size]
variety_train = data['variety'][:train_size]

#train lables
labels_train = data['price'][:train_size]

#test features

description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

#test lables
labels_test = data['price'][train_size:]

In [0]:
#create a tokenizer to preprocess our text description
vocab_size = 12000 # This is hyperparameter, experiment with different values for the dataset
tokenize = keras.preprocessing.text.Tokenizer(num_words = vocab_size, char_level=False)
tokenize.fit_on_texts(description_train) # only fit on train

In [0]:
#Wide feature 1 : Sparse bag of words (bow) vocab_size vector
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

In [0]:
# Wide feature 2: one-hot vector of variety categories

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = np.max(variety_train) + 1

# Convert labels to one hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [0]:
# Define our wide model with the functional API
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, variety_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)

                                                     

In [14]:
wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
print(wide_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 256)          3082496     concatenate_1[0][0]              
__________

In [0]:
# Deep model feature: word embeddings of wine descriptions
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed, maxlen=max_seq_length, padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed, maxlen=max_seq_length, padding="post")

In [16]:

# Define our deep model with the Functional API
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 170)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 170, 8)            96000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 1360)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
deep_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

In [18]:
# Combine wide and deep into one model
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

combined_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 170)          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________

In [19]:
# Run training
combined_model.fit([description_bow_train, variety_train] + [train_embed], labels_train, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7f0ea93d21d0>

In [20]:

combined_model.evaluate([description_bow_test, variety_test] + [test_embed], labels_test, batch_size=128)



[710.7153868791689, 0.0727668116439362]

In [0]:
# Generate predictions
predictions = combined_model.predict([description_bow_test, variety_test] + [test_embed])

In [22]:

# Compare predictions with actual values for the first few items in our test dataset
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(description_test.iloc[i])
    print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n')
    diff += abs(val[0] - labels_test.iloc[i])

A good choice for fans of a generously oaked style, this is a weighty white with complex notes of caramel and vanilla that pick up a papaya undertone on the finish. Too powerful for most fish dishes but fine with cheese. Drink now to 2012.
Predicted:  21.471981 Actual:  17.0 

Light in body, and very dry and silky, this is an elegantly structured Pinot Noir. It shows modest, but clean, flavors of cherries, black raspberries, cocoa and vanilla. Drink now.
Predicted:  24.384624 Actual:  29.0 

The top wine from Château Chambert is huge and densely concentrated. It has a solid, black fruited feel to it along its immense tannins. It's richness has given its structure a sweeter almost jammy finish. The wine will certainly age many years.
Predicted:  78.966774 Actual:  70.0 

Rounded and smooth wine, with rich apricot and nectarine flavors as well as tight acidity. It is structured, pear and apple skins giving shape to a wine that is good to drink now and will also age.
Predicted:  17.071169

In [23]:
# Compare the average difference between actual price and the model's predicted price
print('Average prediction difference: ', diff / num_predictions)

Average prediction difference:  7.6529751181602474
