<a href="https://colab.research.google.com/github/ethan-jiang-1/100-Days-Of-ML-Code/blob/master/Keras_wide_deep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U tensorflow==1.7.0


ORG Training from here: https://www.youtube.com/watch?v=XNKeayZW4dY

In [2]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
layers = keras.layers

print("Tensor flow versoin", tf.__version__)

URL = "https://storage.googleapis.com/sara-cloud-ml/wine_data.csv"
path = tf.keras.utils.get_file(URL.split("/")[-1], URL)


Tensor flow versoin 1.7.0


In [6]:
data = pd.read_csv(path)
data = data.sample(frac=1)
data.head()

data = data[pd.notnull(data["country"])]
data = data[pd.notnull(data["price"])]
data = data.drop(data.columns[0], axis=1)
data = data.drop(2000, axis=0)
# data.head()

variety_thredhold = 500
value_counts = data["variety"].value_counts()
to_remove = value_counts[value_counts <= variety_thredhold].index
data.replace(to_remove, np.nan, inplace=True)
data = data[pd.notnull(data["variety"])]
# data.head()

In [7]:
train_size = int(len(data)* 0.8)
print("train_size: ", train_size, "test_size", int(train_size/4))

desp_train = data["description"][:train_size]
variety_train = data["variety"][:train_size]
labels_train = data["price"][:train_size]

desp_test = data["description"][train_size:]
variety_test = data["variety"][train_size:]
labels_test = data["price"][train_size:]



train_size:  95645 test_size 23911


In [8]:
vocab_size = 12000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(desp_train)

desp_bow_train = tokenize.texts_to_matrix(desp_train)
desp_bow_test = tokenize.texts_to_matrix(desp_test)


In [9]:
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = np.max(variety_train) + 1

variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)


In [12]:
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, variety_inputs])
merged_layer = layers.Dense(256, activation="relu")(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)

wide_model.compile(loss="mse", optimizer="adam", metrics=["accuracy"])
print(wide_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 12040)        0           input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 256)          3082496     concatenate_3[0][0]              
__________

In [13]:
train_embed = tokenize.texts_to_sequences(desp_train)
test_embed = tokenize.texts_to_sequences(desp_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(train_embed, maxlen=max_seq_length, padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(test_embed, maxlen=max_seq_length, padding="post")

In [17]:
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary()) 

deep_model.compile(loss="mse", optimizer="adam", metrics=['accuracy'])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 170)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 170, 8)            96000     
_________________________________________________________________
flatten_3 (Flatten)          (None, 1360)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

combined_model.compile(loss="mse",
                       optimizer="adam",
                       metrics=["accuracy"])


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 170)          0                                            
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 12040)        0           input_5[0][0]                    
                                                                 input_6[0][0]                    
__________

In [23]:
combined_model.fit([desp_bow_train, variety_train] + [train_embed], labels_train, epochs=10, batch_size=128)

combined_model.evaluate([desp_bow_test, variety_test] + [test_embed], labels_test, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[786.9648861315546, 0.06687019070546218]

In [26]:
predictions = combined_model.predict([desp_bow_test, variety_test] + [test_embed])

num_predictions = 40
diff = 0

for i in range(num_predictions):
  val = predictions[i]
  print(desp_test.iloc[i])
  print("predicted", val[0], 'actual', labels_test.iloc[i])
  diff += abs(val[0] - labels_test.iloc[i])

print("average prediction diffreence", diff/num_predictions)


This has the richness of barrel fermented Sémillon (“French stainless steel”) without actually having any. Residual sugar just over 1%; gives it roundness and a hint of honey.
predicted 11.766367 actual 19.0
A vibrantly aromatic, dazzling wine with abundant notes of lychee and rose petal. It's on the lighter side of medium bodied and drinks dry with a lot of hang time on the pronounced, floral finish.
predicted 8.995081 actual 22.0
From winemaker David Kraus, this is simple but enjoyable. Spice, dark berry and blueberry on the nose lead into toasted oak and berry on the palate. A fun sip for fans of offbeat regions, it will pair well with heartier fare like beef and lamb.
predicted 15.129522 actual 15.0
A polished, softly delicious Merlot, at a good price for the quality. It's dry and richly tannic, with complex flavors of black cherries, red currants, red licorice, Indian spices and sandalwood. Ready now, but only 200 cases were produced.
predicted 28.669613 actual 24.0
Almost cherry 