In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import itertools
import math
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras

In [2]:
layers = keras.layers
data = pd.read_csv("E:\\Columbia University\\course\\mlselfteach\\MachineLearning\\wine-reviews\\winemag-data_first150k.csv")
data = data.sample(frac = 1)

In [13]:
data

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude
...,...,...,...,...,...,...,...,...,...,...,...
150925,150925,Italy,Many people feel Fiano represents southern Ita...,,91,20.0,Southern Italy,Fiano di Avellino,,White Blend,Feudi di San Gregorio
150926,150926,France,"Offers an intriguing nose with ginger, lime an...",Cuvée Prestige,91,27.0,Champagne,Champagne,,Champagne Blend,H.Germain
150927,150927,Italy,This classic example comes from a cru vineyard...,Terre di Dora,91,20.0,Southern Italy,Fiano di Avellino,,White Blend,Terredora
150928,150928,France,"A perfect salmon shade, with scents of peaches...",Grand Brut Rosé,90,52.0,Champagne,Champagne,,Champagne Blend,Gosset


In [3]:
#remove the rows with missing values
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0],axis = 1)

In [4]:
variety_threshold = 80
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(to_remove, np.nan, inplace = True)
data = data[pd.notnull(data['variety'])]

In [6]:
train_size = int(len(data) * 0.8)

In [7]:
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]
label_train = data['price'][:train_size]

description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]
label_test = data['price'][train_size:]

In [8]:
vocal_size = 12000
tokenize = keras.preprocessing.text.Tokenizer(num_words = vocal_size, char_level = False)
tokenize.fit_on_texts(description_train)

In [9]:
#feature1
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

In [42]:
description_bow_train.shape

(104519, 12000)

### part2

In [None]:
#feature2
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = np.max(variety_train) + 1

variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [43]:
bow_inputs = layers.Input(shape = (vocal_size,))
variety_inputs = layers.Input(shape = (num_classes,))


In [45]:
merged_layer = layers.concatenate([bow_inputs,variety_inputs])

In [50]:
merged_layer = layers.Dense(256, activation = 'relu')(merged_layer)

In [52]:
precition = layers.Dense(1)(merged_layer)

In [54]:
wide_model = keras.Model(inputs = [bow_inputs, variety_inputs], outputs = precition)

In [56]:
wide_model.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])
print(wide_model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 12000)]      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 97)]         0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 12097)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 256)          3097088     concatenate[0][0]            

### part3

In [58]:
train_embed = tokenize.texts_to_sequences(description_train)


In [63]:
test_embed = tokenize.texts_to_sequences(description_test)

In [71]:
len(test_embed[30])

170

In [66]:
max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(train_embed, maxlen = max_seq_length, padding = "post")
test_embed = keras.preprocessing.sequence.pad_sequences(test_embed, maxlen = max_seq_length, padding = "post")

### part4

In [72]:
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocal_size,8,input_length = max_seq_length)(deep_inputs)


In [74]:
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs = deep_inputs, outputs = embed_out)
deep_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 170)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 170, 8)            96000     
_________________________________________________________________
flatten (Flatten)            (None, 1360)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________


In [75]:
deep_model.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])

In [76]:
merged_out = layers.concatenate([wide_model.output,deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)

In [78]:
combined_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 12000)]      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 97)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 170)]        0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 12097)        0           input_1[0][0]                    
                                                                 input_2[0][0]              

In [85]:
combined_model.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])

In [96]:
variety_train = variety_train.astype('int')

In [104]:
labels_train = label_train.to_numpy()

In [106]:
combined_model.fit([description_bow_train, variety_train] + [train_embed], labels_train, epochs = 10, batch_size = 128)

Train on 104519 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b0a5955dc8>

In [108]:
labels_test = label_test.to_numpy()
combined_model.evaluate([description_bow_test, variety_test] + [test_embed], labels_test, batch_size = 128)



[688.2849106465634, 0.0]

In [109]:
predictions = combined_model.predict([description_bow_test, variety_test] + [test_embed])

In [111]:
num_prediction = 40
diff = 0

for i in range(num_prediction):
    val = predictions[i]
    print(description_test.iloc[i])
    print("Predicted:",val[0],"Actual:",label_test.iloc[i],'\n')
    diff += abs(val[0] - label_test.iloc[i])

A seriously ageworthy wine that has a crystal clarity, a sharp texture that then opens out with inviting green plum and apricot flavors. It remains intensely fresh, but richness develops under the taut minerality. Keep for many years. Imported by Blue Danube Wine Co.
Predicted: 112.160324 Actual: 44.0 

This new sister label from Elk Cove offers 100% Willamette Valley Pinot Noir at an everyday price. Fresh cranberry and raspberry fruit is framed by herb-tinged tannins. The immaculate craftsmanship and clean direct flavors are light years beyond budget Pinot Noir from most other regions. This is your perfect salmon wine.
Predicted: 32.343662 Actual: 19.0 

This might be called banana Port, for a rich, banana flavor is what dominates. Of course, with Muscat, you will also find tangerine and orange, like a tangy sauce over the richer, creamier banana. You've got to think—pair this with a banana split? Could be the dessert of the century!
Predicted: 18.88732 Actual: 28.0 

This vintage wil