In [1]:
import numpy as np
import pandas as pd

Data Files can be found here:
    
    https://www.kaggle.com/zynicide/wine-reviews
        
The dataset was scraped from WineEnthusiest.

The file used is:

winemag-data_first150k.csv

In [2]:
# read in the wine review data, view the data to make sure it appeared
sommData = pd.read_csv('Data/winemag-data_first150k.csv')
sommData.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [3]:
# trim the data to remove the unnamed column
sommData = sommData[sommData.columns[1:11]]
sommData.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [4]:
# replace NaN with XXXXX for each  string based column
stringColList = ["country", "description", "designation", "province", "region_1", "region_2", "variety", "winery"]
for col in stringColList:
    sommData[col].fillna("none", inplace=True)
    
sommData.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,none,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,none,Provence red blend,Domaine de la Bégude


In [5]:
# replace Nan with 0 for number based columns
numColList = ["points", "price"]
for col in numColList:
    sommData[col].fillna(0, inplace=True)
    
sommData.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,none,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,none,Provence red blend,Domaine de la Bégude


In [6]:
# remove dupes
dups = sommData[sommData.duplicated('description')]
sommData = sommData.drop_duplicates(subset='description')
print('Total unique reviews:', len(sommData))

Total unique reviews: 97821


In [7]:
# remove varities with only 1 description
counts = sommData['variety'].value_counts()
sommData= sommData[sommData['variety'].isin(counts[counts > 3].index)]
sommData.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,none,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,none,Provence red blend,Domaine de la Bégude


In [8]:
# create the X and y values
X = sommData["description"]
y = sommData["variety"]

print(X.shape, y.shape)

(97367,) (97367,)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical

# split the data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# vectorize the description data
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train_vec = vectorizer.transform(X_train)

X_test_vec = vectorizer.transform(X_test)

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

print(y_test_categorical.shape, y_train_categorical.shape)

Using TensorFlow backend.


(24342, 364) (73025, 364)


In [10]:
# Create Deep Learning Model
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(units=1000, activation='relu', input_dim=X_train_vec.shape[1]))
#model.add(Dense(units=500, activation='relu'))
model.add(Dense(units=364, activation='softmax'))

In [11]:
# compile the model and fit it
model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])
model.fit(
    X_train_vec,
    y_train_categorical,
    epochs=3,
    shuffle=True,
    verbose=364
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c294a03da0>

In [12]:
model_loss, model_accuracy = model.evaluate(X_test_vec, y_test_categorical, verbose=364)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 1.7968224101916364, Accuracy: 0.613918330472337


In [14]:
# test the data
encoded_predictions = model.predict_classes(X_test_vec[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_train[:5])}")

Predicted classes: ['Bordeaux-style White Blend' "Nero d'Avola" 'Grüner Veltliner' 'Syrah'
 'Cabernet Franc']
Actual Labels: ['Chardonnay', 'Syrah', 'Syrah', 'Bordeaux-style Red Blend', 'Grenache']
