This notebook will train a deep learning natural language algorithm to try to predict the country of origin of a wine base on its description.

First we import the required libraries.

In [1]:
import numpy as np
import pandas as pd #dataframes / data manipulation

#model
from sklearn.model_selection import train_test_split

#natural language toolkit
import re
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

#keras
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

#show wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

#suppress warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

Now we need to import the datasets.

In [2]:
#we will select only the two columns that will be used: description and country
columns = ['description', 'country']
df_1 = pd.read_csv('./datasets/winemag-data_first150k.csv', usecols=columns)
df_2 = pd.read_csv('./datasets/winemag-data-130k-v2.csv', usecols=columns)

print(df_1.shape)
print(df_2.shape)

(150930, 2)
(129971, 2)


Now we clean and shape the dataframes

In [3]:
#looking for null entries to be removed
print(df_1.info())
print('--------------------------------------')
print(df_2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   country      150925 non-null  object
 1   description  150930 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB
None
--------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   country      129908 non-null  object
 1   description  129971 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB
None


In [4]:
#there seem to be some null values for countries in both datasets
#we can see that by comparing the total entries with the non-null number
#we'll drop them off
df_1.dropna(axis=0, inplace=True)
df_2.dropna(axis=0, inplace=True)

#then we'll concatenate both into one dataframe
df = pd.concat([df_1, df_2])
print(df.shape)

(280833, 2)


In [5]:
#let's see what we got so far
df.describe()

Unnamed: 0,country,description
count,280833,280833
unique,50,169370
top,US,A little bit funky and unsettled when you pop ...
freq,116901,7


In [6]:
#removing special characters
df['description'] = df['description'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

#converting everything o lowercase so there's no distinction between the same word
df['description'] = df['description'].apply(lambda x: x.lower())

#now to remove the stopwords (the ones that don't affect the result)
stop_words = stopwords.words('english')
df['description'] = df['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

#lemmatizing (removing conjugation to convert all the words to their base form)
lemmatizer = WordNetLemmatizer()
df['description'] = df['description'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

#encode the labels
df['country'] = df['country'].factorize()[0]

In [7]:
#now let's see how our dataframe looks
df.head(15)

Unnamed: 0,country,description
0,0,tremendous varietal wine hail oakville aged th...
1,1,ripe aroma fig blackberry cassis softened swee...
2,0,mac watson honor memory wine made mother treme...
3,0,spent month new french oak incorporates fruit ...
4,2,top wine la b gude named highest point vineyar...
5,1,deep dense pure opening bell toro winner aroma...
6,1,slightly gritty black fruit aroma include swee...
7,1,lush cedary black fruit aroma luxe offer note ...
8,0,named vineyard formerly bottled delancellotti ...
9,0,producer source two block vineyard wine one hi...


Now we prepare the data for training and testing

In [9]:
#split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(df['description'], df['country'])

#check how many unique words we have
vocab_size = len(set(' '.join(X_train).split()))
print('Vocab size: ', vocab_size)

Vocab size:  31150


In [11]:
#encoding the words with numbers for the model to be able to process them
#each unique word turns into an integer, then converted to a binary vector
train = [one_hot(d, vocab_size) for d in X_train]
test = [one_hot(d, vocab_size) for d in X_test]


[[ 7803 28525 23894 ...     0     0     0]
 [11256 30594 29370 ...     0     0     0]
 [13708 29576 14298 ...     0     0     0]
 ...
 [22507 25900 19741 ...     0     0     0]
 [27456    55 14298 ...     0     0     0]
 [27456 29354  6014 ...     0     0     0]]


In [12]:
#padding every description to the length of 100 words
max_length = 100
padded_train = pad_sequences(train, maxlen=max_length, padding='post')
padded_test = pad_sequences(test, maxlen=max_length, padding='post')

Here comes the model

In [15]:
#embedding size
embedding_size = 32
#number of classes - count the number of countries available
num_classes = df['country'].nunique()
#model
model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_length))
#bidirectional LSTM
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.25))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 32)           996800    
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 50)                3250      
                                                                 
Total params: 1,016,690
Trainable params: 1,016,690
Non-trainable params: 0
_________________________________________________________________


And the training

In [16]:
model.fit(padded_train, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x222a1e481f0>

In [17]:
loss, accuracy = model.evaluate(padded_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 89.408767
