In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Read in data from csv file
wine_data = Path("Resources/winemag-data-130k-v2.csv")
# Convert to dataframe
wine_df = pd.read_csv(wine_data)
wine_df.head(5)

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
# Clean and transform data
# Drop unnecessary columns
df = wine_df.loc[:,["country", "description", "points",
                                    "region_1", "variety"]]
# Drop null values
df = df.dropna(how='any')
df.head(5)

Unnamed: 0,country,description,points,region_1,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Etna,White Blend
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Willamette Valley,Pinot Gris
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Lake Michigan Shore,Riesling
4,US,"Much like the regular bottling from 2012, this...",87,Willamette Valley,Pinot Noir
5,Spain,Blackberry and raspberry aromas show a typical...,87,Navarra,Tempranillo-Merlot


In [4]:
# Transform words into vectors using NLP
# Drop stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
df['description'] = df['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()

[nltk_data] Downloading package stopwords to /Users/dzz_/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,country,description,points,region_1,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Etna,White Blend
2,US,"Tart snappy, flavors lime flesh rind dominate....",87,Willamette Valley,Pinot Gris
3,US,"Pineapple rind, lemon pith orange blossom star...",87,Lake Michigan Shore,Riesling
4,US,"Much like regular bottling 2012, comes across ...",87,Willamette Valley,Pinot Noir
5,Spain,Blackberry raspberry aromas show typical Navar...,87,Navarra,Tempranillo-Merlot


In [5]:
# Lemmatization
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    return ' '.join(words)
df['description'] = df['description'].apply(lemmatize_words)
df.head()

[nltk_data] Downloading package wordnet to /Users/dzz_/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,country,description,points,region_1,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Etna,White Blend
2,US,"Tart snappy, flavor lime flesh rind dominate. ...",87,Willamette Valley,Pinot Gris
3,US,"Pineapple rind, lemon pith orange blossom star...",87,Lake Michigan Shore,Riesling
4,US,"Much like regular bottle 2012, come across rat...",87,Willamette Valley,Pinot Noir
5,Spain,Blackberry raspberry aromas show typical Navar...,87,Navarra,Tempranillo-Merlot


In [6]:
# Tokenize descriptions to split sentences into word components
descriptions = df.description.apply(gensim.utils.simple_preprocess)
descriptions

0         [aromas, include, tropical, fruit, broom, brim...
2         [tart, snappy, flavor, lime, flesh, rind, domi...
3         [pineapple, rind, lemon, pith, orange, blossom...
4         [much, like, regular, bottle, come, across, ra...
5         [blackberry, raspberry, aromas, show, typical,...
                                ...                        
129965    [while, rich, beautiful, dry, wine, also, offe...
129967    [citation, give, much, decade, bottle, age, pr...
129968    [well, drained, gravel, soil, give, wine, cris...
129969    [dry, style, pinot, gris, crisp, acidity, it, ...
129970    [big, rich, off, dry, power, intense, spicines...
Name: description, Length: 108724, dtype: object

In [7]:
# Word2Vec model
# Create the model
model = gensim.models.Word2Vec(window=1, min_count=1, workers=8)
# Build its vocabulary
model.build_vocab(descriptions, progress_per=100)
# Train the model
model.train(descriptions, total_examples=model.corpus_count, epochs= model.epochs)
# Save the model to use later
model.save("Resources/descriptions.model")

In [8]:
# Test the model looking for similar words
model.wv.most_similar("dry")

[('dried', 0.6033702492713928),
 ('concentrate', 0.5238678455352783),
 ('chop', 0.5152648091316223),
 ('candy', 0.5115837454795837),
 ('concentrated', 0.5088780522346497),
 ('lace', 0.5053457617759705),
 ('provençal', 0.5044975876808167),
 ('crunchy', 0.5009744763374329),
 ('alpine', 0.5009072422981262),
 ('restrained', 0.5008205771446228)]

In [9]:
# Test the model with similarity of different words
model.wv.similarity(w1="sweet",w2="fruity")

0.38912898

In [10]:
# Separate data in target and features variables
# Preprocess data
# Divide into train and test data

In [11]:
# Input vector data into ML algorithm
# Apply K-Means Clustering
# Generate scatter plot of results

In [12]:
# Input vector data into ML algorithm
# Apply 2nd Model (TBD)
# Generate scatter plot of results

In [13]:
# Classify using Random Forest

In [14]:
# Print accuracy score and confusion matrix