In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Read in data from csv file
wine_data = Path("Resources/winemag-data-130k-v2.csv")
# Convert to dataframe
wine_df = pd.read_csv(wine_data)
wine_df.head(5)

FileNotFoundError: [Errno 2] No such file or directory: 'Resources/wine_tasting_data.csv'

In [3]:
# Clean and transform data
# Drop unnecessary columns
df = wine_df.loc[:,["country", "description", "points",
                                    "province", "variety"]]
# Drop null values
df = df.dropna(how='any')
df.head(5)

NameError: name 'wine_df' is not defined

In [16]:
# Transform words into vectors using NLP
# Drop stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
df['description'] = df['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\katyp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This ripe fruity, wine smooth still structured...",87,Douro,Portuguese Red
2,US,"Tart snappy, flavors lime flesh rind dominate....",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith orange blossom star...",87,Michigan,Riesling
4,US,"Much like regular bottling 2012, comes across ...",87,Oregon,Pinot Noir


In [17]:
# Lemmatization
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    return ' '.join(words)
df['description'] = df['description'].apply(lemmatize_words)
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\katyp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This ripe fruity, wine smooth still structured...",87,Douro,Portuguese Red
2,US,"Tart snappy, flavor lime flesh rind dominate. ...",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith orange blossom star...",87,Michigan,Riesling
4,US,"Much like regular bottle 2012, come across rat...",87,Oregon,Pinot Noir


In [18]:
# Tokenize descriptions to split sentences into word components
descriptions = df.description.apply(gensim.utils.simple_preprocess)
descriptions

0         [aromas, include, tropical, fruit, broom, brim...
1         [this, ripe, fruity, wine, smooth, still, stru...
2         [tart, snappy, flavor, lime, flesh, rind, domi...
3         [pineapple, rind, lemon, pith, orange, blossom...
4         [much, like, regular, bottle, come, across, ra...
                                ...                        
129966    [notes, honeysuckle, cantaloupe, sweeten, deli...
129967    [citation, give, much, decade, bottle, age, pr...
129968    [well, drained, gravel, soil, give, wine, cris...
129969    [dry, style, pinot, gris, crisp, acidity, it, ...
129970    [big, rich, off, dry, power, intense, spicines...
Name: description, Length: 129907, dtype: object

In [19]:
# Word2Vec model
# Create the model
model = gensim.models.Word2Vec(window=1, min_count=1, workers=8)
# Build its vocabulary
model.build_vocab(descriptions, progress_per=100)
# Train the model
model.train(descriptions, total_examples=model.corpus_count, epochs= model.epochs)
# Save the model to use later
model.save("Resources/descriptions.model")

In [32]:
# Test the model looking for similar words
model.wv.most_similar("fleshy")

[('plump', 0.8143642544746399),
 ('pulpy', 0.760719358921051),
 ('plush', 0.7291730642318726),
 ('mouthfilling', 0.7283833026885986),
 ('chunky', 0.7249197959899902),
 ('luscious', 0.7169947624206543),
 ('lush', 0.7168979048728943),
 ('weighty', 0.7153026461601257),
 ('ripe', 0.7143080234527588),
 ('juicy', 0.7003235220909119)]

In [21]:
# Test the model with similarity of different words
model.wv.similarity(w1="sweet",w2="fruity")

0.38071415

In [57]:
import numpy as np
# Get vector representations of descriptions

# define our function that creates description vectors:
def get_desc_vec(document):
    return np.array(sum(model.wv[word] for word in document)/len(document))

desc_vecs = descriptions.apply(get_desc_vec).tolist()

# for 
# sum(model.wv[d] for d in descriptions[0])/len(descriptions[0])
# model.wv['aromas']
# for desc in descriptions

In [60]:
desc_vec_df = pd.DataFrame(desc_vecs)

In [63]:
desc_vec_df.values

array([[-0.38042325,  0.44547886, -0.0219199 , ..., -0.21928608,
        -0.01562374, -0.13634494],
       [-0.24923013,  0.71666735,  0.26131153, ..., -0.42186096,
        -0.37194335, -0.28483555],
       [-0.5302367 ,  0.40328625, -0.01353906, ..., -0.47789583,
        -0.04605041, -0.25209254],
       ...,
       [-0.1534711 ,  0.37801662,  0.2226702 , ..., -0.30313656,
        -0.1456415 , -0.11643026],
       [-0.21349978,  0.41269293,  0.1691624 , ..., -0.3299476 ,
        -0.30745447, -0.24195202],
       [-0.33081272,  0.28393245,  0.1426199 , ..., -0.24387349,
        -0.24720109, -0.13013107]], dtype=float32)

In [87]:
import string

a red wine that does not smell like horse
a red horse that does not smell like wine

a tart not sweet wine
a sweet wine not tart



'your custom word here' in model.wv
user_input = 'Red wine with, juicy..... fruity dessert flavors laser-like dkljgbioerubgowrbgowergbwoergb'

# Text box that takes user input js
# [  'Red wine with juicy fruity dessert flavors' ]  => pass to flask in a post

'Red wine with juicy fruity dessert flavors'
user_input_nopunc = user_input.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
print(user_input)
print(user_input_nopunc)
# in flask
description = [word.lower().strip() for word in user_input_nopunc.split()]
clean_description = [w for w in description if w in model.wv]
sum(model.wv[word] for word in clean_description)
# take words in the description, get the vectors (if they exist), combine them and find similar with cosine similarity



# import your model
# import your description vectors

# Then  create routes and stuff




Red wine with, juicy..... fruity dessert flavors laser-like dkljgbioerubgowrbgowergbwoergb
Red wine with  juicy      fruity dessert flavors laser like dkljgbioerubgowrbgowergbwoergb


array([-4.1463056 ,  3.5198565 , -0.39890164, -2.3764222 ,  1.3593053 ,
       -5.4809394 ,  2.610977  ,  6.725995  , -0.7629415 , -3.8254595 ,
       -1.2591147 , -6.3113275 , -2.6524723 ,  1.40657   ,  3.1393418 ,
        1.3172214 ,  3.0909328 ,  0.23630965, -1.6514609 , -6.045948  ,
        2.900023  , -2.378191  ,  2.5172698 , -1.0759978 , -2.4066083 ,
        0.05549923,  0.5137062 , -1.6655021 , -3.0221932 ,  6.2869415 ,
        6.265183  , -2.1053426 , -1.3081836 , -3.5878778 , -0.7973597 ,
        3.4088144 , -2.4633539 ,  2.6721    ,  0.49540153, -5.732395  ,
        4.876373  , -5.068238  , -1.4859351 ,  1.7876196 ,  4.846452  ,
       -2.6730149 , -2.1790814 ,  0.76659846,  2.2480245 ,  3.0366516 ,
        0.40182132, -0.48237708,  1.0448866 , -2.2691832 , -2.7876863 ,
        4.7928405 ,  2.8515894 ,  1.3177036 , -2.578859  ,  4.0406995 ,
       -4.571622  ,  0.84690464,  2.0032504 , -1.0264851 , -4.81399   ,
        2.564782  ,  5.9346056 ,  1.8282502 , -0.9531972 ,  2.98

In [22]:
# Separate data in target and features variables
# Preprocess data
# Divide into train and test data

In [23]:
# Input vector data into ML algorithm
# Apply K-Means Clustering
# Generate scatter plot of results

In [24]:
# Input vector data into ML algorithm
# Apply 2nd Model (TBD)
# Generate scatter plot of results

In [25]:
# Classify using Random Forest

In [26]:
# Print accuracy score and confusion matrix