In [103]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.manifold import TSNE

In [89]:
# Read in data from csv file
wine_data = Path("Resources/winemag-data-130k-v2.csv")
# Convert to dataframe
wine_df = pd.read_csv(wine_data)
wine_df.head(5)

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [90]:
# Clean and transform data
# Drop unnecessary columns
df = wine_df.loc[:,["country", "description", "points",
                                    "province", "variety"]]
# Drop null values
df = df.dropna(how='any')
df.head(5)

Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,Douro,Portuguese Red
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir


In [91]:
# Transform words into vectors using NLP
# Drop stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
df['description'] = df['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\katyp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This ripe fruity, wine smooth still structured...",87,Douro,Portuguese Red
2,US,"Tart snappy, flavors lime flesh rind dominate....",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith orange blossom star...",87,Michigan,Riesling
4,US,"Much like regular bottling 2012, comes across ...",87,Oregon,Pinot Noir


In [93]:
# Lemmatization
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    return ' '.join(words)
df['description'] = df['description'].apply(lemmatize_words)
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\katyp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This ripe fruity, wine smooth still structured...",87,Douro,Portuguese Red
2,US,"Tart snappy, flavor lime flesh rind dominate. ...",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith orange blossom star...",87,Michigan,Riesling
4,US,"Much like regular bottle 2012, come across rat...",87,Oregon,Pinot Noir


In [94]:
# Tokenize descriptions to split sentences into word components
descriptions = df.description.apply(gensim.utils.simple_preprocess)
descriptions

0         [aromas, include, tropical, fruit, broom, brim...
1         [this, ripe, fruity, wine, smooth, still, stru...
2         [tart, snappy, flavor, lime, flesh, rind, domi...
3         [pineapple, rind, lemon, pith, orange, blossom...
4         [much, like, regular, bottle, come, across, ra...
                                ...                        
129966    [notes, honeysuckle, cantaloupe, sweeten, deli...
129967    [citation, give, much, decade, bottle, age, pr...
129968    [well, drained, gravel, soil, give, wine, cris...
129969    [dry, style, pinot, gris, crisp, acidity, it, ...
129970    [big, rich, off, dry, power, intense, spicines...
Name: description, Length: 129907, dtype: object

In [95]:
# Word2Vec model
# Create the model
model = gensim.models.Word2Vec(window=1, min_count=1, workers=8)
# Build its vocabulary
model.build_vocab(descriptions, progress_per=100)
# Train the model
model.train(descriptions, total_examples=model.corpus_count, epochs= model.epochs)
# Save the model to use later
model.save("Resources/descriptions.model")

In [97]:
# Test the model looking for similar words
model.wv.most_similar("bright")

[('vibrant', 0.8031609654426575),
 ('lively', 0.7552875280380249),
 ('fragrant', 0.7213945984840393),
 ('buoyant', 0.7045851945877075),
 ('vivid', 0.700950562953949),
 ('zesty', 0.6973592638969421),
 ('pristine', 0.6869233846664429),
 ('zippy', 0.6846666932106018),
 ('vivacious', 0.6789547801017761),
 ('crisp', 0.6666170954704285)]

In [98]:
# Test the model with similarity of different words
model.wv.similarity(w1="sweet",w2="fruity")

0.39353532

In [100]:
# Get vector representations of descriptions
# Define our function that creates description vectors:
def get_desc_vec(document):
    return np.array(sum(model.wv[word] for word in document)/len(document))

desc_vecs = descriptions.apply(get_desc_vec).tolist()

# for 
# sum(model.wv[d] for d in descriptions[0])/len(descriptions[0])
# model.wv['aromas']
# for desc in descriptions

In [101]:
desc_vec_df = pd.DataFrame(desc_vecs)

In [102]:
desc_vec_df.values

array([[-0.38943514,  0.46795204, -0.01659683, ..., -0.44666412,
         0.01279598, -0.316577  ],
       [-0.2078109 ,  0.40830687,  0.09883289, ..., -0.60397404,
        -0.2888849 , -0.2510083 ],
       [-0.6055327 ,  0.2598576 , -0.15290378, ..., -0.7390816 ,
        -0.03161457, -0.14972173],
       ...,
       [-0.07642491,  0.16925098,  0.13420069, ..., -0.36354685,
        -0.24250098, -0.10991914],
       [-0.11606023,  0.19864772,  0.12376181, ..., -0.62272316,
        -0.3185051 , -0.16753477],
       [-0.25898352,  0.16406868,  0.18591437, ..., -0.32889518,
        -0.35066143, -0.1944136 ]], dtype=float32)

In [106]:
# Use TSNE as our clustering algorithm
# Build the TSNE model
tsne_model=TSNE(metric='cosine', perplexity=50, n_components=2, learning_rate='auto',
                  init='random')

In [107]:
# Fit the TSNE model
tsne_model.fit(desc_vec_df)

In [108]:
# Make predictions about the wine recommendations based on the trained model
wine_recs = tsne_model.predict(desc_vec_df)

AttributeError: 'TSNE' object has no attribute 'predict'

In [None]:
# X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
# X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(X)
# X_embedded.shape


# For your data it would look something like
# df = yourDataFrame
# TSNE(metric='cosine', perplexity=50, n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(df) (edited) 

In [87]:
import string

a red wine that does not smell like horse
a red horse that does not smell like wine

a tart not sweet wine
a sweet wine not tart



'your custom word here' in model.wv
user_input = 'Red wine with, juicy..... fruity dessert flavors laser-like dkljgbioerubgowrbgowergbwoergb'

# Text box that takes user input js
# [  'Red wine with juicy fruity dessert flavors' ]  => pass to flask in a post

'Red wine with juicy fruity dessert flavors'
user_input_nopunc = user_input.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
print(user_input)
print(user_input_nopunc)
# in flask
description = [word.lower().strip() for word in user_input_nopunc.split()]
clean_description = [w for w in description if w in model.wv]
sum(model.wv[word] for word in clean_description)
# take words in the description, get the vectors (if they exist), combine them and find similar with cosine similarity



# import your model
# import your description vectors

# Then  create routes and stuff




Red wine with, juicy..... fruity dessert flavors laser-like dkljgbioerubgowrbgowergbwoergb
Red wine with  juicy      fruity dessert flavors laser like dkljgbioerubgowrbgowergbwoergb


array([-4.1463056 ,  3.5198565 , -0.39890164, -2.3764222 ,  1.3593053 ,
       -5.4809394 ,  2.610977  ,  6.725995  , -0.7629415 , -3.8254595 ,
       -1.2591147 , -6.3113275 , -2.6524723 ,  1.40657   ,  3.1393418 ,
        1.3172214 ,  3.0909328 ,  0.23630965, -1.6514609 , -6.045948  ,
        2.900023  , -2.378191  ,  2.5172698 , -1.0759978 , -2.4066083 ,
        0.05549923,  0.5137062 , -1.6655021 , -3.0221932 ,  6.2869415 ,
        6.265183  , -2.1053426 , -1.3081836 , -3.5878778 , -0.7973597 ,
        3.4088144 , -2.4633539 ,  2.6721    ,  0.49540153, -5.732395  ,
        4.876373  , -5.068238  , -1.4859351 ,  1.7876196 ,  4.846452  ,
       -2.6730149 , -2.1790814 ,  0.76659846,  2.2480245 ,  3.0366516 ,
        0.40182132, -0.48237708,  1.0448866 , -2.2691832 , -2.7876863 ,
        4.7928405 ,  2.8515894 ,  1.3177036 , -2.578859  ,  4.0406995 ,
       -4.571622  ,  0.84690464,  2.0032504 , -1.0264851 , -4.81399   ,
        2.564782  ,  5.9346056 ,  1.8282502 , -0.9531972 ,  2.98

In [22]:
# Separate data in target and features variables
# Preprocess data
# Divide into train and test data

In [23]:
# Input vector data into ML algorithm
# Apply K-Means Clustering
# Generate scatter plot of results

In [24]:
# Input vector data into ML algorithm
# Apply 2nd Model (TBD)
# Generate scatter plot of results

In [25]:
# Classify using Random Forest

In [26]:
# Print accuracy score and confusion matrix