In [23]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import hvplot.pandas
from sklearn.manifold import TSNE

In [2]:
# Read in data from csv file
wine_data = Path("Resources/wine_tasting_data.csv")
# Convert to dataframe
wine_df = pd.read_csv(wine_data)
wine_df.head(5)

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
# Clean and transform data
# Drop unnecessary columns
df = wine_df.loc[:,["country", "description", "points",
                                    "province", "variety"]]
# Drop null values
df = df.dropna(how='any')
df.head(5)

Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,Douro,Portuguese Red
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir


In [4]:
# Transform words into vectors using NLP
# Drop stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
df['description'] = df['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\katyp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This ripe fruity, wine smooth still structured...",87,Douro,Portuguese Red
2,US,"Tart snappy, flavors lime flesh rind dominate....",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith orange blossom star...",87,Michigan,Riesling
4,US,"Much like regular bottling 2012, comes across ...",87,Oregon,Pinot Noir


In [5]:
# Lemmatization
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    return ' '.join(words)
df['description'] = df['description'].apply(lemmatize_words)
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\katyp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This ripe fruity, wine smooth still structured...",87,Douro,Portuguese Red
2,US,"Tart snappy, flavor lime flesh rind dominate. ...",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith orange blossom star...",87,Michigan,Riesling
4,US,"Much like regular bottle 2012, come across rat...",87,Oregon,Pinot Noir


In [6]:
# Tokenize descriptions to split sentences into word components
descriptions = df.description.apply(gensim.utils.simple_preprocess)
descriptions

0         [aromas, include, tropical, fruit, broom, brim...
1         [this, ripe, fruity, wine, smooth, still, stru...
2         [tart, snappy, flavor, lime, flesh, rind, domi...
3         [pineapple, rind, lemon, pith, orange, blossom...
4         [much, like, regular, bottle, come, across, ra...
                                ...                        
129966    [notes, honeysuckle, cantaloupe, sweeten, deli...
129967    [citation, give, much, decade, bottle, age, pr...
129968    [well, drained, gravel, soil, give, wine, cris...
129969    [dry, style, pinot, gris, crisp, acidity, it, ...
129970    [big, rich, off, dry, power, intense, spicines...
Name: description, Length: 129907, dtype: object

In [7]:
# Word2Vec model
# Create the model
model = gensim.models.Word2Vec(window=1, min_count=1, workers=8)
# Build its vocabulary
model.build_vocab(descriptions, progress_per=100)
# Train the model
model.train(descriptions, total_examples=model.corpus_count, epochs= model.epochs)
# Save the model to use later
model.save("Resources/descriptions.model")

In [8]:
# Test the model looking for similar words
model.wv.most_similar("bright")

[('vibrant', 0.8024426698684692),
 ('lively', 0.7567315697669983),
 ('fragrant', 0.7262653708457947),
 ('buoyant', 0.7104728817939758),
 ('vivacious', 0.6958246827125549),
 ('zesty', 0.6907148361206055),
 ('vivid', 0.6835038065910339),
 ('pristine', 0.6820457577705383),
 ('tangy', 0.6703503131866455),
 ('zippy', 0.669557511806488)]

In [9]:
# Test the model with similarity of different words
model.wv.similarity(w1="sweet",w2="fruity")

0.38500667

In [10]:
# Get vector representations of descriptions
# Define our function that creates description vectors:
def get_desc_vec(document):
    return np.array(sum(model.wv[word] for word in document)/len(document))

desc_vecs = descriptions.apply(get_desc_vec).tolist()

# for 
# sum(model.wv[d] for d in descriptions[0])/len(descriptions[0])
# model.wv['aromas']
# for desc in descriptions

In [11]:
desc_vec_df = pd.DataFrame(desc_vecs)

In [12]:
desc_vec_df.values

array([[-3.2287678e-01,  4.2750660e-01,  4.8979133e-02, ...,
        -3.0122405e-01, -7.7887988e-03, -1.4934148e-01],
       [-2.7926189e-01,  6.9246006e-01,  3.1242606e-01, ...,
        -5.3443313e-01, -3.2203817e-01, -2.8658155e-01],
       [-4.8814583e-01,  3.3910978e-01,  7.1349597e-05, ...,
        -5.9554225e-01, -4.5477889e-02, -2.7531296e-01],
       ...,
       [-1.5286139e-01,  3.0928230e-01,  2.4416322e-01, ...,
        -3.7142408e-01, -1.3334008e-01, -1.8069005e-01],
       [-1.2828818e-01,  3.1350413e-01,  2.2191252e-01, ...,
        -5.3796482e-01, -2.6219651e-01, -2.4770807e-01],
       [-3.2451081e-01,  2.1076699e-01,  2.6772302e-01, ...,
        -3.7617919e-01, -2.1448298e-01, -9.8234408e-02]], dtype=float32)

In [13]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=2)

In [15]:
# Fit the PCA model on the vectors DataFrame
desc_vec_pca = pca.fit_transform(desc_vec_df)

# Review the first 5 rows of list data
desc_vec_pca[:5]

array([[-0.80185485,  0.47670925],
       [ 0.4509913 , -0.06411069],
       [-0.24441469,  1.0398031 ],
       [-0.7116029 ,  1.0006641 ],
       [ 0.9244745 , -0.41865736]], dtype=float32)

In [28]:
# Calculate the PCA explained variance ratio
pca.explained_variance_ratio_
print(pca.explained_variance_ratio_)
print("Total explained variance = .29")

[0.16118119 0.1292076 ]
Total explained variance = .29


In [18]:
# Create the PCA DataFrame
desc_vec_pca_df = pd.DataFrame(
    desc_vec_pca,
    columns=["PCA1", "PCA2"]
)

# Review the PCA DataFrame
desc_vec_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-0.801855,0.476709
1,0.450991,-0.064111
2,-0.244415,1.039803
3,-0.711603,1.000664
4,0.924474,-0.418657


In [21]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(desc_vec_pca_df)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()



Unnamed: 0,k,inertia
0,1,76024.078125
1,2,48086.492188
2,3,27068.642578
3,4,21286.632812
4,5,16307.448242


In [26]:
# Plot the Elbow Curve
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)


In [27]:
# Determine optimal number of clusters based on elbow frame above
print("Optimal number of clusters = 5")

Optimal number of clusters = 5


In [30]:
# Define the model with 3 clusters
model = KMeans(n_clusters=5, random_state=1)

# Fit the model
model.fit(desc_vec_pca_df)

# Make predictions
k_5 = model.predict(desc_vec_pca_df)

# Create a copy of the PCA DataFrame
desc_vec_pca_predictions_df = desc_vec_pca_df.copy()

# Add a class column with the labels
desc_vec_pca_predictions_df["wine_segments"] = k_5



In [31]:
# Plot the clusters
desc_vec_pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="wine_segments"
)

In [None]:
# Use TSNE as our clustering algorithm
# Build the TSNE model
tsne_model=TSNE(metric='cosine', perplexity=50, n_components=2, learning_rate='auto',
                  init='random')

In [None]:
# Fit the TSNE model
tsne_model.fit(desc_vec_df)

In [None]:
# Make predictions about the wine recommendations based on the trained model
wine_recs = tsne_model.predict(desc_vec_df)

In [None]:
# X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
# X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(X)
# X_embedded.shape


# For your data it would look something like
# df = yourDataFrame
# TSNE(metric='cosine', perplexity=50, n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(df) (edited) 

In [None]:
import string

a red wine that does not smell like horse
a red horse that does not smell like wine

a tart not sweet wine
a sweet wine not tart



'your custom word here' in model.wv
user_input = 'Red wine with, juicy..... fruity dessert flavors laser-like dkljgbioerubgowrbgowergbwoergb'

# Text box that takes user input js
# [  'Red wine with juicy fruity dessert flavors' ]  => pass to flask in a post

'Red wine with juicy fruity dessert flavors'
user_input_nopunc = user_input.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
print(user_input)
print(user_input_nopunc)
# in flask
description = [word.lower().strip() for word in user_input_nopunc.split()]
clean_description = [w for w in description if w in model.wv]
sum(model.wv[word] for word in clean_description)
# take words in the description, get the vectors (if they exist), combine them and find similar with cosine similarity



# import your model
# import your description vectors

# Then  create routes and stuff




In [None]:
# Separate data in target and features variables
# Preprocess data
# Divide into train and test data

In [None]:
# Input vector data into ML algorithm
# Apply K-Means Clustering
# Generate scatter plot of results

In [None]:
# Input vector data into ML algorithm
# Apply 2nd Model (TBD)
# Generate scatter plot of results

In [None]:
# Classify using Random Forest

In [None]:
# Print accuracy score and confusion matrix