In [73]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import hvplot.pandas
from sklearn.manifold import TSNE

In [74]:
# Read in data from csv file
wine_data = Path("Resources/wine_tasting_data.csv")
# Convert to dataframe
wine_df = pd.read_csv(wine_data)
wine_df.head(5)

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [75]:
# Clean and transform data
# Drop unnecessary columns
df = wine_df.loc[:,["country", "description", "points",
                                    "province", "variety"]]
# Drop null values
df = df.dropna(how='any')
df.head(5)

Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,Douro,Portuguese Red
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir


In [76]:
# List number of descriptions for each country
df["country"].value_counts()

US                        54504
France                    22093
Italy                     19540
Spain                      6645
Portugal                   5691
Chile                      4471
Argentina                  3800
Austria                    3345
Australia                  2329
Germany                    2165
New Zealand                1419
South Africa               1401
Israel                      505
Greece                      466
Canada                      257
Hungary                     146
Bulgaria                    141
Romania                     120
Uruguay                     109
Turkey                       90
Slovenia                     87
Georgia                      86
England                      74
Croatia                      73
Mexico                       70
Moldova                      59
Brazil                       52
Lebanon                      35
Morocco                      28
Peru                         16
Ukraine                      14
Serbia  

In [77]:
# Create list of country names to keep
countries_to_keep = ["US", "France", "Italy", "Spain", "Portugal", "Chile", "Argentina", "Austria",
                  "Australia", "Germany", "New Zealand", "South Africa"]

In [78]:
# Drop countries with less than 1000 descriptions
# Choose cutoff value to create list of countries
countries_to_replace = values.index[values<1000]

# Replace in dataframe
for ctry in countries_to_replace:
    df["country"] = df["country"].replace(ctry, "Other")

# Check to make sure binning was successful
df["country"].value_counts()

reduced_df=df.loc[df["country"].isin(countries_to_keep), :]
reduced_df.head()


Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,Douro,Portuguese Red
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir


In [79]:
# Transform words into vectors using NLP
# Drop stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
reduced_df['description'] = reduced_df['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
reduced_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\katyp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_df['description'] = reduced_df['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This ripe fruity, wine smooth still structured...",87,Douro,Portuguese Red
2,US,"Tart snappy, flavors lime flesh rind dominate....",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith orange blossom star...",87,Michigan,Riesling
4,US,"Much like regular bottling 2012, comes across ...",87,Oregon,Pinot Noir


In [80]:
# Lemmatization
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    return ' '.join(words)
reduced_df['description'] = reduced_df['description'].apply(lemmatize_words)
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\katyp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_df['description'] = reduced_df['description'].apply(lemmatize_words)


Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,Douro,Portuguese Red
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir


In [87]:
# Tokenize descriptions to split sentences into word components
descriptions = reduced_df.description.apply(gensim.utils.simple_preprocess)
descriptions

0         [aromas, include, tropical, fruit, broom, brim...
1         [this, ripe, fruity, wine, smooth, still, stru...
2         [tart, snappy, flavor, lime, flesh, rind, domi...
3         [pineapple, rind, lemon, pith, orange, blossom...
4         [much, like, regular, bottle, come, across, ra...
                                ...                        
129966    [notes, honeysuckle, cantaloupe, sweeten, deli...
129967    [citation, give, much, decade, bottle, age, pr...
129968    [well, drained, gravel, soil, give, wine, cris...
129969    [dry, style, pinot, gris, crisp, acidity, it, ...
129970    [big, rich, off, dry, power, intense, spicines...
Name: description, Length: 127403, dtype: object

In [88]:
# Word2Vec model
# Create the model
model = gensim.models.Word2Vec(window=1, min_count=1, workers=8)
# Build its vocabulary
model.build_vocab(descriptions, progress_per=100)
# Train the model
model.train(descriptions, total_examples=model.corpus_count, epochs= model.epochs)
# Save the model to use later
model.save("Resources/descriptions.model")

In [89]:
# Test the model looking for similar words
model.wv.most_similar("bright")

[('vibrant', 0.8117208480834961),
 ('lively', 0.7254738211631775),
 ('buoyant', 0.7095807194709778),
 ('fragrant', 0.7078791856765747),
 ('fresh', 0.6873645186424255),
 ('zippy', 0.6827369332313538),
 ('pristine', 0.6824049949645996),
 ('tangy', 0.6778289079666138),
 ('vivacious', 0.6745824217796326),
 ('zesty', 0.6710382699966431)]

In [90]:
# Test the model with similarity of different words
model.wv.similarity(w1="sweet",w2="fruity")

0.40857884

In [91]:
# Get vector representations of descriptions
# Define our function that creates description vectors:
def get_desc_vec(document):
    return np.array(sum(model.wv[word] for word in document)/len(document))

desc_vecs = descriptions.apply(get_desc_vec).tolist()

# for 
# sum(model.wv[d] for d in descriptions[0])/len(descriptions[0])
# model.wv['aromas']
# for desc in descriptions

In [92]:
desc_vec_df = pd.DataFrame(desc_vecs)

In [93]:
desc_vec_df.values

array([[-0.55449146,  0.19766429, -0.30084243, ..., -0.05302555,
        -0.25185087,  0.6412549 ],
       [-0.6162724 ,  0.08403193,  0.10849904, ..., -0.19549136,
         0.14542754,  0.20628357],
       [-0.30113566,  0.01148193, -0.10268091, ..., -0.11800358,
        -0.09291719,  0.47837812],
       ...,
       [-0.577846  ,  0.0466941 , -0.13651074, ..., -0.10573187,
         0.14370443,  0.08180183],
       [-0.5669327 ,  0.13558073, -0.1966822 , ...,  0.08868691,
        -0.00208101,  0.24837333],
       [-0.39772883,  0.14820276, -0.0643643 , ...,  0.0228875 ,
         0.12213542,  0.26866028]], dtype=float32)

In [94]:
# Elbow Method to find the optimal number of clusters
num_clusters_range = range(1, 11)

wcss = []
for k in num_clusters_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(desc_vec_df)
    wcss.append(kmeans.inertia_)

# Create a DataFrame for the elbow curve
df_elbow = pd.DataFrame({"k_value": num_clusters_range, "inertia_value": wcss})

# Plot the elbow curve using hvplot
elbow_curve = df_elbow.hvplot.line(x="k_value", y="inertia_value", title="Elbow Curve")
elbow_curve



In [96]:
# Apply k-means clustering with the optimal number of clusters
optimal_k = 4 # Add the optimal k value based on the elbow method
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_assignments = kmeans.fit_predict(desc_vec_df)

# Add the cluster assignments to your dataframe
reduced_df['cluster'] = cluster_assignments

# Display the clusters
for cluster_id in range(optimal_k):
    print(f"Cluster {cluster_id}:")
    print(reduced_df[reduced_df['cluster'] == cluster_id]['variety'].value_counts().head())
    print("\n")



Cluster 0:
Red Blend             5671
Cabernet Sauvignon    4563
Pinot Noir            4220
Nebbiolo              2237
Syrah                 2151
Name: variety, dtype: int64


Cluster 1:
Chardonnay         6100
Riesling           4041
Sauvignon Blanc    3028
White Blend        1429
Sparkling Blend    1393
Name: variety, dtype: int64


Cluster 2:
Bordeaux-style Red Blend    3767
Chardonnay                  2321
Pinot Noir                  2183
Portuguese Red              1986
Rosé                        1374
Name: variety, dtype: int64


Cluster 3:
Pinot Noir                  6286
Cabernet Sauvignon          4150
Chardonnay                  3021
Red Blend                   2506
Bordeaux-style Red Blend    1920
Name: variety, dtype: int64




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_df['cluster'] = cluster_assignments


In [97]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=2)

In [98]:
# Fit the PCA model on the vectors DataFrame
desc_vec_pca = pca.fit_transform(desc_vec_df)

# Review the first 5 rows of list data
desc_vec_pca[:5]

array([[-0.7985846 ,  0.49699354],
       [ 0.43304646, -0.1053462 ],
       [-0.25528803,  1.0839775 ],
       [-0.69594145,  1.0208821 ],
       [ 0.88764244, -0.3716799 ]], dtype=float32)

In [100]:
# Calculate the PCA explained variance ratio
pca.explained_variance_ratio_
print(pca.explained_variance_ratio_)
print("Total explained variance = .29")

[0.16012695 0.12832723]
Total explained variance = .29


In [101]:
# Create the PCA DataFrame
desc_vec_pca_df = pd.DataFrame(
    desc_vec_pca,
    columns=["PCA1", "PCA2"]
)

# Review the PCA DataFrame
desc_vec_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-0.798585,0.496994
1,0.433046,-0.105346
2,-0.255288,1.083977
3,-0.695941,1.020882
4,0.887642,-0.37168


In [102]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(desc_vec_pca_df)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()



Unnamed: 0,k,inertia
0,1,74032.15625
1,2,46826.914062
2,3,26515.660156
3,4,20805.017578
4,5,15961.84082


In [103]:
# Plot the Elbow Curve
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)


In [104]:
# Determine optimal number of clusters based on elbow frame above
print("Optimal number of clusters = 5")

Optimal number of clusters = 5


In [105]:
# Define the model with 5 clusters
model = KMeans(n_clusters=5, random_state=1)

# Fit the model
model.fit(desc_vec_pca_df)

# Make predictions
k_5 = model.predict(desc_vec_pca_df)

# Create a copy of the PCA DataFrame
desc_vec_pca_predictions_df = desc_vec_pca_df.copy()

# Add a class column with the labels
desc_vec_pca_predictions_df["wine_segments"] = k_5
desc_vec_pca_predictions_df.head()



Unnamed: 0,PCA1,PCA2,wine_segments
0,-0.798585,0.496994,1
1,0.433046,-0.105346,4
2,-0.255288,1.083977,1
3,-0.695941,1.020882,1
4,0.887642,-0.37168,4


In [106]:
# Plot the clusters
desc_vec_pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="wine_segments"
)

In [107]:
cluster_wine_df=reduced_df.copy()
cluster_wine_df["wine_segments"]=k_5
sorted_df=cluster_wine_df.sort_values("wine_segments")
sorted_df.head(20)

Unnamed: 0,country,description,points,province,variety,cluster,wine_segments
129970,France,"Big, rich off-dry, power intense spiciness rou...",90,Alsace,Gewürztraminer,2,0
24624,Italy,"Crisp citrusy support aromas melon peach, clea...",87,Southern Italy,Falanghina,1,0
76610,US,"Part Limited Release series, quite dry despite...",88,Washington,Riesling,3,0
76618,US,"This wine well-rounded, lightly toasty, soft p...",91,Oregon,Pinot Noir,3,0
24620,US,"Strong fruit powerful oak, make popular Califo...",87,California,Chardonnay,3,0
76624,France,A restrain nose give little away palate reveal...,90,Alsace,Sparkling Blend,1,0
76627,France,Not ripe apple juicy yellow plum note appear n...,90,Alsace,Sparkling Blend,1,0
76628,US,"Principally Dundee Hills fruit, like bowl ripe...",90,Oregon,Pinot Gris,3,0
24615,Italy,This blend Friulano Pinot Bianco Friuli northe...,87,Northeastern Italy,White Blend,3,0
24614,Italy,"Rather standard red cherry flavors, unique Rip...",87,Veneto,Red Blend,1,0


In [None]:
# Use TSNE as our clustering algorithm
# Build the TSNE model
tsne_model=TSNE(metric='cosine', perplexity=50, n_components=2, learning_rate='auto',
                  init='random')

In [31]:
sorted_df["country"].value_counts()

US                        54504
France                    22093
Italy                     19540
Spain                      6645
Portugal                   5691
Chile                      4471
Argentina                  3800
Austria                    3345
Australia                  2329
Germany                    2165
New Zealand                1419
South Africa               1401
Israel                      505
Greece                      466
Canada                      257
Hungary                     146
Bulgaria                    141
Romania                     120
Uruguay                     109
Turkey                       90
Slovenia                     87
Georgia                      86
England                      74
Croatia                      73
Mexico                       70
Moldova                      59
Brazil                       52
Lebanon                      35
Morocco                      28
Peru                         16
Ukraine                      14
Macedoni

In [None]:
# Fit the TSNE model
tsne_model.fit(desc_vec_df)

In [None]:
# Make predictions about the wine recommendations based on the trained model
wine_recs = tsne_model.predict(desc_vec_df)

In [None]:
# X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
# X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(X)
# X_embedded.shape


# For your data it would look something like
# df = yourDataFrame
# TSNE(metric='cosine', perplexity=50, n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(df) (edited) 

In [None]:
import string

a red wine that does not smell like horse
a red horse that does not smell like wine

a tart not sweet wine
a sweet wine not tart



'your custom word here' in model.wv
user_input = 'Red wine with, juicy..... fruity dessert flavors laser-like dkljgbioerubgowrbgowergbwoergb'

# Text box that takes user input js
# [  'Red wine with juicy fruity dessert flavors' ]  => pass to flask in a post

'Red wine with juicy fruity dessert flavors'
user_input_nopunc = user_input.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
print(user_input)
print(user_input_nopunc)
# in flask
description = [word.lower().strip() for word in user_input_nopunc.split()]
clean_description = [w for w in description if w in model.wv]
sum(model.wv[word] for word in clean_description)
# take words in the description, get the vectors (if they exist), combine them and find similar with cosine similarity



# import your model
# import your description vectors

# Then  create routes and stuff




In [None]:
# Separate data in target and features variables
# Preprocess data
# Divide into train and test data

In [None]:
# Input vector data into ML algorithm
# Apply K-Means Clustering
# Generate scatter plot of results

In [None]:
# Input vector data into ML algorithm
# Apply 2nd Model (TBD)
# Generate scatter plot of results

In [None]:
# Classify using Random Forest

In [None]:
# Print accuracy score and confusion matrix