In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from rake_nltk import Rake

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davidalvarez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load dataset
wine = pd.read_csv('winemag-data-130k-v2.csv')
wine.head(2)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos


In [3]:
# Get a set of all the varieties
varieties = set(wine['variety'])
len(varieties)

708

In [4]:
# Get a set of all English stop words
english_stop_words = set(stopwords.words('english'))
len(english_stop_words)


179

In [5]:
# Working set of fluff words
new_words = set(["alongside", "include", "offering", "already", "certainly", "although", "rather", "comes", "nonetheless", "wine"])

In [6]:
# Combine English stop words, varieties and fluff words into one big set of stop words to avoid data leakage
stop_words = varieties | english_stop_words | new_words
len(stop_words)

897

In [7]:
# Use RAKE (Rapid Automatic Keyword Extraction algorithm) to extract keywords and lowercase
r = Rake()    

def rake_implement(x,r):
    r.extract_keywords_from_text(x)
    return r.get_ranked_phrases()

descriptions = wine['description'].apply(lambda x: rake_implement(x,r))
descriptions

0         [dried sage alongside brisk acidity, aromas in...
1         [juicy red berry fruits, still structured, fir...
2         [green pineapple pokes, crisp acidity undersco...
3         [orange blossom start, mango giving way, sligh...
4         [pleasantly unfussy country wine, hearty winte...
                                ...                        
129966    [yet wraps, tart tangerine, light spätlese, in...
129967    [secondary fruit compote highlights, coconut f...
129968    [drained gravel soil gives, serious structure,...
129969    [structure still developing, baked apple flavo...
129970    [rounded texture, opulent feel, lychees domina...
Name: description, Length: 129971, dtype: object

In [8]:
# # Get the textual descriptions of the wines and make lowercase
# descriptions = wine['description'].str.lower()
# # remove anything that is not a letter:
# def remove_nonletters(description):
#     return ''.join([letter for letter in description if letter in 'abcdefghijklmnopqrstuvwxyz ']).split()
# descriptions = descriptions.apply(remove_nonletters)
# descriptions.head()

In [9]:
# Lemmatizing words and joining without commas
lemmatizer = WordNetLemmatizer()
def lemmatize(description):
    return map(lemmatizer.lemmatize, description)
lemmatized_descriptions = descriptions.map(lemmatize).str.join(' ')
lemmatized_descriptions.head()

0    dried sage alongside brisk acidity aromas incl...
1    juicy red berry fruits still structured firm t...
2    green pineapple pokes crisp acidity underscori...
3    orange blossom start mango giving way slightly...
4    pleasantly unfussy country wine hearty winter ...
Name: description, dtype: object

In [10]:
stemmer = PorterStemmer()
# def stem(description):
#     return map(stemmer.stem, description)
# stemmed_descriptions = lemmatized_descriptions.map(stem).str.join(' ')
# stemmed_descriptions.head()


In [11]:
# 0    aroma includ tropic fruit broom brimston and d...
# 1    thi is ripe and fruiti a wine that is smooth w...
# 2    tart and snappi the flavor of lime flesh and r...
# 3    pineappl rind lemon pith and orang blossom sta...
# 4    much like the regular bottl from thi come acro...
# Name: description, dtype: object

In [12]:
# Vectorizing descriptions and creating matrix
tfidf_vectorizer = TfidfVectorizer(use_idf=False, stop_words=stop_words, max_features=2500, binary=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(lemmatized_descriptions)
tfidf_matrix.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# Take in an input and preprocess
# Note: this is actually a description of Stranahan's Colorado Single Malt
# which is a whiskey, not a wine

# new_description = '''Vanilla and brown sugar, plus a hefty dose of 
# dried apricot and peppery spice. This Colorado single malt shows off
# textbook American oak-derived flavors.'''

#new_description = wine.description[351]

new_description = "apple, brimstone, sage, fruit, citrus, herb"

new_description = new_description.lower()
new_description = ''.join([i for i in new_description if i in 'abcdefghijklmnopqrstuvwxzy '])
new_description = map(lemmatizer.lemmatize, new_description)
new_description = ''.join(map(stemmer.stem, new_description))
new_description

'apple brimstone sage fruit citrus herb'

In [15]:
# Vectorize input
new_vector = tfidf_vectorizer.transform([new_description])
new_vector

<1x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [16]:
# new_vector = count_vectorizer.transform([new_description])
# new_vector

In [17]:
# Find cosine similarity of input and description matrix
similarities = cosine_similarity(new_vector, tfidf_matrix)
similarities

array([[0.5976143 , 0.        , 0.        , ..., 0.        , 0.09534626,
        0.1118034 ]])

In [19]:
similarities.shape

(1, 129971)

In [92]:
np.argsort(similarities[0][-4:-1])

array([1, 0, 2])

In [143]:
# Extract most similar index
most_similar_index = np.argsort(similarities)[0][-1]
most_similar_indices = np.argsort(similarities)[0][-1:-4:-1]
print(most_similar_index)
print(most_similar_indices)

0
[     0 124533  43376]


In [54]:
# Extract most similar description
most_similar_description = ' '.join(descriptions[most_similar_index])
most_similar_description

'dried sage alongside brisk acidity aromas include tropical fruit offering unripened apple dried herb overly expressive palate citrus broom brimstone'

In [55]:
# Compare input description and most similar description
new_set = set(new_description.split()) - stop_words
print(new_set)
similar_set = set(lemmatized_descriptions[most_similar_index].split()) - stop_words
print(similar_set)

{'brimstone', 'apple', 'sage', 'herb', 'citrus', 'fruit'}
{'brimstone', 'overly', 'tropical', 'apple', 'expressive', 'sage', 'acidity', 'palate', 'dried', 'herb', 'citrus', 'fruit', 'brisk', 'aromas', 'unripened', 'broom'}


In [56]:
# Extract the shared words
common_words = new_set & similar_set
common_words

{'apple', 'brimstone', 'citrus', 'fruit', 'herb', 'sage'}

In [57]:
# Give info for the most similar wine
most_similar_wine = wine.iloc[most_similar_index]
most_similar_wine

Unnamed: 0                                                               0
country                                                              Italy
description              Aromas include tropical fruit, broom, brimston...
designation                                                   Vulkà Bianco
points                                                                  87
price                                                                  NaN
province                                                 Sicily & Sardinia
region_1                                                              Etna
region_2                                                               NaN
taster_name                                                  Kerin O’Keefe
taster_twitter_handle                                         @kerinokeefe
title                                    Nicosia 2013 Vulkà Bianco  (Etna)
variety                                                        White Blend
winery                   

In [144]:
# Returns top 3 most similar wines
most_similar_wines = wine.iloc[most_similar_indices]
most_similar_wines

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
124533,124533,Italy,This Sicilian Inzolia releases fresh aromas of...,,85,12.0,Sicily & Sardinia,Sicilia,,,,D'Alessandro 2010 Inzolia (Sicilia),Inzolia,D'Alessandro
43376,43376,US,"Very dry and slightly musty, the fruit tastes ...",,84,10.0,Idaho,Idaho,,Paul Gregutt,@paulgwine,Ste. Chapelle 1999 Chardonnay (Idaho),Chardonnay,Ste. Chapelle


In [None]:
# I love comments! #

In [58]:
wine.iloc[33553]

Unnamed: 0                                                           33553
country                                                        New Zealand
description              You've got to dig for them a little, but event...
designation                                                            NaN
points                                                                  88
price                                                                   18
province                                                       Marlborough
region_1                                                               NaN
region_2                                                               NaN
taster_name                                                            NaN
taster_twitter_handle                                                  NaN
title                      Kim Crawford 2016 Sauvignon Blanc (Marlborough)
variety                                                    Sauvignon Blanc
winery                   

In [162]:
#np.where(wine['title'].str.contains('Kim Crawford'))

(array([  2216,   4015,  12554,  33553,  34962,  40110,  45075,  50338,
         77468,  80525,  80647,  86112, 105094, 106158, 111590, 113882,
        115882, 123017, 126534, 129848]),)

In [161]:
#np.where(wine['title'].str.contains('Kim Crawford 2016 Sauvignon Blanc'))

(array([33553]),)

In [None]:
#instantiating and generating the count matrix
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(wine['title'])
#count_matrix.todense()

In [193]:
search_title = 'Kim Crawford Sauvignon Blanc'
search_title_vector = count_vectorizer.transform([new_title])

In [194]:
search_title_cosine = cosine_similarity(search_title_vector, count_matrix)
search_title

'Kim Crawford Sauvignon Blanc'

In [195]:
search_title_index = np.argsort(search_title_cosine)[0][-1]
print(search_title_index)

80647


In [196]:
wine.iloc[80647]

Unnamed: 0                                                           80647
country                                                        New Zealand
description              This medium-bodied Sauvignon Blanc strums the ...
designation                                                            NaN
points                                                                  90
price                                                                   19
province                                                       Marlborough
region_1                                                               NaN
region_2                                                               NaN
taster_name                                                 Joe Czerwinski
taster_twitter_handle                                               @JoeCz
title                      Kim Crawford 2008 Sauvignon Blanc (Marlborough)
variety                                                    Sauvignon Blanc
winery                   

In [197]:
search_title_description = ' '.join(descriptions[search_title_index])
search_title_description

'bodied sauvignon blanc strums wine finishes long crushed tomato leaf right chords pristine cleanliness pretty melody creaminesss rounds combining notes citrusy fruit midpalate medium make hint'

In [198]:
search_title_description = search_title_description.lower()
search_title_description = ''.join([i for i in search_title_description if i in 'abcdefghijklmnopqrstuvwxzy '])
search_title_description = map(lemmatizer.lemmatize, search_title_description)
search_title_description = ''.join(map(stemmer.stem, search_title_description))
search_title_description

'bodied sauvignon blanc strums wine finishes long crushed tomato leaf right chords pristine cleanliness pretty melody creaminesss rounds combining notes citrusy fruit midpalate medium make hint'

In [199]:
new_title_vector = tfidf_vectorizer.transform([search_title_description])
new_title_vector

<1x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [200]:
new_title_similarities = cosine_similarity(new_title_vector, tfidf_matrix)
new_title_similarities

array([[0.06131393, 0.        , 0.        , ..., 0.        , 0.        ,
        0.05735393]])

In [208]:
title_most_similar_index = np.argsort(new_title_similarities)[0][-2]
title_most_similar_indices = np.argsort(new_title_similarities)[0][-2:-5:-1]
print(title_most_similar_index)
print(title_most_similar_indices)

37261
[ 37261 109718  80278]


In [209]:
new_set_title = set(search_title_description.split()) - stop_words
print(new_set_title)
similar_set_title = set(lemmatized_descriptions[title_most_similar_index].split()) - stop_words
print(similar_set_title)

{'midpalate', 'creaminesss', 'strums', 'blanc', 'tomato', 'make', 'crushed', 'notes', 'chords', 'pretty', 'fruit', 'combining', 'rounds', 'finishes', 'citrusy', 'right', 'bodied', 'long', 'sauvignon', 'melody', 'pristine', 'leaf', 'cleanliness', 'hint', 'medium'}
{'blend', 'tomato', 'pea', 'blanc', 'notes', 'hit', 'herbal', 'fruit', 'slightly', 'right', 'bodied', 'sauvignon', 'stone', 'leaf', 'hint', 'grapefruit', 'snow', 'medium', 'plump'}


In [210]:
common_words_title = new_set_title & similar_set_title
common_words_title

{'blanc',
 'bodied',
 'fruit',
 'hint',
 'leaf',
 'medium',
 'notes',
 'right',
 'sauvignon',
 'tomato'}

In [212]:
most_similar_wines_title = wine.iloc[title_most_similar_indices]
most_similar_wines_title

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
37261,37261,New Zealand,"This is a medium-bodied, slightly plump Sauvig...",,88,15.0,Marlborough,,,Joe Czerwinski,@JoeCz,Twin Islands 2013 Sauvignon Blanc (Marlborough),Sauvignon Blanc,Twin Islands
109718,109718,New Zealand,"This is a medium-bodied, slightly plump Sauvig...",,88,15.0,Marlborough,,,Joe Czerwinski,@JoeCz,Twin Islands 2013 Sauvignon Blanc (Marlborough),Sauvignon Blanc,Twin Islands
80278,80278,New Zealand,This is on the more herbal side of the Sauvign...,Single Vineyard,88,15.0,Marlborough,,,Joe Czerwinski,@JoeCz,Mount Fishtail 2014 Single Vineyard Sauvignon ...,Sauvignon Blanc,Mount Fishtail


In [186]:
most_similar_title_wine = wine.iloc[most_similar_title_index]
most_similar_title_wine

Unnamed: 0                                                          123017
country                                                        New Zealand
description              Grassy, herbal notes accent grapefruit and lim...
designation                                         Small Parcels Spitfire
points                                                                  90
price                                                                   26
province                                                       Marlborough
region_1                                                               NaN
region_2                                                               NaN
taster_name                                                 Joe Czerwinski
taster_twitter_handle                                               @JoeCz
title                    Kim Crawford 2014 Small Parcels Spitfire Sauvi...
variety                                                    Sauvignon Blanc
winery                   