In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/land/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
wine = pd.read_csv('winemag-data-130k-v2.csv')
wine.head(2)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos


In [3]:
# Get a set of all the varieties
varieties = set(wine['variety'])
len(varieties)

708

In [4]:
# Get a st of all English stop words
english_stop_words = set(stopwords.words('english'))
len(english_stop_words)


179

In [5]:
# Combine English Stop Words and Varieties into one big set of stop words to avoid data leakage
stop_words = varieties | english_stop_words
len(stop_words)

887

In [6]:
# Get the textual descriptions of the wines and make lowercase
descriptions = wine['description'].str.lower()
# remove anything that is not a letter:
def remove_nonletters(description):
    return ''.join([letter for letter in description if letter in 'abcdefghijklmnopqrstuvwxyz ']).split()
descriptions = descriptions.apply(remove_nonletters)
descriptions.head()

0    [aromas, include, tropical, fruit, broom, brim...
1    [this, is, ripe, and, fruity, a, wine, that, i...
2    [tart, and, snappy, the, flavors, of, lime, fl...
3    [pineapple, rind, lemon, pith, and, orange, bl...
4    [much, like, the, regular, bottling, from, thi...
Name: description, dtype: object

In [7]:
lemmatizer = WordNetLemmatizer()
def lemmatize(description):
    return map(lemmatizer.lemmatize, description)
lemmatized_descriptions = descriptions.map(lemmatize)
lemmatized_descriptions.head()

0    <map object at 0x7f5a50116690>
1    <map object at 0x7f5a50116750>
2    <map object at 0x7f5a501164d0>
3    <map object at 0x7f5a50116650>
4    <map object at 0x7f5a50116490>
Name: description, dtype: object

In [8]:
stemmer = PorterStemmer()
def stem(description):
    return map(stemmer.stem, description)
stemmed_descriptions = lemmatized_descriptions.map(stem).str.join(' ')
stemmed_descriptions.head()


0    aroma includ tropic fruit broom brimston and d...
1    thi is ripe and fruiti a wine that is smooth w...
2    tart and snappi the flavor of lime flesh and r...
3    pineappl rind lemon pith and orang blossom sta...
4    much like the regular bottl from thi come acro...
Name: description, dtype: object

In [9]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=2500)
tfidf_matrix = tfidf_vectorizer.fit_transform(stemmed_descriptions)
tfidf_matrix.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Note: this is actually a description of Stranahan's Colorado Single Malt
# which is a whiskey, not a wine

new_description = '''Vanilla and brown sugar, plus a hefty dose of 
dried apricot and peppery spice. This Colorado single malt shows off
textbook American oak-derived flavors.'''

new_description = new_description.lower()
new_description = ''.join([i for i in new_description if i in 'abcdefghijklmnopqrstuvwxzy '])
new_description = map(lemmatizer.lemmatize, new_description)
new_description = ''.join(map(stemmer.stem, new_description))
new_description

'vanilla and brown sugar plus a hefty dose of dried apricot and peppery spice this colorado single malt shows offtextbook american oakderived flavors'

In [11]:
new_vector = tfidf_vectorizer.transform([new_description])
new_vector

<1x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [12]:
similarities = cosine_similarity(new_vector, tfidf_matrix)
similarities

array([[0.        , 0.        , 0.        , ..., 0.0269923 , 0.03014873,
        0.        ]])

In [23]:
most_similar_index = np.argsort(similarities)[0][-1]
most_similar_index

106617

In [24]:
most_similar_description = ' '.join(descriptions[most_similar_index])
most_similar_description

'from the fine spring mountain appellation this has sweetandsour flavors of apricots pineapples and brown sugar'

In [25]:
new_set = set(new_description.split()) - stop_words
print(new_set)
similar_set = set(stemmed_descriptions[most_similar_index].split()) - stop_words
print(similar_set)

{'single', 'apricot', 'oakderived', 'plus', 'peppery', 'spice', 'american', 'vanilla', 'dose', 'hefty', 'sugar', 'offtextbook', 'colorado', 'dried', 'malt', 'brown', 'flavors', 'shows'}
{'ha', 'sweetandsour', 'apricot', 'appel', 'mountain', 'thi', 'spring', 'fine', 'pineappl', 'sugar', 'brown', 'flavor'}


In [26]:
common_words = new_set & similar_set
common_words

{'apricot', 'brown', 'sugar'}

In [27]:
most_similar_wine = wine.iloc[most_similar_index]
most_similar_wine

Unnamed: 0                                                          106617
country                                                                 US
description              From the fine Spring Mountain appellation, thi...
designation                                                            NaN
points                                                                  82
price                                                                   29
province                                                        California
region_1                                          Spring Mountain District
region_2                                                              Napa
taster_name                                                            NaN
taster_twitter_handle                                                  NaN
title                    Keenan 2006 Chardonnay (Spring Mountain District)
variety                                                         Chardonnay
winery                   