In [1]:
import pandas as pd
#Loading in my data set
metadata = pd.read_csv('wine_metadata.csv', low_memory=False)
metadata.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0.0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87.0,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1.0,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87.0,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2.0,US,"Tart and snappy, the flavors of lime flesh and...",,87.0,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3.0,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87.0,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4.0,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87.0,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [2]:
#Take the top 10% of all wines in the data set
m = metadata['points'].quantile(0.90)
print(m)

92.0


In [3]:
# Filter out all qualified wines into a new DataFrame
top_wines = metadata.copy().loc[metadata['points'] >= m]
top_wines.shape

(7605, 14)

In [4]:
metadata.shape

(129868, 14)

In [5]:
#Put these wines into an ordered list based on their points
top_wines = top_wines.sort_values('points', ascending=False)
#Print the top 10
top_wines[['designation', 'country', 'variety', 'points']].head(10)

Unnamed: 0,designation,country,variety,points
7335,Occhio di Pernice,Italy,Prugnolo Gentile,100.0
36528,Brut,France,Champagne Blend,100.0
345,Rare,Australia,Muscat,100.0
39286,Masseto,Italy,Merlot,100.0
42197,Barca-Velha,Portugal,Portuguese Red,100.0
41835,Cuvée Constance 500ml,France,Chenin Blanc,99.0
39287,Messorio,Italy,Merlot,99.0
44994,Cà d'Morissio Riserva,Italy,Nebbiolo,99.0
36529,Clos du Mesnil Brut Blanc de Blancs,France,Chardonnay,99.0
1557,Precious Mountain Vineyard,US,Pinot Noir,99.0


In [6]:
#Print the description field
metadata['description'].head()

0    Aromas include tropical fruit, broom, brimston...
1    This is ripe and fruity, a wine that is smooth...
2    Tart and snappy, the flavors of lime flesh and...
3    Pineapple rind, lemon pith and orange blossom ...
4    Much like the regular bottling from 2012, this...
Name: description, dtype: object

In [7]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Drop all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['description'] = metadata['description'].fillna('')

#Drop any empty fields
metadata = metadata.dropna()

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['description'])

#Output the shape of the matrix (X, Y), which has X entries and Y different feature words used in these entries. 
tfidf_matrix.shape

(7587, 9510)

In [8]:
#Show me the last 10 "feature words".
tfidf.get_feature_names()[9500:9510]

['zones',
 'zooming',
 'zooms',
 'zotovich',
 'zucca',
 'élévage',
 'émilion',
 'étoile',
 'über',
 'überbest']

In [9]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
cosine_sim.shape

(7587, 7587)

In [11]:
cosine_sim[1]

array([0.00355367, 1.        , 0.00379597, ..., 0.0045665 , 0.03366889,
       0.00364933])

In [12]:
#Construct a reverse map of indices and wine names
indices = pd.Series(metadata.index, index=metadata['designation']).drop_duplicates()

In [13]:
indices[:10]

designation
Vintner's Reserve Wild Child Block     4
Mountain Cuvée                        10
Signature Selection                   23
King Ridge Vineyard                   25
Hyland                                35
Estate                                60
Alder Ridge Vineyard                  62
Golden Horn                           64
Inspired                              67
Old Vine                              71
dtype: int64

In [14]:
#This function accepts a wine's designation (i.e. its name) as input and outputs the most similar wines
def rec_wine(designation, cosine_sim=cosine_sim):
    # Get the index of the inputted wine
    idx = indices[designation]

    #List all cos_sim scores by their indices
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the wines by similarity to the inputted wine
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 5 most similar wines
    sim_scores = sim_scores[1:6]

    # Get the wine indices
    wine_indices = [i[0] for i in sim_scores]

    # Return the designations (names) of the top 5 most similar wines
    return metadata['designation'].iloc[wine_indices]