# Cleaning Data

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.metrics.pairwise import pairwise_kernels
import pickle
import os.path

In [2]:
# Read in CSV file
wine_file = 'db/first_1000.csv'

In [3]:
df = pd.read_csv(wine_file)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
1,1,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine¬¨‚Ä†,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
2,2,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
3,3,6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16,Sicily & Sardinia,Vittoria,,Kerin O‚Äö√Ñ√¥Keefe,@kerinokeefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
4,4,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [5]:
# Drop rows without a price
df = df[df.price.notnull()]

In [6]:
df.count()

Unnamed: 0               1001
id                       1001
country                  1001
description              1001
designation              1001
points                   1001
price                    1001
province                 1001
region_1                 1001
region_2                  453
taster_name               780
taster_twitter_handle     749
title                    1001
variety                  1001
winery                   1001
dtype: int64

In [7]:
# Drop rows without a designation
df = df[df.designation.notnull()]

In [8]:
df.count()

Unnamed: 0               1001
id                       1001
country                  1001
description              1001
designation              1001
points                   1001
price                    1001
province                 1001
region_1                 1001
region_2                  453
taster_name               780
taster_twitter_handle     749
title                    1001
variety                  1001
winery                   1001
dtype: int64

In [9]:
df = df[df.region_1.notnull()]

In [10]:
df.count()

Unnamed: 0               1001
id                       1001
country                  1001
description              1001
designation              1001
points                   1001
price                    1001
province                 1001
region_1                 1001
region_2                  453
taster_name               780
taster_twitter_handle     749
title                    1001
variety                  1001
winery                   1001
dtype: int64

# Recommender System with Test Data

In [11]:
# Define the TD-IDF Vectorizer Object and remove english stop words
tfidf = TfidfVectorizer(stop_words='english')

In [12]:
# Construct the matrix
tfidf = tfidf.fit_transform(df['description'])

In [13]:
# Shape of matrix; 70,175 wines described by 24,253 different words
tfidf.shape

(1001, 3994)

In [14]:
# Compute the cosine similarity matrix
# tfidf shows rate of when words appear in entire dataset
cosine_sim = pairwise_kernels(tfidf, tfidf, metric='cosine', n_jobs=-1)

In [15]:
# Construct a reverse map of indices and wine titles
indices = pd.Series(df.index, index=df['title']).drop_duplicates()


In [16]:
# Function that takes wine title as an input and outputs most similar wines within our dataset
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get index of wine that matches title
    idx = indices[title]
    # Get the pairwise similarity scores of all wines with the input wine
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort wines based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 5 most similar wines
    sim_scores = sim_scores[1:6]
    # Get the wine indices
    wine_indices = [i[0] for i in sim_scores]
    # Return the top 5 most similar wines
    # Figure out how to return price and points
    return df['title'].iloc[wine_indices]


In [17]:
get_recommendations('Cantine di Dolianova 2010 Dolia  (Monica di Sardegna)')

455    Paternoster 2007 Synthesi  (Aglianico del Vult...
773    Capolino Perlingieri 2011 Preta Falanghina (Sa...
11     Bianchi 2011 Signature Selection Merlot (Paso ...
873    Orion Wines 2016 The Wanted Zin Blush Rosato (...
650    Carpineto 2006 Sant'Ercolano  (Vino Nobile di ...
Name: title, dtype: object

In [18]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if taster exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [19]:
features = ['taster_name', 'variety', 'province']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [20]:
def create_soup(x):
    return ' '.join(x['taster_name']) + ' ' + ' '.join(x['variety']) + ' '.join(x['province'])

In [21]:
# Create a new soup feature
df['soup'] = df.apply(create_soup, axis=1)

In [22]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [23]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [24]:
# Reset index of your main DataFrame and construct reverse mapping as before
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [25]:
get_recommendations('Cantine di Dolianova 2010 Dolia  (Monica di Sardegna)', cosine_sim2)

1    Sweet Cheeks 2012 Vintner's Reserve Wild Child...
2    Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...
3     Terre di Giurfo 2013 Belsito Frappato (Vittoria)
4    Jean-Baptiste Adam 2012 Les Natures Pinot Gris...
5    Kirkland Signature 2011 Mountain Cuv‚àö¬©e Cab...
Name: title, dtype: object

In [29]:
result = get_recommendations('Cantine di Dolianova 2010 Dolia  (Monica di Sardegna)', cosine_sim2)

In [31]:
for x in result.index:
    price = df.loc[x,'price']
    title = df.loc[x,'title']
    points = df.loc[x, 'points']
    variety = df.loc[x, 'variety']
    print(f"{title}, price: ${price}, points: {points}, grape type: {variety}")

Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pinot Noir (Willamette Valley), price: $65, points: 87, grape type: pinotnoir
Tandem 2011 Ars In Vitro Tempranillo-Merlot (Navarra), price: $15, points: 87, grape type: tempranillo-merlot
Terre di Giurfo 2013 Belsito Frappato (Vittoria), price: $16, points: 87, grape type: frappato
Jean-Baptiste Adam 2012 Les Natures Pinot Gris (Alsace), price: $27, points: 87, grape type: pinotgris
Kirkland Signature 2011 Mountain Cuv‚àö¬©e Cabernet Sauvignon (Napa Valley), price: $19, points: 87, grape type: cabernetsauvignon
