# Cleaning Data

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.metrics.pairwise import pairwise_kernels
import pickle
import os.path

In [None]:
# Read in CSV file
# Path of the input CSV, relative to the working directory
wine_file = 'db/final_data.csv'

In [None]:
df = pd.read_csv(wine_file)

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Drop rows without a price
df = df[df.price.notnull()]

In [None]:
# Non-null count per column after the price filter
df.count()

In [None]:
# Drop rows without a designation
df = df[df.designation.notnull()]

In [None]:
# Non-null count per column after the designation filter
df.count()

In [None]:
# Drop rows without a region_1
df = df[df.region_1.notnull()]

In [None]:
# Non-null count per column after the region_1 filter
df.count()

In [None]:
# Number of rows left after the three filters
len(df)

In [None]:
# Export file as a CSV
# NOTE: the filters above never reset the index, so df still carries the row
# labels of the unfiltered file, and to_csv writes that index as the first,
# unnamed column (index=True is the default).
df.to_csv('db/clean_final.csv')

# Recommender System with Test Data

In [None]:
# Define the TD-IDF Vectorizer Object and remove english stop words
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Construct the matrix
tfidf = tfidf.fit_transform(df['description'])

In [None]:
# Serialize indices data using pickle
with open('tfidf.pk1', 'wb') as pickle_file:
    pickle.dump(tfidf, pickle_file)

In [None]:
# Shape of matrix; 40,999 wines described by 20,595 different words
tfidf.shape

In [None]:
# Compute the cosine similarity matrix
cosine_sim = pairwise_kernels(tfidf, tfidf, metric='cosine', n_jobs=-1)

In [None]:
# Construct a reverse map from wine title to row *position*.
# df.index still carries the labels of the unfiltered CSV (rows were dropped
# without resetting it), whereas the rows of cosine_sim and the .iloc lookup in
# get_recommendations are positional, so positions are what must be stored.
indices = pd.Series(range(len(df)), index=df['title'])
# Keep only the first row of a repeated title so a lookup yields one scalar.
# (Series.drop_duplicates() compares the values, which are already unique.)
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Function that takes wine title as an input and outputs most similar wines within our dataset
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the five wines most similar to ``title``.

    Args:
        title: Wine title to look up in the module-level ``indices`` Series.
        cosine_sim: Square similarity matrix whose rows and columns follow the
            row order of the module-level ``df``. Defaults to the matrix that
            was bound to ``cosine_sim`` when this function was defined.

    Returns:
        The entries of ``df['title']`` for up to five wines, most similar
        first. The queried wine itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Get index of wine that matches title
    idx = indices[title]
    # A title that occurs more than once yields a Series of rows; use the first
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    # Pair every other wine with its similarity to the input wine. The input
    # wine is excluded by position rather than by assuming it sorts first:
    # wines with identical features tie at the top score, and a stable sort
    # would then place earlier rows ahead of the input wine.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # Sort wines based on the similarity scores, best match first
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Keep the 5 most similar wines
    sim_scores = sim_scores[:5]
    # Get the wine indices
    wine_indices = [i[0] for i in sim_scores]
    # Return the top 5 most similar wines
    return df['title'].iloc[wine_indices]


In [None]:
# Example lookup using the default matrix (cosine_sim, built from the TF-IDF of the descriptions)
get_recommendations('Cantine di Dolianova 2010 Dolia  (Monica di Sardegna)')

In [None]:
def clean_data(x):
    """Collapse a value into lower-case text without spaces.

    A string is returned with its spaces removed and lower-cased; a list is
    returned as a new list with every element treated the same way. Any other
    value (for example a missing taster stored as NaN) becomes ''.
    """
    def _squash(text):
        # Remove the spaces first, then lower-case what is left
        return text.replace(" ", "").lower()

    if isinstance(x, str):
        return _squash(x)
    if isinstance(x, list):
        return list(map(_squash, x))
    # Neither text nor a list of text: treat the value as missing
    return ''

In [None]:
# Metadata columns that feed the "soup" used for the second similarity matrix
features = ['taster_name', 'variety', 'province']

# Overwrite each column with its normalised form (lower-case, spaces removed)
for feature in features:
    df[feature] = df[feature].map(clean_data)

In [None]:
def create_soup(x):
    """Build the space-separated "soup" of metadata tokens for one row.

    Args:
        x: Row-like mapping (for example a DataFrame row) with
            ``taster_name``, ``variety`` and ``province`` entries. Each entry
            is either a string or a list of strings, the two shapes that
            ``clean_data`` returns.

    Returns:
        The non-empty field values joined by single spaces, in the order
        taster_name, variety, province.
    """
    tokens = []
    for field in ('taster_name', 'variety', 'province'):
        value = x[field]
        # A string is already one token; str.join would iterate over it
        # character by character and space the letters apart.
        if isinstance(value, str):
            value = [value]
        # Skip empty values so missing fields leave no stray separators
        tokens.extend(token for token in value if token)
    return ' '.join(tokens)

In [None]:
# Create a new soup feature
# axis=1 hands each row to create_soup, producing one string per wine
df['soup'] = df.apply(create_soup, axis=1)

In [None]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts (not TF-IDF weights) over the soup strings, minus English stop words.
# NOTE: CountVectorizer's default token pattern only keeps tokens of two or
# more word characters, so single-character tokens in the soup are ignored.
count = CountVectorizer(stop_words='english')
# One row per wine, in the current row order of df
count_matrix = count.fit_transform(df['soup'])

In [None]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity of every wine with every other wine, based on the soup counts
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Reset index of your main DataFrame and construct reverse mapping as before.
# After the reset df.index is 0..n-1, so each label equals the row position
# that the similarity matrices and .iloc expect. The old labels are kept in a
# new 'index' column.
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])
# Keep only the first row of a repeated title so indices[title] is always a
# single row number rather than a Series of several rows.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Same lookup, now ranked with the metadata-based matrix (cosine_sim2) instead of the default
get_recommendations('Cantine di Dolianova 2010 Dolia  (Monica di Sardegna)', cosine_sim2)

In [None]:
# Keep the recommendations in a variable so their details can be printed in the next cell
result = get_recommendations('Cantine di Dolianova 2010 Dolia  (Monica di Sardegna)', cosine_sim2)

In [None]:
# Print a one-line summary (title, price, points, grape variety) for every
# recommended wine, looking each one up in df by its index label.
summary_columns = ['title', 'price', 'points', 'variety']
for x in result.index:
    title, price, points, variety = df.loc[x, summary_columns]
    print(f"{title}, price: ${price}, points: {points}, grape type: {variety}")