# **ME 193, Spring 2021**
## Final Project Notebook 


**Notes**:

1. 

In [6]:
import os
import numpy as np
import pandas as pd
from IPython.display import JSON
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.tree as tree
import sklearn.metrics as mt
from sklearn.metrics import classification_report
import sklearn.ensemble as ens
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors
import sklearn.model_selection as ms
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize


# string workings
import string
from operator import itemgetter
from collections import Counter, OrderedDict

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

# Amazon ML
import boto3
import sagemaker
from sagemaker import get_execution_role

## Load the Kaggle wine-review dataset

In [7]:
# Read the wine reviews, report how many rows were loaded, and give every row
# a "<winery> <variety>" display name.
reviews_csv = os.path.join("Data", "winemag-data_first150k.csv")
df_reds = pd.read_csv(reviews_csv)
print(len(df_reds))
# 'Unnamed: 0' is presumably the row index that was saved into the CSV — it is
# not used anywhere, so it is removed.
df_reds = df_reds.drop(columns=['Unnamed: 0'])
df_reds['name'] = df_reds['winery'] + ' ' + df_reds['variety']
df_reds.head()

150930


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,name
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,Heitz Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,Bodega Carmen Rodríguez Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,Macauley Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,Ponzi Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude,Domaine de la Bégude Provence red blend


In [8]:
# Sample description: the review text stored under index label 0.
df_reds.loc[0, 'description']

'This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.'

In [9]:
# Hold out 20% of the (description, name) pairs; the fixed random_state keeps
# the split repeatable between runs.
desc_train, desc_test, name_train, name_test = train_test_split(
    df_reds['description'],
    df_reds['name'],
    test_size=0.2,
    random_state=42,
)
# Re-pair the held-out names and descriptions in a frame with a fresh index.
held_out_rows = list(zip(name_test, desc_test))
df_des_test = pd.DataFrame(held_out_rows, columns=['name', 'description'])

In [10]:
# Training frame: the train-split names and descriptions side by side
# (alternative kept for reference: df_des = df_reds[['name','description']]).
training_rows = list(zip(name_train, desc_train))
df_des = pd.DataFrame(training_rows, columns=['name', 'description'])
df_des.head()

Unnamed: 0,name,description
0,Bonterra Sauvignon Blanc,"Nicely dry and crisp in clean acidity, this Sa..."
1,FiàNobile Nero d'Avola,"This Nero d'Avola has aromas of earth, black c..."
2,Sada Vermentino,This elegant Vermentino from Tuscany has a lov...
3,Keenan Merlot,"Not many Merlots deserve time in the cellar, b..."
4,Paumanok Cabernet Sauvignon,"Crisp, elegant black-plum and cassis flavors a..."


## Preprocessing text descriptions

In [11]:
# Gather every training description into one corpus string and split that
# string into sentences.
all_descriptions = list(df_des["description"])
full_corpus = ' '.join(all_descriptions)
sentences_tokenized = sent_tokenize(full_corpus)

# Shared helpers read by normalize_text: English stop words, a translation
# table that deletes every ASCII punctuation character, and the stemmer.
stop_words = set(stopwords.words('english'))
punctuation_table = str.maketrans('', '', string.punctuation)
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    """Turn ``raw_text`` into a list of normalized tokens.

    Each token produced by ``word_tokenize`` is lowercased, stemmed with the
    module-level stemmer ``sno`` and stripped of punctuation through
    ``punctuation_table``. A token is kept only when more than one character
    remains and the result is not in ``stop_words``.

    Parameters
    ----------
    raw_text : str
        Text to normalize.

    Returns
    -------
    list of str
        The normalized tokens. An empty list is returned when the text cannot
        be tokenized (for example when ``raw_text`` is not a string), so the
        return type is always a list.
    """
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still stop a long-running cell.
    try:
        word_list = word_tokenize(raw_text)
    except Exception:
        return []

    normalized_sentence = []
    for w in word_list:
        try:
            stemmed_word = sno.stem(str(w).lower())
            no_punctuation = stemmed_word.translate(punctuation_table)
            if len(no_punctuation) > 1 and no_punctuation not in stop_words:
                normalized_sentence.append(no_punctuation)
        except Exception:
            # Best effort: a token that cannot be processed is skipped.
            continue
    return normalized_sentence

In [12]:
# Sanity check: show the first ten tokenized sentences.
preview_sentences = sentences_tokenized[:10]
print(preview_sentences)

['Nicely dry and crisp in clean acidity, this Sauvignon Blanc has flavors of limes, gooseberries, and vanilla.', "But there's a lot of that notorious feline spray aroma and taste that detracts.", "This Nero d'Avola has aromas of earth, black currants, game and a note of grilled bell peppers.", 'The palate offers fleeting black cherry and black berry, accented by black pepper and a green note of dried sage.', 'It finishes on a bitter note.', 'This elegant Vermentino from Tuscany has a lovely floral fragrance of white flowers and stone fruit accompanied by succulent peach and ripe pear flavors.', 'The fruit richness is balanced by crisp freshness.', 'Not many Merlots deserve time in the cellar, but this one does.', "Such are its mountain tannins that it's in lockdown mode, with a tough, astringent finish.", 'But just below the surface are voluptuously ripe flavors of blackberries, cherries and mulberries.']


In [16]:
# Normalize every tokenized sentence into a list of stemmed tokens.
normalized_sentences = [normalize_text(sentence) for sentence in sentences_tokenized]

# Two Phrases passes: the second model is trained on the sentences as
# re-written by the first.
first_pass_phrases = Phrases(normalized_sentences)
phrases = Phrases(first_pass_phrases[normalized_sentences])

# Wrap the final model in a Phraser, which is what later cells index with.
ngrams = Phraser(phrases)

KeyboardInterrupt: 

In [15]:
# Apply the phrase model to each normalized sentence.
phrased_sentences = [ngrams[tokens] for tokens in normalized_sentences]

# Flatten the per-sentence token lists into one list of words.
full_list_words = []
for tokens in phrased_sentences:
    full_list_words.extend(tokens)

KeyboardInterrupt: 

In [17]:
# Load the descriptor lookup table and key it by the 'raw descriptor' column.
descriptor_mapping_csv = os.path.join("Data", "descriptor_mapping.csv")
descriptor_mapping = pd.read_csv(descriptor_mapping_csv).set_index('raw descriptor')

def return_mapped_descriptor(word):
    """Map ``word`` to its ``level_3`` descriptor, or return it unchanged.

    ``word`` is looked up in the index of the module-level
    ``descriptor_mapping`` frame. Membership is tested against the index
    itself instead of a list rebuilt on every call, which made each lookup a
    full scan of the mapping table.

    Parameters
    ----------
    word : str
        Token to look up.

    Returns
    -------
    The ``level_3`` value for ``word`` when it is a known raw descriptor,
    otherwise ``word`` itself.
    """
    if word in descriptor_mapping.index:
        return descriptor_mapping['level_3'][word]
    return word

# Rebuild each phrased sentence as one space-separated string of mapped
# descriptors, closed with a standalone '.' token.
normalized_sentences = []
for sent in phrased_sentences:
    mapped_tokens = [str(return_mapped_descriptor(token)) for token in sent]
    mapped_tokens.append('.')
    normalized_sentences.append(' '.join(mapped_tokens))

KeyboardInterrupt: 

## Train model

In [69]:
# Store training data: one normalized sentence per line.
# The encoding is set explicitly so accented characters from the reviews are
# written as UTF-8 on every platform instead of the locale default.
with open('wine_corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines("{}\n".format(item) for item in normalized_sentences)


The corpus was uploaded to AWS SageMaker and used to train the BlazingText algorithm. Below we load the trained model.

In [6]:
# Unpack the trained model archive (dw_model.tar.gz) into the working directory.
!tar -xvzf dw_model.tar.gz

x vectors.txt
x eval.json
x vectors.bin


In [19]:
# Load the trained word vectors from vectors.txt.
# The first line is a header whose second field is the vector dimension; each
# following line is "<word> <v1> ... <v_dim>".
index_to_word = []
vector_rows = []
# A single `with` block replaces the earlier un-closed open() used only to
# count lines; UTF-8 is explicit so the result does not depend on the locale.
with open("vectors.txt", "r", encoding="utf-8") as f:
    dim = int(f.readline().split()[1])
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on fields[0].
            continue
        index_to_word.append(fields[0])
        vector_rows.append([float(value) for value in fields[1:]])

# One row per parsed word. Counting raw lines also counted the header and the
# trailing newline, which left surplus all-zero rows at the end of word_vecs.
num_points = len(index_to_word)
# reshape fails loudly if any line does not carry exactly `dim` values.
word_vecs = np.array(vector_rows, dtype=float).reshape(num_points, dim)
word_vecs = normalize(word_vecs, copy=False, return_norm=False)

names_vecs = list(zip(index_to_word, word_vecs))

# Build the set of level_3 descriptors once; rebuilding a list for every
# candidate word made this filter quadratic.
level_3_descriptors = set(descriptor_mapping['level_3'])
names_vecs_filtered = [n for n in names_vecs if n[0] in level_3_descriptors]

names_vecs_df = pd.DataFrame(names_vecs_filtered, columns=['word', 'vector'])
names_vecs_df.sort_values(by=['word'], inplace=True)
names_vecs_df.to_csv('word_vectors.csv')

In [18]:
# Raw training descriptions as a plain Python list.
wine_reviews = df_des['description'].tolist()

def return_descriptor_from_mapping(word):
    """Return the ``level_3`` descriptor for ``word``, or ``None`` if unmapped.

    ``word`` is looked up in the index of the module-level
    ``descriptor_mapping`` frame. Membership is tested against the index
    itself instead of a list rebuilt on every call.

    Parameters
    ----------
    word : str
        Token to look up.

    Returns
    -------
    The ``level_3`` value for ``word`` when it is a known raw descriptor,
    otherwise ``None`` (callers filter the ``None`` values out).
    """
    if word in descriptor_mapping.index:
        return descriptor_mapping['level_3'][word]
    return None

# Reduce every review to a space-separated string of its mapped descriptors;
# tokens without a mapping (None) are left out.
descriptorized_reviews = []
for review in wine_reviews:
    phrased_review = ngrams[normalize_text(review)]
    mapped = (return_descriptor_from_mapping(token) for token in phrased_review)
    descriptorized_reviews.append(' '.join(str(d) for d in mapped if d is not None))



NameError: name 'names_vecs_df' is not defined

In [20]:
# Fit TF-IDF on the descriptor-only reviews. fit() returns the vectorizer
# itself, so X is the fitted vectorizer and only its vocabulary and IDF
# weights are read below.
vectorizer = TfidfVectorizer()
X = vectorizer.fit(descriptorized_reviews)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this install provides.
if hasattr(X, 'get_feature_names_out'):
    feature_names = X.get_feature_names_out()
else:
    feature_names = X.get_feature_names()
dict_of_idf_weightings = pd.DataFrame(list(zip(feature_names, X.idf_)), columns=['word', 'idf'])

# Keep only words that have both a trained vector and an IDF weight, and scale
# each vector by its IDF.
vectors_and_idf = pd.merge(left=names_vecs_df, right=dict_of_idf_weightings, left_on='word', right_on='word', how='inner')
vectors_and_idf['word_vec_idf'] = vectors_and_idf['vector']*vectors_and_idf['idf']
# Select and index in one expression so nothing is mutated in place on a
# column slice.
vectors_and_idf = vectors_and_idf[['word', 'word_vec_idf']].set_index('word')
vectors_and_idf.to_csv('word_vectors_idf.csv')

# Represent each review by the mean of the IDF-weighted vectors of its
# descriptors.
# Build the descriptor -> weighted-vector lookup once. Rebuilding
# list(vectors_and_idf.index) for every term of every review made this loop
# scale with reviews x terms x vocabulary size.
weighted_vector_by_word = vectors_and_idf['word_vec_idf'].to_dict()

wine_review_vectors = []
for d in descriptorized_reviews:
    terms = d.split(' ')
    weighted_review_terms = [
        weighted_vector_by_word[term]
        for term in terms
        if term in weighted_vector_by_word
    ]
    descriptor_count = len(weighted_review_terms)

    if weighted_review_terms:
        review_vector = sum(weighted_review_terms)/len(weighted_review_terms)
    else:
        # No known descriptor in this review: keep the empty-list placeholder
        # instead of hiding a ZeroDivisionError behind a bare except.
        review_vector = []

    vector_and_count = [terms, review_vector, descriptor_count]
    wine_review_vectors.append(vector_and_count)



NameError: name 'wine_dataset_relevant' is not defined

In [22]:
# Put the per-review descriptors, vector and descriptor count next to the
# original name/description columns, drop rows with any missing value, and
# save the result.
review_vector_columns = ['descriptors', 'review_vector', 'descriptor_count']
wine_review_vectors_df = pd.DataFrame(wine_review_vectors, columns=review_vector_columns)
full_wine_df = pd.concat([df_des, wine_review_vectors_df], axis=1).dropna(how='any')
full_wine_df.to_csv('wine_review_vectors.csv')
full_wine_df.head()

Unnamed: 0,name,description,descriptors,review_vector,descriptor_count
0,Bonterra Sauvignon Blanc,"Nicely dry and crisp in clean acidity, this Sa...","[dry, crisp, clean, lime, gooseberry, vanilla]","[0.35553248552071864, 0.12664773379889957, 0.1...",6
1,FiàNobile Nero d'Avola,"This Nero d'Avola has aromas of earth, black c...","[earth, black_currant, game, bell_pepper, cher...","[0.08857546076556227, 0.2916090560683173, 0.36...",11
2,Sada Vermentino,This elegant Vermentino from Tuscany has a lov...,"[elegant, white_flower, stone, fruit, succulen...","[0.1750577682634161, 0.06123672302175332, 0.43...",12
3,Keenan Merlot,"Not many Merlots deserve time in the cellar, b...","[voluptuous, ripe, blackberry, cherry, mulberry]","[0.13224498524604655, 0.08915907766171047, 0.3...",5
4,Paumanok Cabernet Sauvignon,"Crisp, elegant black-plum and cassis flavors a...","[crisp, elegant, plum, cassis, fresh, herb, dr...","[0.31281359266367176, -0.12118656106173605, 0....",13


In [58]:
# Expand each review vector into one row of numeric columns and drop rows
# that contain NaN.
review_vectors = full_wine_df['review_vector'].values.tolist()
X_in = pd.DataFrame(review_vectors).dropna()

# Brute-force cosine nearest-neighbour index; fit() returns the estimator.
knn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')
model_knn = knn.fit(X_in)

In [1]:
# Print the wines whose review vectors are closest to those of a named wine.
# NOTE: this rebinds name_test, which the train/test split cell also assigns
# (there it is a Series of held-out names).
name_test = "Ponzi Pinot Noir"

main_wine = full_wine_df.loc[full_wine_df['name'] == name_test]
if main_wine.empty:
    # Fail with a clear message instead of an obscure error from kneighbors.
    raise ValueError("No wine named {!r} found in full_wine_df".format(name_test))

# Several reviews can share a name; query with the first match only, since
# only the first result row is read below.
wine_test_vector = [main_wine['review_vector'].iloc[0]]
distance, indice = model_knn.kneighbors(wine_test_vector, n_neighbors=9)
# Skip the closest hit, which is expected to be the query review itself.
distance_list = distance[0].tolist()[1:]
indice_list = indice[0].tolist()[1:]

print('Similar to:', name_test)
print('The input wine is:', list(main_wine['descriptors'])[0])
print('_________')

for n, (d, i) in enumerate(zip(distance_list, indice_list), start=1):
    # kneighbors returns row positions within X_in. X_in.index holds, for each
    # of those rows, the row position in full_wine_df it was built from, so
    # translate before looking up. Using i directly as a full_wine_df label
    # picks the wrong wine (or raises KeyError) once rows have been dropped.
    suggestion = full_wine_df.iloc[X_in.index[i]]
    wine_name = suggestion['name']
    wine_descriptors = suggestion['description']
    print('Suggestion', str(n), ':', wine_name, 'distance of', "{:.3f}".format(d))
    print('and descriptors:', wine_descriptors)
    print('')


NameError: name 'full_wine_df' is not defined

array([ 0.35553249,  0.12664773,  0.10936317, -0.1851383 , -0.08707049,
       -0.12340965, -0.42339151,  0.14701959, -0.04237249, -0.17965019,
       -0.16699341,  0.38249065, -0.09582436, -0.0750661 , -0.2373822 ,
        0.01867774,  0.17329456, -0.10708956, -0.51685525,  0.05198096,
        0.28612537,  0.1801171 , -0.21623886,  0.16318324, -0.33091892,
       -0.6527566 ,  0.15825395,  0.08690007, -0.36082418,  0.4051321 ,
        0.37151455,  0.33333824, -0.15032933, -0.2182742 , -0.56823643,
        0.08133293,  0.42180671, -0.49673054, -0.25735572,  0.10638138,
        0.31177079,  0.51083162, -0.16639676,  0.10728115,  0.16802243,
       -0.36764116,  0.39267951,  0.09907775,  0.07439442, -0.05432674,
       -0.47212013, -0.06742511, -0.07993969,  0.14794919, -0.15414799,
       -0.06095944, -0.06507937,  0.44334476,  0.3237873 ,  0.15351034,
       -0.00682876, -0.27190679,  0.53236101,  0.09943219,  0.16252907,
       -0.02505279, -0.25117823, -0.06461226, -0.10616461, -0.37