# **ME 193, Spring 2021**
## Final Project Notebook 


**Notes**:

1. 

In [15]:
import os
import numpy as np
import pandas as pd
from IPython.display import JSON
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.tree as tree
import sklearn.metrics as mt
from sklearn.metrics import classification_report
import sklearn.ensemble as ens
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors
import sklearn.model_selection as ms
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# string workings
import string
from operator import itemgetter
from collections import Counter, OrderedDict

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
'''
# Amazon ML
import boto3
import sagemaker
from sagemaker import get_execution_role
'''

[nltk_data] Downloading package punkt to /Users/dylan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /Users/dylan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


'\n# Amazon ML\nimport boto3\nimport sagemaker\nfrom sagemaker import get_execution_role\n'

## Load the Kaggle wine-review dataset

In [11]:
# Read the wine-review CSV, report its row count, and build a display name
# for every wine from its winery and variety.
wine_csv_path = os.path.join("Data", "winemag-data_first150k.csv")
df_reds = pd.read_csv(wine_csv_path)
print(len(df_reds))
# 'Unnamed: 0' is presumably a saved row index; it is not needed downstream.
df_reds = df_reds.drop(columns=['Unnamed: 0'])
df_reds['name'] = df_reds['winery'] + ' ' + df_reds['variety']
df_reds.head()

150930


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,name
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,Heitz Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,Bodega Carmen Rodríguez Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,Macauley Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,Ponzi Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude,Domaine de la Bégude Provence red blend


In [9]:
# Sample description: the full review text of the row labelled 0.
df_reds.loc[0, 'description']

'This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.'

In [12]:
# Keep only the wine name and its review text.
name_and_description = ['name', 'description']
df_des = df_reds[name_and_description]
df_des.head()

Unnamed: 0,name,description
0,Heitz Cabernet Sauvignon,This tremendous 100% varietal wine hails from ...
1,Bodega Carmen Rodríguez Tinta de Toro,"Ripe aromas of fig, blackberry and cassis are ..."
2,Macauley Sauvignon Blanc,Mac Watson honors the memory of a wine once ma...
3,Ponzi Pinot Noir,"This spent 20 months in 30% new French oak, an..."
4,Domaine de la Bégude Provence red blend,"This is the top wine from La Bégude, named aft..."


## Preprocessing text descriptions

In [18]:
# Collect every review as plain text. Missing descriptions are dropped so the
# join below cannot fail on a NaN float.
all_descriptions = df_des["description"].dropna().astype(str).tolist()

# Concatenate all reviews and split the result into individual sentences.
full_corpus = ' '.join(all_descriptions)
sentences_tokenized = sent_tokenize(full_corpus)

# Module-level resources read by normalize_text.
stop_words = set(stopwords.words('english'))
# Translation table that deletes every character in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    """Tokenize one piece of text and return its cleaned, stemmed tokens.

    Each token is lower-cased, stemmed with the module-level stemmer ``sno``
    and stripped of punctuation via ``punctuation_table``. Tokens that end up
    with fewer than two characters, or that appear in ``stop_words``, are
    discarded.

    Parameters
    ----------
    raw_text : str
        Raw text to normalize (one sentence in this notebook).

    Returns
    -------
    list of str
        Normalized tokens in their original order. An empty list is returned
        when tokenization raises, so callers always receive a list.
    """
    try:
        word_list = word_tokenize(raw_text)
    except Exception:
        # Best effort: text that cannot be tokenized yields no tokens. Only
        # Exception is caught so Ctrl-C still interrupts a long run.
        return []

    normalized_sentence = []
    for w in word_list:
        try:
            stemmed_word = sno.stem(str(w).lower())
            no_punctuation = stemmed_word.translate(punctuation_table)
            if len(no_punctuation) > 1 and no_punctuation not in stop_words:
                normalized_sentence.append(no_punctuation)
        except Exception:
            # Skip a token that cannot be processed instead of losing the
            # whole sentence.
            continue
    return normalized_sentence

In [19]:
# Preview the first ten tokenized sentences.
print(sentences_tokenized[:10])

['This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak.', 'Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background.', 'Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance.', 'Enjoy 2022–2030.', 'Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla.', 'This is full, layered, intense and cushioned on the palate, with rich flavors of chocolaty black fruits and baking spices.', 'A toasty, everlasting finish is heady but ideally balanced.', 'Drink through 2023.', 'Mac Watson honors the memory of a wine once made by his mother in this tremendously delicious, balanced and complex botrytised white.', 'Dark gold in color, it layers toasted hazelnut, pear compote and orange peel flavors, reveling in the succulence of its 122 g/L of residual sugar.']


In [None]:
# Turn every tokenized sentence into a list of normalized tokens.
normalized_sentences = [normalize_text(s) for s in sentences_tokenized]

# Two stacked Phrases passes: the second model is fit on the sentences as
# transformed by the first.
first_pass = Phrases(normalized_sentences)
phrases = Phrases(first_pass[normalized_sentences])

ngrams = Phraser(phrases)

# Apply the phrase model to each normalized sentence.
phrased_sentences = [ngrams[tokens] for tokens in normalized_sentences]

# Flatten all phrased sentences into one list of tokens.
full_list_words = []
for phrased_sentence in phrased_sentences:
    full_list_words.extend(phrased_sentence)

In [None]:
# Load the descriptor lookup table, indexed by the raw descriptor.
# The previous S3 path depended on an undefined `bucket`, and the SageMaker
# session it created used a module whose import is disabled in the imports
# cell, so this cell raised NameError on a fresh kernel. The unused session is
# removed and the file is read from the local Data directory instead.
# NOTE(review): assumes descriptor_mapping.csv has been placed in Data/ - confirm.
descriptor_mapping = pd.read_csv(
    os.path.join("Data", "descriptor_mapping.csv")
).set_index('raw descriptor')

def return_mapped_descriptor(word):
    """Return the ``level_3`` descriptor mapped to ``word``, if there is one.

    Parameters
    ----------
    word : str
        Token to look up in the index of the module-level
        ``descriptor_mapping`` table.

    Returns
    -------
    object
        The ``level_3`` value stored for ``word`` when it is present in the
        index; otherwise ``word`` itself, unchanged.
    """
    # Test membership on the index directly: building a list from the index
    # on every call made each lookup a linear scan.
    if word in descriptor_mapping.index:
        return descriptor_mapping.loc[word, 'level_3']
    return word

# Rebuild each phrased sentence as a single string: every token is passed
# through return_mapped_descriptor and cast to str, a '.' token is appended,
# and the tokens are joined with spaces.
normalized_sentences = []
for sent in phrased_sentences:
    mapped_tokens = [str(return_mapped_descriptor(word)) for word in sent]
    normalized_sentences.append(' '.join(mapped_tokens + ['.']))