# Wineteller : Reminder


- Build a wine recommender based on occasion rather than intrinsic wine characteristics 

- Deduce the occasion from wine descriptors that can evoke an atmosphere -> Body, Complexity, Finish (the filtering code below also keeps Alcohol and Sweetness)

- Use Computational Wine Wheel (mappings) to extract only words that are specific to Body, Complexity and Finish

- Train a Word2Vec model on the filtered descriptors -> cluster the resulting review vectors with K-means

In [1]:
# File paths
# NOTE(review): both paths are absolute and point into one user's home directory;
# they have to be edited before the notebook can run anywhere else.
# file_location is the reviews CSV loaded into `data`; map_location is the
# raw-descriptor -> level_3/level_2/level_1 mapping CSV.
file_location = '/Users/hyunoochang/code/chyunoo/wineteller/raw_data/winemag-data_first150k.csv'
map_location = '/Users/hyunoochang/code/chyunoo/wineteller/raw_data/descriptor_mapping.csv'

# Data Exploration

In [2]:
#python
import pandas as pd
import string
import numpy as np

#compute cell-executing time
from tqdm.notebook import trange, tqdm

#text preprocessing
from operator import itemgetter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from collections import Counter, OrderedDict

#nlp modeling
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

#clusterization 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from nltk.cluster import KMeansClusterer
import nltk
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

## Load Data

In [3]:
data = pd.read_csv(file_location, index_col='Unnamed: 0')
print(data.shape)

(150930, 10)


In [4]:
data.head(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [5]:
#Sample for fast test
#data=data[:1000]

## Clean Data

In [6]:
data.isna().sum()

country            5
description        0
designation    45735
points             0
price          13695
province           5
region_1       25060
region_2       89977
variety            0
winery             0
dtype: int64

In [7]:
data.drop(columns = ["region_1", "region_2", "points", "price", "designation", "winery"], inplace=True)

In [8]:
data = data.drop_duplicates()
data.shape

(97833, 4)

# Data Preprocessing

## Tokenize into sentences

In [9]:
%%time
#tokenization : reviews -> sentences

# Coerce every description to str so sent_tokenize only ever receives strings.
reviews_list = [str(r) for r in data['description']]

# Split each review into sentences and flatten the result into a single list,
# preserving review order and sentence order within each review.
sentences_tokenized = [
    sentence
    for review in tqdm(reviews_list)
    for sentence in sent_tokenize(review)
]

  0%|          | 0/97833 [00:00<?, ?it/s]

CPU times: user 14.8 s, sys: 831 ms, total: 15.7 s
Wall time: 19.5 s


In [10]:
len(sentences_tokenized)

270453

In [11]:
sentences_tokenized[:1]

['This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak.']

## Remove stopwords and punctuation

In [12]:
%%time
#normalization : remove stopwords and punctuation + tokenize sentences into words

# English stop words held in a set for fast membership tests in normalize_text.
stop_words = set(stopwords.words('english'))

# Translation table that deletes every character of string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)

# Snowball stemmer shared by every normalize_text call.
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    """Tokenize a text and return its normalized word tokens.

    Every token is lower-cased, stemmed with the module-level stemmer ``sno``
    and stripped of punctuation through ``punctuation_table``. A token is kept
    only if the result is longer than one character and is not in
    ``stop_words``. The stop-word test runs on the stemmed form, so a stop
    word whose stem differs from its spelling is not filtered out.

    Parameters
    ----------
    raw_text : str
        Sentence or review to normalize.

    Returns
    -------
    list of str
        The normalized tokens in their original order. An empty list is
        returned when ``raw_text`` cannot be tokenized because it is not a
        string (previously this case returned ``''``).

    Notes
    -----
    Only the "input is not text" failure is swallowed. Other errors, such as
    a missing NLTK tokenizer resource, now propagate instead of silently
    producing empty output for the whole corpus.
    """
    try:
        word_list = word_tokenize(raw_text)
    except (TypeError, AttributeError):
        # Non-string input (e.g. a NaN description): nothing to tokenize.
        return []

    normalized_sentence = []
    for w in word_list:
        # lower-case -> stem -> strip punctuation, same order as before
        no_punctuation = sno.stem(str(w).lower()).translate(punctuation_table)
        if len(no_punctuation) > 1 and no_punctuation not in stop_words:
            normalized_sentence.append(no_punctuation)
    return normalized_sentence

# Small sample of sentences; not referenced again in the visible notebook.
sentence_sample = sentences_tokenized[:10]

# One list of normalized tokens per sentence, in corpus order.
normalized_sentences = [normalize_text(s) for s in tqdm(sentences_tokenized)]

  0%|          | 0/270453 [00:00<?, ?it/s]

CPU times: user 3min 30s, sys: 9.44 s, total: 3min 39s
Wall time: 4min 38s


In [13]:
normalized_sentences[:1]

[['tremend',
  '100',
  'variet',
  'wine',
  'hail',
  'oakvill',
  'age',
  'three',
  'year',
  'oak']]

## N-grams

In [14]:
%%time 
#retrieve bi-grams and tri-grams from normalized sentences

# Two Phrases passes: the first is fitted on the raw token lists, the second on
# the corpus as transformed by the first. Only the second model is kept.
first_pass = Phrases(normalized_sentences)
phrases = Phrases(first_pass[normalized_sentences])

# Frozen version of the second-pass model, used for all later transformations.
ngrams = Phraser(phrases)

# NOTE(review): ngrams is applied directly to the un-phrased sentences, not to
# the output of the first pass - confirm this is intended.
phrased_sentences = [ngrams[sent] for sent in tqdm(normalized_sentences)]

# Every token of the phrased corpus in one flat list (used for word counts).
full_list_words = [token for phrased in phrased_sentences for token in phrased]

  0%|          | 0/270453 [00:00<?, ?it/s]

CPU times: user 25.3 s, sys: 626 ms, total: 25.9 s
Wall time: 27.6 s


In [15]:
phrased_sentences[:1]

[['tremend',
  '100_variet',
  'wine',
  'hail',
  'oakvill',
  'age',
  'three_year',
  'oak']]

In [16]:
full_list_words[:5]

['tremend', '100_variet', 'wine', 'hail', 'oakvill']

In [17]:
#most common 5000 words in corpus

# Frequency of every token in the phrased corpus.
word_counts = Counter(full_list_words)

# The 5000 most frequent tokens, ordered from most to least common.
sorted_counts = OrderedDict(word_counts.most_common(5000))

# One row per token (token as index), with its count in column 0.
counter_df = pd.DataFrame.from_dict(sorted_counts, orient='index')

# Persist the table, then keep a handle on its (at most) 5000 rows.
counter_df.to_csv('top_5000_descriptors.csv')
top_5000_words = counter_df.head(5000)

In [18]:
top_5000_words.head()

Unnamed: 0,0
wine,60956
flavor,55898
fruit,40089
finish,29733
acid,25915


## Filter wine descriptors

### Load descriptor mapping

In [19]:
#load wine descriptor_mapping
descriptor_mapping = pd.read_csv(map_location).set_index('raw descriptor')
descriptor_mapping.head()

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abras,abrasive,high_tannin,tannin
acacia,acacia,flowery,flower
acacia_flower,acacia,flowery,flower
aciddriven,acid_driven,high_acid,acid
aggress,aggressive,high_acid,acid


In [20]:
descriptor_mapping.level_1.value_counts()

fruit              235
vegetal             90
woody               77
tannin              72
earth_inorganic     68
acid                63
body                57
spice               46
caramel             44
flower              38
visual              30
earth_organic       27
sweetness           22
sulfides            21
nutty               19
microbial           17
concentration       17
complexity          16
alcohol             16
brettanomyces       15
style               11
salinity             8
finish               6
Name: level_1, dtype: int64

In [21]:
# Keep only the descriptors whose level_1 category is listed in descriptor_list
# (body, complexity, finish, alcohol, sweetness). Every other category in the
# mapping is dropped - aroma categories as well as non-aroma ones such as
# tannin, acid or visual.

descriptor_list = ['body', 'complexity', 'finish', 'alcohol', 'sweetness']
filtered_descriptor_mapping = descriptor_mapping[descriptor_mapping['level_1'].isin(descriptor_list)]

In [22]:
filtered_descriptor_mapping.head()

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
airi,airy,light_bodied,body
bake,baked,high_alcohol,alcohol
big_bold,bold,full_bodied,body
bone_dri,bone_dry,dry,sweetness
bonedri,bone_dry,dry,sweetness


In [23]:
#number of raw descriptors per features 
filtered_descriptor_mapping["level_1"].value_counts()

body          57
sweetness     22
alcohol       16
complexity    16
finish         6
Name: level_1, dtype: int64

In [24]:
# Share of raw descriptors that survive the level_1 filter.
n_filtered = filtered_descriptor_mapping["level_1"].value_counts().sum()
n_total = descriptor_mapping.level_1.value_counts().sum()
print('filtered descriptors :', round((n_filtered/n_total)*100), '% ,', n_filtered, 'out of', n_total)

filtered descriptors : 12 % , 117 out of 1015


In [25]:
#list of all raw descriptors
print(list(filtered_descriptor_mapping.index),)

['airi', 'bake', 'big_bold', 'bone_dri', 'bonedri', 'bullish', 'blocki', 'chunki', 'solid_chunki', 'clampi', 'close', 'cloy', 'complex', 'complic', 'dainti', 'deep', 'depth', 'dri', 'drier', 'dryness', 'eas', 'easi_sip', 'eleg', 'expans', 'extract', 'feminin', 'finess', 'ampl_weight', 'bold', 'full_bodi', 'fullbodi', 'soupi', 'weighti', 'headi', 'hearti', 'heavi', 'heavier', 'heavyweight', 'bigger', 'high_alcohol', 'highoctan', 'alcohol_heat', 'heat', 'heat_evid', 'hot', 'lavish', 'rather_lean', 'length', 'lengthi', 'long_mouthwat', 'longlast', 'lightest', 'light', 'light_bodi', 'lightbodi', 'lighter', 'lighter_style', 'lightfoot', 'lightweight', 'thin', 'linear', 'light_feet', 'low_alcohol', 'lowalcohol', 'straightforward', 'lush', 'luxuri', 'mass', 'medium_bodi', 'medium_fullbodi', 'medium_weight', 'mediumbodi', 'mediumweight', 'medium_length', 'mediumlength_finish', 'mediumsweet', 'semisweet', 'modest', 'offdri', 'offdri_style', 'onedimension', 'opul', 'plump', 'pungent', 'quaffer',

In [26]:
#list of all level_2 descriptors
filtered_descriptor_mapping["level_2"].value_counts()

full_bodied             36
light_bodied            16
high_alcohol            13
sweet                   10
high_complexity          8
low_complexity           8
dry                      7
very_sweet               5
medium_bodied            5
long_finish              4
low_alcohol              3
medium_length_finish     2
Name: level_2, dtype: int64

#### Data exploration (descriptors)

In [27]:
#filtered_descriptor_mapping[filtered_descriptor_mapping["level_1"] == "visual"].head()

In [28]:
#filtered_descriptor_mapping[filtered_descriptor_mapping["level_1"] == "style"].head()

In [29]:
#filtered_descriptor_mapping[filtered_descriptor_mapping["level_1"] == "alcohol"] .head()

In [30]:
#filtered_descriptor_mapping[filtered_descriptor_mapping["level_1"] == "tannin"].groupby("level_2").count()

In [31]:
#filtered_descriptor_mapping[filtered_descriptor_mapping["level_2"] == "low_tannin"].value_counts()

### Apply mapping to sentences

In [None]:
%%time
#apply mapping on each word of each sentence

#### MUST UPDATE CODE ####
#-> due to small size of vocabulary : return level_3 for body, complexity, finish, sweetness and alcohol

def return_mapped_descriptor(word):
    """Return the level_3 descriptor mapped to ``word``, or ``""`` if none.

    Parameters
    ----------
    word : str
        A token (possibly an n-gram joined with ``_``) from a phrased sentence.

    Returns
    -------
    The ``level_3`` value stored in ``filtered_descriptor_mapping`` for
    ``word`` when ``word`` is one of its index labels, otherwise the empty
    string.
    """
    # Test membership on the index itself. The previous
    # `word in list(filtered_descriptor_mapping.index)` rebuilt and scanned a
    # full list for every word of every sentence.
    if word in filtered_descriptor_mapping.index:
        return filtered_descriptor_mapping['level_3'][word]
    return ""

# Replace every phrased sentence by the list of its mapped descriptors;
# words without a mapping (empty string) are left out. Sentences without any
# descriptor stay in the list as empty lists.
# NOTE: this rebinds normalized_sentences, which earlier held the stemmed tokens.
normalized_sentences = [
    [str(mapped) for mapped in map(return_mapped_descriptor, sent) if mapped != ""]
    for sent in tqdm(phrased_sentences)
]

  0%|          | 0/270453 [00:00<?, ?it/s]

In [None]:
normalized_sentences[:10]

In [None]:
len(normalized_sentences)

# Model

## Train model 

In [None]:
%%time
# Fit a Word2Vec model on the descriptor-only sentences.
# vector_size=300 -> 300-dimensional word vectors
# min_count=1     -> descriptors that occur only once are kept in the vocabulary
# epochs=15       -> 15 passes over the corpus
# NOTE(review): no seed is passed, so vectors presumably differ between runs - confirm.
model = Word2Vec(normalized_sentences, vector_size=300, min_count=1, epochs=15)
print(model)

# Persist the trained model in the current working directory.
model.save('model.bin')

In [None]:
#list of vocab learned by model
print(model.wv.index_to_key,)

## Explore Word Embeddings

In [None]:
#Find most similar from vector
#wv = preprocessed_data.loc[4,"review_vector"]
#wv = np.squeeze(wv, axis=0)
#print(wv.shape)
#model.wv.most_similar(positive=[wv,], topn=10)

In [None]:
#Find most similar word 
model.wv.most_similar(positive='light_bodied', topn=10)

In [None]:
#Retrieve vector from word 
#model.wv["light_bodied"]

In [None]:
#Compute distance between two words/vectors
#cosine_similarity([model.wv["light_bodied"]], [wv])

## Return mapped descriptors from reviews

In [None]:
%%time
wine_reviews = list(data['description'])

def return_descriptor_from_mapping(word):
    """Return the level_3 descriptor mapped to ``word``, or ``None`` if none.

    Parameters
    ----------
    word : str
        A token (possibly an n-gram joined with ``_``) from a phrased review.

    Returns
    -------
    The ``level_3`` value stored in ``filtered_descriptor_mapping`` for
    ``word`` when ``word`` is one of its index labels, otherwise ``None``.
    """
    # Test membership on the index itself rather than on a list copy that was
    # rebuilt and scanned for every word of every review.
    if word in filtered_descriptor_mapping.index:
        return filtered_descriptor_mapping['level_3'][word]
    return None

# For every review: normalize, apply the n-gram model, keep only the tokens
# that map to a descriptor, and store them as one space-separated string.
descriptorized_reviews = []
for review in tqdm(wine_reviews):
    phrased_review = ngrams[normalize_text(review)]
    mapped = map(return_descriptor_from_mapping, phrased_review)
    # Tokens without a mapping come back as None and are skipped.
    descriptorized_reviews.append(' '.join(str(d) for d in mapped if d is not None))

In [None]:
descriptorized_reviews[:5]

In [None]:
len(descriptorized_reviews)

## Calculate Review Vectors with TFIDF

In [None]:
%%time

#apply Tfidf weights and compute wine review vectors from word vectors

# Fit TF-IDF on the descriptor strings; only the learned idf weights are used.
vectorizer = TfidfVectorizer()
X = vectorizer.fit(descriptorized_reviews)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name on releases that predate it.
if hasattr(X, "get_feature_names_out"):
    feature_names = X.get_feature_names_out()
else:
    feature_names = X.get_feature_names()

# Maps each vocabulary term to its idf weight.
dict_of_tfidf_weightings = dict(zip(feature_names, X.idf_))

In [None]:
%%time
# Build one [terms, review_vector, descriptor_count] entry per review.
# review_vector is the mean of the idf-weighted word vectors of the review's
# descriptors; reviews without any weighted descriptor get an empty list.
wine_review_vectors = []
for d in tqdm(descriptorized_reviews):
    terms = d.split(' ')
    # reshape(1, -1) keeps the original (1, vector_size) row shape without
    # hard-coding the embedding size used when the model was trained.
    weighted_review_terms = [
        dict_of_tfidf_weightings[term] * model.wv.get_vector(term).reshape(1, -1)
        for term in terms
        if term in dict_of_tfidf_weightings
    ]
    descriptor_count = len(weighted_review_terms)
    if weighted_review_terms:
        review_vector = sum(weighted_review_terms) / descriptor_count
    else:
        # Explicit empty case (previously reached through a bare except
        # around the division by zero).
        review_vector = []
    wine_review_vectors.append([terms, review_vector, descriptor_count])

## Merge in final dataset 

In [None]:
# Attach the per-review results to the dataset as three new columns.
# Each entry of wine_review_vectors is [terms, review_vector, descriptor_count].
data['normalized_descriptors'] = [entry[0] for entry in wine_review_vectors]
data['review_vector'] = [entry[1] for entry in wine_review_vectors]
data['descriptor_count'] = [entry[2] for entry in wine_review_vectors]

# Turn the current index into a regular column and renumber the rows from 0.
data.reset_index(inplace=True)

In [None]:
data.head()

In [None]:
data.loc[3,"description"]

In [None]:
# Drop the column created by reset_index(), then keep only the rows that have
# at least one mapped descriptor.
# reset_index() names the new column after the old index. The index was loaded
# with index_col='Unnamed: 0', so the column is 'Unnamed: 0' rather than
# "index", and dropping "index" alone raises KeyError. Both names are accepted
# here, and errors="ignore" makes the cell safe to re-run.
data = data.drop(columns=["index", "Unnamed: 0"], errors="ignore")
preprocessed_data = data[data["descriptor_count"]>0]
print(preprocessed_data.shape)

## Export in csv

In [None]:
#from pathlib import Path  
#filepath = Path('/Users/hyunoochang/code/chyunoo/wineteller/notebooks/preprocessed_data.csv')  
#preprocessed_data.to_csv(filepath)

# Clustering

## Prepare dataset 

In [None]:
test = preprocessed_data.copy()

# Lists are not hashable; convert each descriptor list to a tuple so the
# column can be used as a drop_duplicates key.
test['normalized_descriptors'] = test['normalized_descriptors'].apply(tuple)

# Keep one row per distinct descriptor combination.
test = test.drop_duplicates(subset="normalized_descriptors")

# Stack the review vectors into a 2-D float array.
# np.float was removed in NumPy 1.24; the builtin float is the same dtype.
X_train = np.array(list(test["review_vector"]), dtype=float)
# Each review vector is stored as a single row, so remove that extra axis.
X_train = np.squeeze(X_train, axis = 1)
X_train.shape

## TSNE 

In [None]:
tsne = TSNE(n_components=2, random_state=0)
Y = tsne.fit_transform(X_train)

In [None]:
plt.scatter(Y[:, 0], Y[:, 1])

In [None]:
#Code for plotting words (unreadable)
#plt.scatter(Y[:, 0], Y[:, 1])

#for label, x, y in zip(wines, Y[:, 0], Y[:, 1]):
    #plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords="offset points")
#plt.show()

## PCA 

### With Two dimensions

In [None]:
pca = PCA(n_components=2)
pca_results = pca.fit_transform(X_train)

In [None]:
pca_results.shape

In [None]:
pca_df = pd.DataFrame(pca_results, index=list(test.normalized_descriptors), columns=["pc1", "pc2"])
pca_df.head()

In [None]:
pca_df.plot(x='pc1',y='pc2',kind="scatter",figsize=(15, 10))

In [None]:
def _descriptor_label(entry):
    """Return a plain 'a, b, c' label for one entry of pca_df.index.

    The previous str(entry).strip("(),' '") only trimmed the two ends of the
    tuple's repr, so labels with several descriptors kept the inner quotes and
    commas (e.g. "bold', 'rich").
    """
    if isinstance(entry, str):
        # Already a plain label (e.g. the cell was run a second time).
        return entry
    # Keep only real descriptor strings.
    # NOTE(review): presumably pandas pads tuples of unequal length with NaN
    # when building the index - confirm.
    return ", ".join(part for part in entry if isinstance(part, str))


wine_words = [_descriptor_label(word) for word in pca_df.index]
print(len(wine_words,))

In [None]:
pca_df.index = wine_words
pca_df.head()

In [None]:
# Invisible scatter (alpha=0) that only sets up the axes for the annotations.
ax = pca_df.plot(x='pc1', y='pc2', kind="scatter", figsize=(15, 10), alpha=0)

# Labels containing one of the highlighted substrings, in index order.
highlighted = [word for word in pca_df.index for txt in ['full_bodied'] if txt in word]

# Write each selected label at its (pc1, pc2) position.
for word in highlighted:
    ax.annotate(word, (pca_df.pc1.loc[word], pca_df.pc2.loc[word]))
plt.show()

In [None]:
for v in pca.explained_variance_ratio_:
    print('Explained variation per principal component: {}%'.format(round(v*100,2)))

In [None]:
plt.plot(pca.explained_variance_ratio_)
plt.xlabel('Principal Component'); plt.ylabel('% explained variance');

### With Max Dimensions

In [None]:
X_train.shape

In [None]:
len(list(test.normalized_descriptors))

In [None]:
# PCA with no component limit: keeps min(n_samples, n_features) components.
pca_max = PCA()
pca_max_results = pca_max.fit_transform(X_train)

# Name the columns after the number of components actually produced. The
# previous hard-coded range(1, 301) fails whenever that number is not 300.
pca_max_df = pd.DataFrame(
    pca_max_results,
    index=list(test.normalized_descriptors),
    columns=[f'PC{i}' for i in range(1, pca_max_results.shape[1] + 1)],
)

In [None]:
pca_max_results.shape

In [None]:
pca_max_df.head()

In [None]:
pca_max.explained_variance_ratio_[:11]

In [None]:
plt.plot(pca_max.explained_variance_ratio_[:11])
plt.xlabel('Principal Component'); plt.ylabel('% explained variance');

## K-Means 

In [None]:
X_proj = pd.DataFrame(pca_results, columns=["pc1", "pc2"])

In [None]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=5) 
km.fit(X_proj)

In [None]:
km.cluster_centers_.shape

In [None]:
print(km.labels_,)

In [None]:
# Scatter of the 2-D PCA projection, coloured by KMeans cluster label.
plt.title('KMeans clustering'); plt.xlabel('PC 1'); plt.ylabel('PC 2')
ax = plt.gca()
sc = ax.scatter(X_proj.iloc[:,0], X_proj.iloc[:,1], c=km.labels_)

# One legend entry per cluster id, drawn in the colour the scatter uses for
# that id. The previous zip(km.labels_, unique_labels) paired the labels of the
# first few samples with the cluster ids, so colours and labels did not match.
labels = np.unique(km.labels_)
handles = [
    plt.plot([], color=sc.get_cmap()(sc.norm(l)), ls="", marker="o")[0]
    for l in labels
]
plt.legend(handles, labels)
plt.show()

In [None]:
%%time
# Fit KMeans on the full review vectors for k = 1..19 and record each inertia.
ks = range(1,20)
inertias = [KMeans(n_clusters=k).fit(X_train).inertia_ for k in ks]

# Inertia as a function of the number of clusters.
plt.plot(ks, inertias)
plt.xlabel('k cluster number')

## Word clouds for each cluster

In [None]:
# Labels of all points assigned to cluster 1, rendered as one string.
cluster_mask = km.labels_ == 1
text = repr(list(pca_df.index[cluster_mask]))

# Word cloud on a white background showing at most 5 words.
wordcloud = WordCloud(background_color="white", max_words=5).generate(text)

# Show the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Labels of all points assigned to cluster 0, rendered as one string.
cluster_mask = km.labels_ == 0
text = repr(list(pca_df.index[cluster_mask]))

# Word cloud on a white background showing at most 5 words.
wordcloud = WordCloud(background_color="white", max_words=5).generate(text)

# Show the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Labels of all points assigned to cluster 2, rendered as one string.
cluster_mask = km.labels_ == 2
text = repr(list(pca_df.index[cluster_mask]))

# Word cloud on a white background showing at most 10 words.
wordcloud = WordCloud(background_color="white", max_words=10).generate(text)

# Show the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Labels of all points assigned to cluster 3, rendered as one string.
cluster_mask = km.labels_ == 3
text = repr(list(pca_df.index[cluster_mask]))

# Word cloud on a white background showing at most 10 words.
wordcloud = WordCloud(background_color="white", max_words=10).generate(text)

# Show the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Labels of all points assigned to cluster 4, rendered as one string.
cluster_mask = km.labels_ == 4
text = repr(list(pca_df.index[cluster_mask]))

# Word cloud on a white background showing at most 5 words.
wordcloud = WordCloud(background_color="white", max_words=5).generate(text)

# Show the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()