In [None]:
#Imports

import numpy as np
import pandas as pd
import string

import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('wordnet') 
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import gensim


In [None]:
#Visualization Imports

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 
%config InlineBackend.figure_formats = ['retina']

# Global seaborn settings (style, colour palette, context) that apply to the
# figures drawn later in this notebook.
sns.set_style("white")
sns.set_palette('husl')
sns.set_context('talk')

In [None]:
#### Load pickled dataframe

In [None]:
# Load the pickled DataFrame from the current working directory; the cells
# below read its 'description' column.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('./tokenized_wine_data_w_target.pkl')

In [None]:
# Pull the (already tokenized) descriptions out of the DataFrame as a plain list
corpus = df['description'].tolist()

In [None]:
# Peek at the first two descriptions
corpus[:2]

In [None]:
# Load NLTK's English stop-word list.
# Needs the NLTK 'stopwords' corpus to be available locally (see the
# commented-out nltk.download('stopwords') call in the imports cell).
stop_words = stopwords.words('english')

In [None]:
# Remove the generic English stop words from every tokenized description.
# The stop-word list is turned into a set once, so each membership test is a
# hash lookup instead of a linear scan of the list for every token.
stop_word_set = set(stop_words)

clean_corpus = [
    [word for word in desc if word not in stop_word_set]
    for desc in corpus
]

In [None]:
# Lemmatize every token. No part-of-speech tag is passed, so the lemmatizer's
# default is used for all words.
# One WordNetLemmatizer instance is created up front and reused; the previous
# version constructed a new instance for every single token.
lemmatizer = WordNetLemmatizer()
clean_corpus = [[lemmatizer.lemmatize(word) for word in desc] for desc in clean_corpus]

In [None]:
# Inspect the first eight cleaned, lemmatized descriptions
clean_corpus[:8]

In [None]:
# Extra stop words specific to this wine-review dataset.
# They are filtered out of the lemmatized corpus in the next cell.
wine_stopwords = [
    'alongside', 'aroma', 'palate', 'offer',
    'hint', 'include', 'offering', 'recall',
    'pretty', 'nose', 'note', 'lightly',
    'part', 'extended', 'series', 'show',
    'backed', 'touch', 'flavor', 'provides',
    'companion', 'behind', 'mouthfeel', 'could',
    'plus', 'open', 'background', 'tone',
    'stand', 'isnt', 'expressive', 'mouth',
    'wine', 'broad', 'generous', 'term',
    'would', 'make', 'tiny', 'blend',
]

In [None]:
# Remove the dataset-specific (wine) stop words from the lemmatized corpus.
# A set is built once so each membership test is a hash lookup rather than a
# scan of the whole wine_stopwords list for every token.
wine_stopword_set = set(wine_stopwords)

cleaner_corpus = [
    [word for word in desc if word not in wine_stopword_set]
    for desc in clean_corpus
]

In [None]:
# Inspect the first eight descriptions after both stop-word passes
cleaner_corpus[:8]

In [None]:
# The vectorizers (CV & TF-IDF) take one string per document, so glue each
# description's tokens back together with single spaces.
cleaner_corpus_joined = [' '.join(tokens) for tokens in cleaner_corpus]

In [None]:
## Goal: build a list of the most important vocabulary in wine descriptions (i.e., distill each description down to its most informative terms) --> Figures


In [None]:
### Then, build a model that predicts the varietal from the description (using `variety` as the target)

In [None]:
### ModelSomm: Finally, build a model that predicts varietal + province from the description, using the description as a proxy for taste

In [None]:
## Create Document-Term Matrix from Wine Descriptions

In [None]:
#### Count Vectorizer

In [None]:
# Bag-of-words document-term matrix over the fully cleaned, re-joined descriptions.
cv = CountVectorizer()  # optional: ngram_range=(1, 2)
X_cv = cv.fit_transform(cleaner_corpus_joined)

# Read the shape straight off the matrix; calling .toarray() first would
# build a full dense copy in memory just to report two numbers.
print(f"Dimensions of Document-term matrix: {X_cv.shape}")

In [None]:
# Checked out the vocab list
# cv.vocabulary_

In [None]:
#### TF-IDF

In [None]:
# TF-IDF document-term matrix over the same cleaned, re-joined descriptions
# that the CountVectorizer cell uses.
# Fix: the input was `clean_corpus_joined`, a name that is never defined in
# this notebook (NameError on a fresh kernel); the joined corpus built above
# is `cleaner_corpus_joined`.
tfidfvec = TfidfVectorizer()  # optional: stop_words='english', ngram_range=(1, 2)
X_tfidf = tfidfvec.fit_transform(cleaner_corpus_joined)

# Read the shape straight off the matrix; calling .toarray() first would
# build a full dense copy in memory just to report two numbers.
print(f"Dimensions of Document-term matrix: {X_tfidf.shape}")

In [None]:
## Gensim

In [None]:
# Convert each document in doc list into a list of lowercase tokens
#tokenized_docs = [gensim.utils.simple_preprocess(d) for d in clean_corpus]

In [None]:
# Create a Gensim Dictionary. This holds an id-to-word mapping for everything in our vocabulary.
# It is NOT the same as the dictionary object in the Python standard library:
# the integer id is the key, the word is the value.
# It is created empty here and filled by the doc2bow(..., allow_update=True) calls in the next cell.
mydict = gensim.corpora.Dictionary()

In [None]:
# Create a Gensim corpus object: one bag-of-words list of tuples per document.
# The first element of each tuple is the word id, the second is the number of counts.
# allow_update=True lets doc2bow add each document's words to mydict as it goes.
# NOTE(review): this reads clean_corpus (wine-specific stop words still present),
# not cleaner_corpus as the sklearn vectorizers do -- confirm that is intended.
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in clean_corpus]

In [None]:
# Create a tf-idf model from the bag-of-words corpus, then materialize the
# weighted corpus as a dense array with len(mydict) terms.
# NOTE(review): per the gensim docs, corpus2dense lays documents out as
# COLUMNS (shape = terms x documents) -- confirm before slicing rows as documents.
# NOTE(review): a dense array of this size may not fit in memory for a large corpus.
tfidf = gensim.models.TfidfModel(mycorpus)
tfidf_matrix = gensim.matutils.corpus2dense(tfidf[mycorpus], num_terms=len(mydict))

In [None]:
# First two rows of the dense tf-idf array
tfidf_matrix[:2]

In [None]:
# Dimensions of the dense gensim tf-idf array
tfidf_matrix.shape

In [None]:
### Word2Vec

In [None]:
from gensim.models import Word2Vec

In [None]:
# Train word embeddings on the tokenized descriptions in clean_corpus.
# size: desired dimension of our word vectors
# window: size of our context window
# min_count: minimum number of occurrences for a word to be kept (1 = keep every word)
# sg: using Skip-gram architecture
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; this call assumes an older gensim -- confirm the pinned version.
# NOTE(review): no seed is passed, so the learned vectors (and the neighbours shown below) may vary between runs.

model = gensim.models.Word2Vec(clean_corpus, size=10, window=2, min_count=1, sg=1)

In [None]:
# Sanity-check the embeddings: the 8 words whose vectors are closest to 'bright'
model.wv.most_similar('bright', topn=8)

In [None]:
### PCA for Scree Plot

In [None]:
# Choosing number of components with a scree plot
from sklearn.decomposition import PCA

In [None]:
# Fit a 200-component PCA on the TF-IDF document-term matrix and project the
# documents onto those components in one pass: fit_transform replaces the
# separate fit() + transform() calls, which went over the data twice.
# random_state pins the stochastic solvers so the scree plots below are
# reproducible from run to run.
# NOTE(review): X_tfidf comes from TfidfVectorizer and is a scipy sparse matrix.
# scikit-learn's PCA only accepts sparse input in recent releases / with some
# solvers; if this raises TypeError, TruncatedSVD is the documented alternative.
pca = PCA(n_components=200, random_state=42)
pcafeatures_train = pca.fit_transform(X_tfidf)

In [None]:
# Scree plot: share of variance explained by each principal component.
# Components are numbered from 1 so the x-axis matches its '# components'
# label (plotting the bare array would start the axis at 0).
scree_components = np.arange(1, len(pca.explained_variance_ratio_) + 1)

fig, ax = plt.subplots()
ax.plot(scree_components, pca.explained_variance_ratio_)
ax.set_xlabel('# components')
ax.set_ylabel('explained variance')
# The old title referred to the "digits dataset"; this data is wine descriptions.
ax.set_title('Scree plot for wine descriptions (TF-IDF)');

In [None]:
# Cumulative explained variance as components are added.
# Components are numbered from 1 so that the value at x = k is the variance
# explained by the first k components (plotting the bare array put the
# one-component value at x = 0).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
cumulative_components = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots()
ax.plot(cumulative_components, cumulative_variance)
ax.set_xlabel('# components')
ax.set_ylabel('cumulative explained variance')
# The old title referred to "digits"; this data is wine descriptions.
ax.set_title('Cumulative explained variance by PCA for wine descriptions');