In [1]:
#Imports
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
#nltk.download('stopwords')
#nltk.download('wordnet') 
#nltk.download('punkt')
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [2]:
#Visualization Imports
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 
%config InlineBackend.figure_formats = ['retina']
sns.set_style("white")
sns.set_palette('husl')
sns.set_context('talk')

#### Load pickled dataframe

In [3]:
df = pd.read_pickle('./clean_wine_data_w_target.pkl')

In [4]:
df.head()

Unnamed: 0,description,province,variety,target,target_code
0,aromas include tropical fruit broom brimston...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,907
1,delicate aromas recall white flower and citrus...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,907
2,pretty aromas of yellow flower and stone fruit...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,907
3,part of the extended calanìca series this gri...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,907
4,this offers heady aromas of honeysuckle white...,Sicily & Sardinia,White Blend,White Blend from Sicily & Sardinia,907


## Count Vectorizer I

Initial run to create Document-Term Matrix

In [5]:
# Create a document-term matrix using CountVectorizer
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(df.description)

data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = df.index
data_dtm.head()


Unnamed: 0,aacacia,aand,aaron,aas,ab,abacela,abacelas,abadal,abadia,abandon,...,élevé,élévage,émilion,émilions,étoile,über,überaromatic,überbest,ürzig,ürziger
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Takeaways:**
 - Way too many features, need to reduce dimensions
 - Increase list of stop words to target:
   - Domain stop words
   - Words only used in one review
   - Check size of matrix, then potentially remove other most-common words

### Deal with Stop Words

#### Domain Stop Words

In [6]:
domain_stopwords = ['tannins', 'flavors', 'flavor', 'drink', 
                    'wine', 'finish', 'hints', 'fruit', 
                    'notes', 'offers', 'aromas', 'style', 
                    'character', 'hint', 'bit', 'drinkable', 
                    'palate', 'imported']

# Took list of stopwords from a research paper which used the same basic dataset:
# Martinez, R., et al.; Grapevine: A Wine Prediction Algorithm Using Multi-dimensional Clustering Methods

#### Least-Used Words

In [41]:
# Create list of all documents
corpus = list(df.description)
corpus = ' '.join(corpus)
corpus[0:10]

'aromas inc'

In [42]:
# Tokenize all words in joined_corpus
tokens = [word for word in word_tokenize(corpus)]

In [44]:

########################## FIX THIS ######################





# Lemmatize words ---> Need to lemmatize earlier in workflow (maybe in earlier notebook)
lemmatized_tokens = [WordNetLemmatizer().lemmatize(word) for word in tokens]

In [45]:
# Frequency distribution of words in corpus
fdist = FreqDist(lemmatized_tokens)

In [46]:
# List of words used only once in corpus
used_once = fdist.hapaxes()
len(used_once)

9915

In [47]:
used_once

['edèlmio',
 'baccante',
 'setback',
 'alastro',
 'cavanera',
 'bestowed',
 'kindness',
 'guardian',
 'corino',
 'cdc',
 'rina',
 'ianca',
 'blanq',
 'provoked',
 'luncheon',
 'albanello',
 'donnafranca',
 'palas',
 'entemari',
 'waker',
 'impronta',
 'phantasmagorical',
 'caricante',
 'tascas',
 'catarrato',
 'kue',
 'plated',
 'cariddi',
 'ansonica',
 'mon',
 'giglio',
 'lecrù',
 'vv',
 'recycle',
 'underlaying',
 'meandro',
 'velhas',
 'serodio',
 'francescas',
 'fonte',
 'conceito',
 'cambres',
 'muxagat',
 'assobio',
 'fronteira',
 'romaneiras',
 'pinhao',
 'bagos',
 'faria',
 'serendipitously',
 'manzwine',
 'càlem',
 'redoma',
 'edmar',
 'redonda',
 'murcas',
 'altano',
 'wheatear',
 'espirito',
 'prehistoric',
 'carla',
 'crastos',
 'honored',
 'rede',
 'dodgy',
 'menezes',
 'montenegro',
 'passadouros',
 'oriole',
 'leda',
 'passa',
 '\xaddominates',
 'barricas',
 'rapport',
 'sagrado',
 'numão',
 'vallados',
 'recognizing',
 'ataíde',
 'abilio',
 'espoãos',
 'virility',
 'qua

In [25]:
# Add new stop words
from sklearn.feature_extraction import text 

stop_words = text.ENGLISH_STOP_WORDS.union(domain_stopwords, used_once)

In [26]:
# This stop words list should cut down on original 30,526-word vocabulary!
len(stop_words)

11183

## Count Vectorizer II

In [27]:
# Create a document-term matrix using CountVectorizer
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(df.description)

data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = df.index
data_dtm.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,aacacia,aaron,abacela,abacelas,abadal,abandon,abandoned,abate,abbazia,abbey,...,zweigelt,zédé,àmaurice,élevage,élevé,élévage,émilion,émilions,über,ürziger
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Find the words used less than 2 times in descriptions
top_dict = {}
for c in data.columns:
    top = data[c].sort_values(ascending=True).head(100)
    top_dict[c]= list(zip(top.index, top.values))

top_dict


KeyboardInterrupt: 

In [None]:
# Print the top 15 words said by each comedian
for comedian, top_words in top_dict.items():
    print(comedian)
    print(', '.join([word for word, count in top_words[0:14]]))
    print('---')

In [None]:
# Look at the least common words used (those used < 2 times) 
# --> add them to the stop word list

from collections import Counter

# Let's first pull out the top 30 words for each comedian
words = []
for comedian in data.columns:
    top = [word for (word, count) in top_dict[comedian]]
    for t in top:
        words.append(t)
        
words

In [None]:
# Let's aggregate this list and identify the most common words along with how many routines they occur in
Counter(words).most_common()

In [None]:
# If more than half of the comedians have it as a top word, exclude it from the list
add_stop_words = [word for word, count in Counter(words).most_common() if count > 6]
add_stop_words

#### Create list of tokenized descriptions

In [5]:
corpus = list(df['description'])

In [6]:
corpus[0:2]

[['aromas',
  'include',
  'tropical',
  'fruit',
  'broom',
  'brimstone',
  'and',
  'dried',
  'herb',
  'the',
  'palate',
  'isnt',
  'overly',
  'expressive',
  'offering',
  'unripened',
  'apple',
  'citrus',
  'and',
  'dried',
  'sage',
  'alongside',
  'brisk',
  'acidity'],
 ['delicate',
  'aromas',
  'recall',
  'white',
  'flower',
  'and',
  'citrus',
  'the',
  'palate',
  'offers',
  'passion',
  'fruit',
  'lime',
  'and',
  'white',
  'peach',
  'with',
  'a',
  'hint',
  'of',
  'mineral',
  'alongside',
  'bright',
  'acidity']]

In [None]:
# Load stop words
stop_words = stopwords.words('english')

In [None]:
# Remove stop words from corpus
clean_corpus = []

for desc in corpus:
    desc = [word for word in desc if word not in stop_words]
    clean_corpus.append(desc)

In [None]:
clean_corpus[0:8]

In [None]:
# Dataset-specific stop words
wine_stopwords = ['alongside', 'aroma', 'palate', 'offer', 'hint', 'include', 
                  'offering', 'recall', 'pretty', 'nose', 'note', 'lightly', 
                  'part', 'extended', 'series', 'show', 'backed', 'touch', 
                  'flavor', 'provides', 'companion', 'behind', 'mouthfeel', 
                  'could', 'plus', 'open', 'background', 'tone', 'stand', 
                  'isnt', 'expressive', 'mouth', 'wine', 'broad', 'generous', 
                  'term', 'would', 'make', 'tiny', 'blend']

In [None]:
# Remove stop words from corpus
cleaner_corpus = []

for desc in clean_corpus:
    desc = [word for word in desc if word not in wine_stopwords]
    cleaner_corpus.append(desc)

In [None]:
cleaner_corpus[0:8]

In [None]:
# Rejoin lists of words in each description for use in CV & TF-IDF
cleaner_corpus_joined = []

for doc in cleaner_corpus:
    joined = ' '.join(doc)
    cleaner_corpus_joined.append(joined)

In [None]:
### Then, build model that can determine varietal based on description (use variety as target)

In [None]:
### ModelSomm: Finally, build model that can determine varietal + Province based off description as a proxy for taste

In [None]:
## Create Document-Term Matrix from Wine Descriptions

## Count Vectorizer

**Goal: come up with most important vocabulary list for wine descriptions (aka distill wine descriptions down to most important parts) --> Figures**

In [None]:
cv = CountVectorizer()#ngram_range=(1,2))
X_cv = cv.fit_transform(cleaner_corpus_joined)

print(f"Dimensions of Document-term matrix: {X_cv.toarray().shape}")

In [None]:
# Checked out the vocab list
# cv.vocabulary_

In [None]:
#### TF-IDF

In [None]:
tfidfvec = TfidfVectorizer()#stop_words = 'english')#ngram_range=(1,2))
X_tfidf = tfidfvec.fit_transform(clean_corpus_joined)

print(f"Dimensions of Document-term matrix: {X_tfidf.toarray().shape}")

In [None]:
### PCA for Scree Plot

In [None]:
# Choosing number of components with a scree plot
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=200)
pca.fit(X_tfidf)
pcafeatures_train = pca.transform(X_tfidf)

In [None]:
plt.plot(pca.explained_variance_ratio_)
plt.xlabel('# components')
plt.ylabel('explained variance');
plt.title('Scree plot for digits dataset');

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('# components')
plt.ylabel('cumulative explained variance');
plt.title('Cumulative explained variance by PCA for digits');