## Model 1

In [1]:
import pandas as pd
import numpy as np
import nltk
import pyLDAvis.sklearn
from collections import Counter
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.set_option('display.max_colwidth', 100)


In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()


  and should_run_async(code)


In [3]:
wines = pd.read_csv('winesft_clean.csv')

wines.head()

  and should_run_async(code)


Unnamed: 0,country,points,price,province,variety,winery,lemmatized
0,US,96,235.0,California,Cabernet Sauvignon,Heitz,"['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '..."
1,Spain,96,110.0,Northern Spain,Tinta de Toro,Bodega Carmen Rodríguez,"['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc..."
2,US,96,90.0,California,Sauvignon Blanc,Macauley,"['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal..."
3,US,96,65.0,Oregon,Pinot Noir,Ponzi,"['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'..."
4,France,95,66.0,Provence,Provence red blend,Domaine de la Bégude,"['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi..."


### Sentiment Analysis

In [4]:
df = wines[['points','lemmatized']]
df.head()

  and should_run_async(code)


Unnamed: 0,points,lemmatized
0,96,"['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '..."
1,96,"['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc..."
2,96,"['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal..."
3,96,"['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'..."
4,95,"['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi..."


In [5]:
df['lemma_str'] = [''.join(map(str,l)) for l in df['lemmatized']]
df.head()

  and should_run_async(code)


Unnamed: 0,points,lemmatized,lemma_str
0,96,"['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '...","['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '..."
1,96,"['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc...","['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc..."
2,96,"['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal...","['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal..."
3,96,"['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'...","['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'..."
4,95,"['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi...","['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi..."


### LDA Model

In [6]:
tf_vectorizer = CountVectorizer(max_df=0.9, min_df=25, max_features=5000)
tf = tf_vectorizer.fit_transform(df['lemma_str'].values.astype('U'))
tf_feature_names = tf_vectorizer.get_feature_names()
doc_term_matrix = pd.DataFrame(tf.toarray(), columns=list(tf_feature_names))
doc_term_matrix

  and should_run_async(code)


Unnamed: 0,ability,able,abound,abrasive,abrupt,abruzzo,absence,absolute,absolutely,absorb,...,zinfandel,zing,zingy,zinny,zins,zip,zippy,zone,zonin,zweigelt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150925,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150926,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150927,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150928,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', max_iter=500, random_state=0).fit(tf)
no_top_words = 10
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                          for i in topic.argsort()[:-no_top_words - 1:-1]]))
             
display_topics(lda_model, tf_feature_names, no_top_words)


  and should_run_async(code)


Topic 0:
flavor apple citrus acidity wine crisp peach white finish fruit
Topic 1:
fruit seem noir pinot new go acid perfume bit acidic
Topic 2:
cherry black tannin cabernet blend blackberry finish dry merlot drink
Topic 3:
wine import vineyard year vintage one grape time intense pie
Topic 4:
cherry flavor raspberry dry little cola spice simple pinot red
Topic 5:
flavor finish palate fruit berry plum aroma big feel herbal
Topic 6:
wine fruit spice aroma aromas berry mouth note bright offer
Topic 7:
flavor wine oak sweet rich good blackberry best like show
Topic 8:
finish flavor style palate nose sweet light candy hint note
Topic 9:
wine fruit acidity ripe age tannin year flavor rich structure


### TF-IDF-NMF Model

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df =25, max_features=5000, use_idf=True)
tfidf = tfidf_vectorizer.fit_transform(df['lemma_str'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
doc_term_matrix_tfidf = pd.DataFrame(tfidf.toarray(), columns=list(tfidf_feature_names))
doc_term_matrix_tfidf

  and should_run_async(code)


Unnamed: 0,ability,able,abound,abrasive,abrupt,abruzzo,absence,absolute,absolutely,absorb,...,zinfandel,zing,zingy,zinny,zins,zip,zippy,zone,zonin,zweigelt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
nmf = NMF(n_components=10, random_state=0, alpha=.1, init='nndsvd').fit(tfidf)
display_topics(nmf, tfidf_feature_names, no_top_words)

  and should_run_async(code)


Topic 0:
finish berry palate nose plum aroma herbal note flavor feel
Topic 1:
oak vanilla toast pineapple chardonnay butter rich flavor creamy acidity
Topic 2:
wine fruit age year ripe structure wood rich well tannin
Topic 3:
pinot cherry noir raspberry cola silky flavor red drink spice
Topic 4:
acidity fresh light fruity wine bright red drink crisp attractive
Topic 5:
sweet soft simple taste like flavor candy wine little jammy
Topic 6:
cabernet blend sauvignon merlot franc syrah verdot petit blanc sangiovese
Topic 7:
apple citrus white peach green crisp pear lemon clean lime
Topic 8:
black cherry spice tannin dark pepper chocolate offer blackberry licorice
Topic 9:
dry blackberry good currant flavor tannic show drink tannin wine




### Comparing topics generated between LDA and NMF models:

Looking over the two sets if topics, it appears the NMF model generates much better overall topic descriptions than the LDA model based a sommelier book called Wine Folly which offer descriptions on wines and how to translate the different descriptions. (will be creating a vectorization model from this later)

In [10]:
nmf_topic_values = nmf.transform(tfidf)
df['nmf_topics'] = nmf_topic_values.argmax(axis=1)
lda_topic_values = lda_model.transform(tf)
df['lda_topics'] = lda_topic_values.argmax(axis=1)
lda_remap = {0: 'Fruity Crisp Acidic', 1: 'Medium-bodied Red Acidic Perfumy', 2: 'Bitter Full-Bodied Alcohol', 3: 'Skill Building', 4: 'Difficult but Enjoyable Work', 5: 'Great Company/Job', 6: 'Care about Employees', 7: 'Great Contractor Pay', 8: 'Customer Service', 9: 'Unknown1'}
df['lda_topics'] = df['lda_topics'].map(lda_remap)
nmf_remap = {0: 'Fun Work Culture', 1: 'Design Process', 2: 'Enjoyable Job', 3: 'Difficult but Enjoyable Work', 
             4: 'Great Experience', 5: 'Perks', 6: 'Learning Opportunities', 7: 'Great Company/Job', 
             8: 'Contractor Employee Experience', 9: 'Management'}
df['nmf_topics'] = df['nmf_topics'].map(nmf_remap)

  and should_run_async(code)


In [11]:
df.head()

  and should_run_async(code)


Unnamed: 0,points,lemmatized,lemma_str,nmf_topics,lda_topics
0,96,"['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '...","['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '...",Enjoyable Job,Difficult but Enjoyable Work
1,96,"['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc...","['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc...",Contractor Employee Experience,Great Company/Job
2,96,"['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal...","['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal...",Design Process,Fruity Crisp Acidic
3,96,"['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'...","['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'...",Contractor Employee Experience,Bitter Full-Bodied Alcohol
4,95,"['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi...","['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi...",Enjoyable Job,Medium-bodied Red Acidic Perfumy


In [12]:
df.to_csv('winesft_w_models.csv', index=False, encoding='utf-8')

  and should_run_async(code)
