## Model 1

In [1]:
import pandas as pd
import numpy as np
import nltk
import pyLDAvis.sklearn
from collections import Counter
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.set_option('display.max_colwidth', 100)


In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()


  and should_run_async(code)


In [3]:
wines = pd.read_csv('winesft_clean.csv')

wines.head()

  and should_run_async(code)


Unnamed: 0,country,points,price,province,variety,winery,lemmatized
0,US,96,235.0,California,Cabernet Sauvignon,Heitz,"['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '..."
1,Spain,96,110.0,Northern Spain,Tinta de Toro,Bodega Carmen Rodríguez,"['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc..."
2,US,96,90.0,California,Sauvignon Blanc,Macauley,"['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal..."
3,US,96,65.0,Oregon,Pinot Noir,Ponzi,"['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'..."
4,France,95,66.0,Provence,Provence red blend,Domaine de la Bégude,"['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi..."


### Sentiment Analysis

In [4]:
df = wines[['points','lemmatized']]
df.head()

  and should_run_async(code)


Unnamed: 0,points,lemmatized
0,96,"['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '..."
1,96,"['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc..."
2,96,"['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal..."
3,96,"['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'..."
4,95,"['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi..."


In [5]:
df['lemma_str'] = [''.join(map(str,l)) for l in df['lemmatized']]
df.head()

  and should_run_async(code)


Unnamed: 0,points,lemmatized,lemma_str
0,96,"['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '...","['tremendous', 'varietal', 'wine', 'hail', 'oakville', 'age', 'three', 'year', 'oak', 'juicy', '..."
1,96,"['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc...","['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'soften', 'sweetened', 'slather', 'oaky', 'choc..."
2,96,"['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal...","['mac', 'watson', 'honor', 'memory', 'wine', 'make', 'mother', 'tremendously', 'delicious', 'bal..."
3,96,"['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'...","['spent', 'month', 'new', 'french', 'oak', 'incorporate', 'fruit', 'ponzis', 'aurora', 'abetina'..."
4,95,"['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi...","['top', 'wine', 'la', 'begude', 'name', 'high', 'point', 'vineyard', 'foot', 'structure', 'densi..."


In [6]:
tf_vectorizer = CountVectorizer(max_df=0.9, min_df=25, max_features=5000)
tf = tf_vectorizer.fit_transform(df['lemma_str'].values.astype('U'))
tf_feature_names = tf_vectorizer.get_feature_names()
doc_term_matrix = pd.DataFrame(tf.toarray(), columns=list(tf_feature_names))
doc_term_matrix

  and should_run_async(code)


Unnamed: 0,ability,able,abound,abrasive,abrupt,abruzzo,absence,absolute,absolutely,absorb,...,zinfandel,zing,zingy,zinny,zins,zip,zippy,zone,zonin,zweigelt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150925,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150926,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150927,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150928,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', max_iter=500, random_state=0).fit(tf)
no_top_words = 10
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                          for i in topic.argsort()[:-no_top_words - 1:-1]]))
             
display_topics(lda_model, tf_feature_names, no_top_words)


  and should_run_async(code)


Topic 0:
flavor apple citrus acidity wine crisp peach white finish fruit
Topic 1:
fruit seem noir pinot new go acid perfume bit acidic
Topic 2:
cherry black tannin cabernet blend blackberry finish dry merlot drink
Topic 3:
wine import vineyard year vintage one grape time intense pie
Topic 4:
cherry flavor raspberry dry little cola spice simple pinot red
Topic 5:
flavor finish palate fruit berry plum aroma big feel herbal
Topic 6:
wine fruit spice aroma aromas berry mouth note bright offer
Topic 7:
flavor wine oak sweet rich good blackberry best like show
Topic 8:
finish flavor style palate nose sweet light candy hint note
Topic 9:
wine fruit acidity ripe age tannin year flavor rich structure


In [12]:
display_topics(lda_model, tf_feature_names, no_top_words)

Topic 0:
flavor apple citrus acidity wine crisp peach white finish fruit
Topic 1:
fruit seem noir pinot new go acid perfume bit acidic
Topic 2:
cherry black tannin cabernet blend blackberry finish dry merlot drink
Topic 3:
wine import vineyard year vintage one grape time intense pie
Topic 4:
cherry flavor raspberry dry little cola spice simple pinot red
Topic 5:
flavor finish palate fruit berry plum aroma big feel herbal
Topic 6:
wine fruit spice aroma aromas berry mouth note bright offer
Topic 7:
flavor wine oak sweet rich good blackberry best like show
Topic 8:
finish flavor style palate nose sweet light candy hint note
Topic 9:
wine fruit acidity ripe age tannin year flavor rich structure


  and should_run_async(code)


In [None]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, tf, tf_vectorizer, mds='tsne')
panel

  and should_run_async(code)
Exception in thread Thread-7:
Traceback (most recent call last):
  File "C:\Users\Threadripper\anaconda3\envs\winereviews\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "C:\Users\Threadripper\anaconda3\envs\winereviews\lib\site-packages\joblib\externals\loky\process_executor.py", line 555, in run
    result_item, is_broken, bpe = self.wait_result_broken_or_wakeup()
  File "C:\Users\Threadripper\anaconda3\envs\winereviews\lib\site-packages\joblib\externals\loky\process_executor.py", line 609, in wait_result_broken_or_wakeup
    ready = wait(readers + worker_sentinels)
  File "C:\Users\Threadripper\anaconda3\envs\winereviews\lib\multiprocessing\connection.py", line 869, in wait
    ready_handles = _exhaustive_wait(waithandle_to_obj.keys(), timeout)
  File "C:\Users\Threadripper\anaconda3\envs\winereviews\lib\multiprocessing\connection.py", line 801, in _exhaustive_wait
    res = _winapi.WaitForMultipleObjects(L, False, timeout)
ValueEr

In [None]:
%debug

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df =25, max_features=5000, use_idf=True)
tfidf = tfidf_vectorizer.fit_transform(df['lemma_str'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
doc_term_matrix_tfidf = pd.DataFrame(tfidf.toarray(), columns=list(tfidf_feature_names))
doc_term_matrix_tfidf

In [None]:
df['sentiment'] = df['lemma_str'].apply(lambda x: TextBlob(x).sentiment.polarity)
df.head()

In [None]:
plt.figure(figsize=(50,30))
plt.margins(0.02)
plt.xlabel('Sentiment', fontsize=50)
plt.xticks(fontsize=40)
plt.ylabel('Frequency', fontsize=50)
plt.yticks(fontsize=40)
plt.hist(df['sentiment'], bins=50)
plt.title('Sentiment Distribution', fontsize=60)
plt.show()

In [None]:
x_points = df.points.value_counts()
y_points = x_points.sort_index()
plt.figure(figsize=(50,30))
sns.barplot(x_points.index, x_points.values, alpha=0.8)
plt.title("Points Distribution", fontsize=50)
plt.ylabel('Frequency', fontsize=50)
plt.yticks(fontsize=40)
plt.xlabel('Taster Ratings', fontsize=50)
plt.xticks(fontsize=40)

In [None]:
import sweetviz as sv
df2 = wines[['points', 'price']]

my_report = sv.analyze(df2)

my_report.show_html()

In [None]:
words = df['lemmatized']
allwords = []
for wordlist in words:
    allwords += wordlist
print(allwords)

In [None]:
print(allwords[:10])

In [None]:
mostcommon = FreqDist(words).most_common(10)
wordcloud = WordCloud(width=1600, height=800, background_color='white').generate(str(mostcommon))
fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Top 100 Most Common Words', fontsize=100)
plt.tight_layout(pad=0)
plt.show()

In [None]:
mostcommon_small = FreqDist(allwords).most_common(25)
x, y = zip(*mostcommon_small)
plt.figure(figsize=(50,30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel('Words', fontsize=50)
plt.ylabel('Frequency of Words', fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
plt.title('Frequency of 25 Most Common Words', fontsize=60)
plt.show()

## Model 2

In [None]:
import numpy as np
import pandas as pd
import seaborn as sn
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
import tpot as tp
import contractions
import spacy
import tensorflow as tf
import keras as k
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
wines.description.head()

In [None]:
corpus = wines['description']
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus)

In [None]:
vectorizer.get_feature_names()

In [None]:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(10,10))
X2 = vectorizer2.fit_transform(corpus)

In [None]:
vectorizer2.get_feature_names()