## Topic Modeling

In [1]:
#imports
import nltk
#nltk.download('brown')
from nltk.corpus import brown
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

from spacy.lang.en.stop_words import STOP_WORDS as stopwords

from collections import Counter, defaultdict

nlp = spacy.load('en_core_web_sm')

In [2]:
#directory locations
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
processed_folder = parent_directory + '/data/wine-com/processed/'

  and should_run_async(code)


## LoadData

In [3]:
df = pd.read_csv(processed_folder + '1677432096.083379.txt', sep = '|')

  and should_run_async(code)


In [4]:
df.head()

Unnamed: 0,product_url,product_name,product_variety,product_origin,product_family,user_avg_rating,user_rating_count,winemaker_description,reviewer_name,reviewer_rating,reviewer_text
0,https://www.wine.com/product/proyecto-salvaje-...,Proyecto Salvaje del Moncayo Garnacha 2020,Grenache,"from Navarra, Spain",Red Wine,4.8,19,bright burgundy wine medium depth tobacco wild...,Decanter,92.0,part proyecto garnachas de españa collection s...
1,https://www.wine.com/product/proyecto-salvaje-...,Proyecto Salvaje del Moncayo Garnacha 2020,Grenache,"from Navarra, Spain",Red Wine,4.8,19,bright burgundy wine medium depth tobacco wild...,Wilfred Wong of Wine.com,91.0,commentary 2020 proyecto garnachas salvaje del...
2,https://www.wine.com/product/domaine-du-terme-...,Domaine du Terme Gigondas 2019,Rhone Red Blends,"from Gigondas, Rhone, France",Red Wine,4.0,17,,Wine & Spirits,96.0,spectacular gigondas wine red cherry flavors s...
3,https://www.wine.com/product/domaine-du-terme-...,Domaine du Terme Gigondas 2019,Rhone Red Blends,"from Gigondas, Rhone, France",Red Wine,4.0,17,,Decanter,94.0,straight first sniff clear going special soari...
4,https://www.wine.com/product/scott-harvey-moun...,Scott Harvey Mountain Selection Zinfandel 2019,Zinfandel,"from Amador, Sierra Foothills, California",Red Wine,4.3,39,fruit forward rich full flavors expressing var...,Wine Enthusiast,93.0,fresh smelling full bodied flavor packed wine ...


In [4]:
# This function comes from the BTAP repo.
def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print("  %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))

  and should_run_async(code)


## what type of wine in the df

In [5]:
# Check if the 'category' column exists in the DataFrame
if 'product_family' in df.columns:
    # Get a list of all unique categories in the DataFrame
    categories = df['product_family'].unique()

    # Loop through each category and print the number of articles in the DataFrame for that category
    for category in categories:
        num_reviews = len(df[df['product_family'] == category])
        print(f"For {category} we have {num_reviews} reviews.")
else:
    print("The 'product_family' column does not exist in the DataFrame.")

For Red Wine we have 12603 reviews.
For White Wine we have 5128 reviews.
For Champagne & Sparkling we have 1961 reviews.
For Rosé Wine we have 479 reviews.
For Dessert, Sherry, & Port we have 817 reviews.


  and should_run_async(code)


In the datafram there is 5 types of wine: red, white, champagne, rose, and dessert. the red wine has 12603 reviews, white with 5128 reviews, champange woth 1961 reviews, rose with 479 reviews and dessert with 817 reviews

## what review topics of wine in the df

In [6]:
# Check if the 'reviews' column exists in the DataFrame
if 'reviewer_text' in df.columns:
    # Count the number of non-null values in the 'reviews' column
    num_reviews = df['reviewer_text'].count()

    # Print the total number of reviews in the DataFrame
    print(f"There are {num_reviews} records in the DataFrame.")
else:
    print("The 'reviews' column does not exist in the DataFrame.")

There are 14494 records in the DataFrame.


  and should_run_async(code)


in our data frame there is 14494 reviews

## The shpae of the df

In [7]:
# Define the wine types to search for
wine_types = df['product_family'].unique()

# Define empty lists to hold the category, id, text, and review data for each wine type
category_list = []
id_list = []
text_list = []
review_list = []

# Loop through each wine type
for wine_type in wine_types:
    # Filter the DataFrame to select only the rows that match the wine type
    filtered_df = df[df['product_family'].str.contains(wine_type, case=False)]

    # Loop through each row in the filtered DataFrame
    for index, row in filtered_df.iterrows():
        # Add the category, id, text, and review data to their respective lists
        category_list.append(wine_type)
        id_list.append(row['product_name'])
        text_list.append(row['winemaker_description'])
        review_list.append(row['reviewer_text'])

# Create a new DataFrame from the category, id, text, and review lists
wine_df = pd.DataFrame({
    'category': category_list,
    'id': id_list,
    'text': text_list,
    'reviews': review_list
})

# Print the shape of the new DataFrame
print(wine_df.shape)

  and should_run_async(code)


(20988, 4)


In [8]:
wine_df.head()

  and should_run_async(code)


Unnamed: 0,category,id,text,reviews
0,Red Wine,Proyecto Salvaje del Moncayo Garnacha 2020,bright burgundy wine medium depth tobacco wild...,part proyecto garnachas de españa collection s...
1,Red Wine,Proyecto Salvaje del Moncayo Garnacha 2020,bright burgundy wine medium depth tobacco wild...,commentary 2020 proyecto garnachas salvaje del...
2,Red Wine,Domaine du Terme Gigondas 2019,,spectacular gigondas wine red cherry flavors s...
3,Red Wine,Domaine du Terme Gigondas 2019,,straight first sniff clear going special soari...
4,Red Wine,Scott Harvey Mountain Selection Zinfandel 2019,fruit forward rich full flavors expressing var...,fresh smelling full bodied flavor packed wine ...


## Review 

In [9]:
wine_df['reviews'] = wine_df['reviews'].fillna('')

  and should_run_async(code)


## Creating the CountVectorizer and TfidVectorizer

In [10]:
#Using the CountVectorizer with the stopwords
count_reviews_vectorizer = CountVectorizer(stop_words=stopwords, min_df=5, max_df=0.7)
#Applying the CountVectorizer to the wine_df reviews column 
count_reviews_vectors = count_reviews_vectorizer.fit_transform(wine_df["reviews"])
count_reviews_vectors.shape

  and should_run_async(code)


(20988, 5575)

In [11]:
#Using the TfidfVectorizer with the stopwords
tfidf_reviews_vectorizer = TfidfVectorizer(stop_words=stopwords, min_df=5, max_df=0.7)
#Applying the TfidfVectorizer to the wine_df reviews column 
tfidf_reviews_vectors = tfidf_reviews_vectorizer.fit_transform(wine_df["reviews"])
tfidf_reviews_vectors.shape

  and should_run_async(code)


(20988, 5575)

## Fitting a Non-Negative Matrix Factorization Model

In [12]:
nmf_reviews_model = NMF(n_components=5, random_state=314)
W_reviews_matrix = nmf_reviews_model.fit_transform(tfidf_reviews_vectors)
H_reviews_matrix = nmf_reviews_model.components_

  and should_run_async(code)


In [13]:
display_topics(nmf_reviews_model, tfidf_reviews_vectorizer.get_feature_names())


Topic 00
  medium (3.35)
  tannins (2.76)
  bodied (2.66)
  body (1.92)
  drink (1.67)

Topic 01
  commentary (2.97)
  tasting (2.93)
  san (2.92)
  francisco (2.90)
  tasted (2.86)

Topic 02
  cabernet (2.03)
  sauvignon (1.27)
  merlot (0.90)
  wine (0.84)
  franc (0.83)

Topic 03
  white (1.33)
  lemon (1.20)
  acidity (1.03)
  fresh (1.02)
  apple (0.93)

Topic 04
  cherry (1.90)
  red (1.55)
  black (1.50)
  flavors (1.35)
  fruit (1.12)


  and should_run_async(code)


top 5 topics and their top 5 associated words

## Fitting an LSA Model

In [29]:
#creating the SVD model
from sklearn.decomposition import TruncatedSVD
svd_reviews_model = TruncatedSVD(n_components = 10, random_state=42)

In [30]:
#creating the matrix 
W_svd_reviews_matrix = svd_reviews_model.fit_transform(tfidf_reviews_vectors)
H_svd_reviews_matrix = svd_reviews_model.components_

In [31]:
#saving the matrix of the new topic 
wine_df["svd_topic_reviews"] = np.argmax(W_svd_reviews_matrix, axis = 1).astype(str)

In [32]:
#for loop that count the unique topic 
for label in wine_df['svd_topic_reviews'].unique():
    print(f"topic {label} observes original label counts of:")
    print(wine_df[wine_df['svd_topic_reviews'] == label]['category'].value_counts())
    print()

topic 0 observes original label counts of:
Red Wine                   11040
White Wine                  4696
Champagne & Sparkling       1481
Dessert, Sherry, & Port      765
Rosé Wine                    419
Name: category, dtype: int64

topic 1 observes original label counts of:
Red Wine                   482
White Wine                 242
Champagne & Sparkling      172
Rosé Wine                   55
Dessert, Sherry, & Port     25
Name: category, dtype: int64

topic 4 observes original label counts of:
Red Wine                   248
Dessert, Sherry, & Port     12
Champagne & Sparkling        4
White Wine                   3
Rosé Wine                    2
Name: category, dtype: int64

topic 2 observes original label counts of:
Red Wine                   392
Dessert, Sherry, & Port      4
White Wine                   1
Name: category, dtype: int64

topic 8 observes original label counts of:
Red Wine                 23
Champagne & Sparkling     3
Rosé Wine                 3
Name: categor

LSA model showed that topic 0 and 1 have the most relevants to the different wine types. LSA created 10 topics in toll.

## Fitting an LDA Model

In [27]:
# Fit your LDA model here
from sklearn.decomposition import LatentDirichletAllocation
lda_reviews_model = LatentDirichletAllocation(n_components = 10, random_state=42)

  and should_run_async(code)


In [28]:
#creating the matrix 
W_lda_reviews_matrix = lda_reviews_model.fit_transform(count_reviews_vectors)
H_lda_reviews_matrix = lda_reviews_model.components_

  and should_run_async(code)


In [29]:
#saving the matrix of the new topic 
wine_df["lda_topic_reviews"] = np.argmax(W_lda_reviews_matrix, axis = 1).astype(str)

  and should_run_async(code)


In [30]:
#for loop that count the unique topic 
for label in wine_df['lda_topic_reviews'].unique():
    print(f"topic {label} observes original label counts of:")
    print(wine_df[wine_df['lda_topic_reviews'] == label]['category'].value_counts())
    print()

topic 6 observes original label counts of:
Red Wine                   1303
White Wine                  320
Dessert, Sherry, & Port     176
Champagne & Sparkling        70
Rosé Wine                    16
Name: category, dtype: int64

topic 2 observes original label counts of:
Red Wine                   485
White Wine                 244
Champagne & Sparkling      174
Rosé Wine                   55
Dessert, Sherry, & Port     28
Name: category, dtype: int64

topic 0 observes original label counts of:
Red Wine                   5166
White Wine                 2299
Champagne & Sparkling       420
Rosé Wine                   258
Dessert, Sherry, & Port     205
Name: category, dtype: int64

topic 3 observes original label counts of:
Red Wine                   2272
Dessert, Sherry, & Port      63
Champagne & Sparkling        39
Rosé Wine                    26
White Wine                   26
Name: category, dtype: int64

topic 8 observes original label counts of:
Red Wine                   822

  and should_run_async(code)


THe LDA model has then topics topic 0, 1, 4 have the highest observation for all five wine categories. topic 0 is able capture the overall characteristics of the different wine, but topic 1 if able to differentiate from red and white wines. last topic 4 can be used to identifying champagne from the different types of wine 

In [31]:
lda_display_review = pyLDAvis.sklearn.prepare(lda_reviews_model, count_reviews_vectors, count_reviews_vectorizer, sort_topics=False)

  and should_run_async(code)


In [32]:
pyLDAvis.display(lda_display_review)

  and should_run_async(code)


## Hyperparameter tuning 

### Fitting an LSA Model

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords, min_df=5, max_df=0.7)),
    ('svd', TruncatedSVD(random_state=42))
])

# Define the hyperparameter grid
param_grid = {
    'svd__n_components': [5, 10, 15],
    'svd__algorithm': ['randomized', 'arpack', 'full'],
    'svd__tol': [0.0001, 0.001, 0.01]
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(wine_df["reviews"])

# Print the best hyperparameters
print("Best parameters: ", grid_search.best_params_)

  and should_run_async(code)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: __call__() missing 1 required positional argument: 'y_true'

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 684, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_para

Best parameters:  {'svd__algorithm': 'randomized', 'svd__n_components': 5, 'svd__tol': 0.0001}


using the gridSearchCV it gave the the best parameters to be n_componets to be 10, algorithm='randomized', tol=0.001

In [15]:
hyp_svd_reviews_model = TruncatedSVD(n_components=10, random_state=42, algorithm='randomized', tol=0.001)

  and should_run_async(code)


In [16]:
#creating the matrix 
W_h_svd_reviews_matrix = hyp_svd_reviews_model.fit_transform(tfidf_reviews_vectors)
H_h_svd_reviews_matrix = hyp_svd_reviews_model.components_

  and should_run_async(code)


In [17]:
wine_df["svd_topic_reviews_H"] = np.argmax(W_h_svd_reviews_matrix, axis = 1).astype(str)

  and should_run_async(code)


In [18]:
#for loop that count the unique topic 
for label in wine_df['svd_topic_reviews_H'].unique():
    print(f"topic {label} observes original label counts of:")
    print(wine_df[wine_df['svd_topic_reviews_H'] == label]['category'].value_counts())
    print()

topic 0 observes original label counts of:
Red Wine                   11040
White Wine                  4696
Champagne & Sparkling       1481
Dessert, Sherry, & Port      765
Rosé Wine                    419
Name: category, dtype: int64

topic 1 observes original label counts of:
Red Wine                   482
White Wine                 242
Champagne & Sparkling      172
Rosé Wine                   55
Dessert, Sherry, & Port     25
Name: category, dtype: int64

topic 4 observes original label counts of:
Red Wine                   248
Dessert, Sherry, & Port     12
Champagne & Sparkling        4
White Wine                   3
Rosé Wine                    2
Name: category, dtype: int64

topic 2 observes original label counts of:
Red Wine                   392
Dessert, Sherry, & Port      4
White Wine                   1
Name: category, dtype: int64

topic 8 observes original label counts of:
Red Wine                 23
Champagne & Sparkling     3
Rosé Wine                 3
Name: categor

  and should_run_async(code)


topic 0 and 1 are still the best topics for differentiating the wine types after hyperrameter tuning the LSA

## Fitting an LDA Model

In [19]:
from sklearn.metrics import make_scorer
import numpy as np

# Define a custom scorer based on the log-likelihood metric for LDA
def log_likelihood_scorer(model, X):
    return np.sum(model.score(X))

# Create the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('lda', LatentDirichletAllocation())
])

# Define the hyperparameter grid
param_grid = {
    'vectorizer__max_df': [0.5, 0.75],
    'vectorizer__min_df': [1, 2],
    'lda__n_components': [5, 10],
    'lda__learning_method': ['batch', 'online']
}

# Perform grid search with the custom scorer
grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring=make_scorer(log_likelihood_scorer), refit=True)
grid_search.fit(wine_df["reviews"])

# Print the best hyperparameters
print("Best parameters: ", grid_search.best_params_)

  and should_run_async(code)


Best parameters:  {'lda__learning_method': 'batch', 'lda__n_components': 5, 'vectorizer__max_df': 0.5, 'vectorizer__min_df': 1}


using the gridSearchCV it gave the the best parameters to be n_componets to be 5, learning_method='batch'

In [20]:
lda_reviews_model_H = LatentDirichletAllocation(learning_method='batch', n_components = 5, random_state=42,)

  and should_run_async(code)


In [21]:
#creating the matrix 
W_lda_H_reviews_matrix = lda_reviews_model_H.fit_transform(count_reviews_vectors)
H_lda_H_reviews_matrix = lda_reviews_model_H.components_

  and should_run_async(code)


In [22]:
#saving the matrix of the new topic 
wine_df["lda_topic_reviews_H"] = np.argmax(W_lda_H_reviews_matrix, axis = 1).astype(str)

  and should_run_async(code)


In [23]:
#for loop that count the unique topic 
for label in wine_df['lda_topic_reviews_H'].unique():
    print(f"topic {label} observes original label counts of:")
    print(wine_df[wine_df['lda_topic_reviews_H'] == label]['category'].value_counts())
    print()

topic 0 observes original label counts of:
Red Wine                   6029
White Wine                 2535
Champagne & Sparkling       435
Rosé Wine                   253
Dessert, Sherry, & Port     249
Name: category, dtype: int64

topic 2 observes original label counts of:
Red Wine                   568
White Wine                 268
Champagne & Sparkling      175
Rosé Wine                   55
Dessert, Sherry, & Port     26
Name: category, dtype: int64

topic 1 observes original label counts of:
Red Wine                   1951
White Wine                 1645
Champagne & Sparkling       615
Dessert, Sherry, & Port     281
Rosé Wine                   110
Name: category, dtype: int64

topic 3 observes original label counts of:
Red Wine                   3761
Dessert, Sherry, & Port      63
White Wine                   10
Rosé Wine                     9
Champagne & Sparkling         6
Name: category, dtype: int64

topic 4 observes original label counts of:
Champagne & Sparkling      730

  and should_run_async(code)


It looks like the tuned LDA redused th eobservation in each topic beside in topic 0, most of the topics are dominated by the red wine hyper tuning didn't make a big difference

In [24]:
lda_display_review_H = pyLDAvis.sklearn.prepare(lda_reviews_model_H, count_reviews_vectors, count_reviews_vectorizer, sort_topics=False)
pyLDAvis.display(lda_display_review_H)

  and should_run_async(code)


## Description 

In [33]:
wine_df['text'] = wine_df['text'].fillna('')

  and should_run_async(code)


In [34]:
#Using the CountVectorizer with the stopwords
count_text_vectorizer = CountVectorizer(stop_words=stopwords, min_df=5, max_df=0.7)
#Applying the CountVectorizer to the wine_df reviews column 
count_text_vectors = count_text_vectorizer.fit_transform(wine_df["text"])
count_text_vectors.shape

  and should_run_async(code)


(20988, 7626)

In [35]:
#Using the TfidfVectorizer with the stopwords
tfidf_text_vectorizer = TfidfVectorizer(stop_words=stopwords, min_df=5, max_df=0.7)
#Applying the TfidfVectorizer to the wine_df reviews column 
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(wine_df["text"])
tfidf_text_vectors.shape

  and should_run_async(code)


(20988, 7626)

## Fitting a Non-Negative Matrix Factorization Model

In [36]:
nmf_text_model = NMF(n_components=5, random_state=314)
W_text_matrix = nmf_text_model.fit_transform(tfidf_text_vectors)
H_text_matrix = nmf_text_model.components_

  and should_run_async(code)


In [37]:
display_topics(nmf_text_model, tfidf_text_vectorizer.get_feature_names())


Topic 00
  red (3.38)
  intense (1.71)
  ruby (1.69)
  color (1.43)
  nose (1.19)

Topic 01
  cabernet (9.04)
  merlot (5.63)
  sauvignon (5.27)
  blend (4.59)
  franc (4.33)

Topic 02
  white (1.45)
  fresh (1.32)
  citrus (1.23)
  yellow (1.05)
  notes (0.92)

Topic 03
  black (2.20)
  cherry (1.68)
  dark (1.66)
  flavors (1.15)
  plum (1.12)

Topic 04
  pinot (1.31)
  wine (1.21)
  noir (1.02)
  chardonnay (0.62)
  blend (0.61)


  and should_run_async(code)


top 5 topics and their top 5 associated words

## Fitting an LSA Model

In [38]:
svd_text_model = TruncatedSVD(n_components = 10, random_state=42)

  and should_run_async(code)


In [39]:
W_svd_text_matrix = svd_text_model.fit_transform(tfidf_text_vectors)
H_svd_text_matrix = svd_text_model.components_

  and should_run_async(code)


In [41]:
wine_df["svd_topic_text"] = np.argmax(W_svd_text_matrix, axis = 1).astype(str)

  and should_run_async(code)


In [43]:
#for loop that count the unique topic 
for label in wine_df['svd_topic_text'].unique():
    print(f"topic {label} observes original label counts of:")
    print(wine_df[wine_df['svd_topic_text'] == label]['category'].value_counts())
    print()

topic 0 observes original label counts of:
Red Wine                   9594
White Wine                 3620
Champagne & Sparkling      1277
Dessert, Sherry, & Port     664
Rosé Wine                   358
Name: category, dtype: int64

topic 4 observes original label counts of:
Red Wine                   1056
White Wine                  563
Champagne & Sparkling       260
Dessert, Sherry, & Port     116
Rosé Wine                    26
Name: category, dtype: int64

topic 6 observes original label counts of:
Red Wine                   102
White Wine                  85
Rosé Wine                   14
Champagne & Sparkling        7
Dessert, Sherry, & Port      3
Name: category, dtype: int64

topic 1 observes original label counts of:
Red Wine                   1138
White Wine                   12
Dessert, Sherry, & Port       6
Rosé Wine                     5
Champagne & Sparkling         2
Name: category, dtype: int64

topic 3 observes original label counts of:
Red Wine                   88


  and should_run_async(code)


The LSA model shows bias towards red wine when using description data with topic 0 having the highest cpunt for red wine

## Fitting an LDA Model

In [44]:
lda_text_model = LatentDirichletAllocation(n_components = 10, random_state=42)

  and should_run_async(code)


In [45]:
#creating the matrix 
W_lda_text_matrix = lda_text_model.fit_transform(count_text_vectors)
H_lda_text_matrix = lda_text_model.components_

  and should_run_async(code)


In [46]:
#saving the matrix of the new topic 
wine_df["lda_topic"] = np.argmax(W_lda_text_matrix, axis = 1).astype(str)

  and should_run_async(code)


In [47]:
#for loop that count the unique topic 
for label in wine_df['lda_topic'].unique():
    print(f"topic {label} observes original label counts of:")
    print(wine_df[wine_df['lda_topic'] == label]['category'].value_counts())
    print()

topic 9 observes original label counts of:
Red Wine                   740
Champagne & Sparkling      275
White Wine                  67
Rosé Wine                   21
Dessert, Sherry, & Port      7
Name: category, dtype: int64

topic 0 observes original label counts of:
White Wine                 2195
Red Wine                   1411
Champagne & Sparkling       682
Dessert, Sherry, & Port     206
Rosé Wine                   135
Name: category, dtype: int64

topic 6 observes original label counts of:
Red Wine                   4642
Dessert, Sherry, & Port      77
Rosé Wine                    18
White Wine                   18
Champagne & Sparkling         9
Name: category, dtype: int64

topic 5 observes original label counts of:
Red Wine                   2910
Dessert, Sherry, & Port      53
White Wine                   27
Champagne & Sparkling        22
Rosé Wine                     9
Name: category, dtype: int64

topic 8 observes original label counts of:
White Wine                 919

  and should_run_async(code)


topic 0 has th best descrioption for the wine types it has the close observation count to the original were topic 6 and 5 are specific to wine types. 6 focuses on red wines and topic 5 is focused on white and champagne

In [48]:
lda_display = pyLDAvis.sklearn.prepare(lda_text_model, count_text_vectors, count_text_vectorizer, sort_topics=False)

  and should_run_async(code)


In [49]:
pyLDAvis.display(lda_display)

  and should_run_async(code)


In [50]:
# Display both visualizations
pyLDAvis.display(lda_display)
pyLDAvis.display(lda_display_review)

  and should_run_async(code)
