## Homework 4  Build Topics Using BERTopic.

# BERTopic is a topic modeling technique that leverages transformers and c-TF-IDF (https://maartengr.github.io/BERTopic/api/ctfidf.html) to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

In [None]:
### Show the current working directory so the CSV can be loaded from it

import os

# Look the directory up once, then print it with the same label as before
current_dir = os.getcwd()
print('getcwd:      ', current_dir)

import pandas as pd

In [None]:
##  Install BERTopic  https://maartengr.github.io/BERTopic/index.html
##  pip is run through sys.executable, i.e. the interpreter running this notebook.

import sys
!{sys.executable} -m pip install BERTopic

In [None]:
##  Other packages
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import numpy as np

In [None]:
 #create a pandas dataframe
csv_path = 'winemag-data-130k-v2.csv'
df = pd.read_csv(csv_path)

# Preview the first 5 rows of the dataframe
df.head()

In [None]:
# Keep only the reviews of wines from Italy.
# NOTE(review): assumes the CSV has a `country` column holding the value
# 'Italy' (as in the Kaggle wine-reviews dataset) - confirm against df.head().
df = df.loc[df['country'] == 'Italy']
df.head()

In [None]:
# Collect the review descriptions as an array of documents for topic modeling
docs = df["description"].values

## The BERTopic algorithm has five primary steps:

1. Extract embeddings - Uses the sentence transformer library
2. Reduce dimensionality - Uses the UMAP algorithm
3. Cluster reduced embeddings - Uses HDBSCAN to form clusters of different shapes
4. Tokenize topics - Uses bag of words
5. Extract topic words - Uses class-based TF-IDF (c-TF-IDF)

In [None]:
# Create a BERTopic model; no sub-models are passed, so the library defaults are used
topic_model = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
)

# Fit the model on the descriptions and generate the topics.
# Note that training will take a few minutes.
#
# Background on batch size: it is a hyperparameter that defines how many
# samples a machine learning model works through before its internal
# parameters are updated. A batch can be thought of as a for-loop that
# iterates over one or more samples and makes predictions.
# NOTE(review): the original note suggested BERTopic's default batch size
# cannot be changed - unverified, confirm against the BERTopic documentation.
# https://python.plainenglish.io/topic-modeling-for-beginners-using-bertopic-and-python-aaf1b421afeb

topics, probs = topic_model.fit_transform(docs)

In [None]:
# Display the summary information for the topics found by the fitted model
topic_model.get_topic_info()

In [None]:
# Display one topic of your choice, then annotate it.
# 0 is only an example topic id - replace it with the topic you want to inspect.
topic_model.get_topic(0)

In [None]:
# Repeat with a different topic.
# 1 is only an example topic id - replace it with the topic you want to inspect.
topic_model.get_topic(1)

In [None]:
##  Example: build BERTopic from explicitly configured sub-models, one per step.

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Wire the five components into a single model
topic_model = BERTopic(
    embedding_model=embedding_model,    # Step 1 - Extract embeddings
    umap_model=umap_model,              # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the descriptions, then display the topic summary
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()

In [None]:
## Render the following visualizations and answer the questions in a markdown cell you insert just above the viz.

pip install bertopic[visualization]

## Reference https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-topics