# Topic modelling with Llama2
Inspired by https://maartengrootendorst.substack.com/p/topic-modeling-with-llama-2

In [1]:
%%capture
!pip install bertopic datasets accelerate bitsandbytes xformers adjustText

# DATA

In [2]:
#visualization tool for displaying long load/processing times
!pip install tqdm --quiet
#data processing
!pip install pandas --quiet
#workhorse for converting text into embeddings/vectors
!pip install sentence-transformers==2.2.2 --quiet
#data framework for LLM applications
!pip install llama-index==0.9.6.post1 --quiet
#logging output
!pip install loguru==0.7.0 --quiet
#convenient pretty printing library
!pip install rich==13.7.0 --quiet
#openai Tokenizer library
!pip install tiktoken --quiet

In [3]:
!curl -o preprocessing.py https://raw.githubusercontent.com/americanthinker/vectorsearch-applications/main/preprocessing.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  4625  100  4625    0     0  22641      0 --:--:-- --:--:-- --:--:-- 22671


In [4]:
!curl -o unitesting_utils.py https://raw.githubusercontent.com/americanthinker/vectorsearch-applications/main/unitesting_utils.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1495  100  1495    0     0   7694      0 --:--:-- --:--:-- --:--:--  7706


In [5]:
!curl -o impact_theory_data.json https://raw.githubusercontent.com/americanthinker/vectorsearch-applications/main/data/impact_theory_data.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 25.6M  100 25.6M    0     0  30.2M      0 --:--:-- --:--:-- --:--:-- 30.2M


In [6]:
#standard libraries
import json
import os
import time
from collections import defaultdict
from typing import List, Dict, Tuple, Union, Callable
from math import ceil

#external libraries
import pandas as pd
import numpy as np
from rich import print
from torch import cuda
from tqdm.notebook import tqdm

#external files
from preprocessing import FileIO

In [7]:
# Name of the dataset file downloaded by the curl cell above
data_file = 'impact_theory_data.json'
# On Google Colab the root working folder is /content/
root_folder = '/content/'
# Full path to the dataset; left as the last expression so the cell displays it
data_path = os.path.join(root_folder, data_file)
data_path

'/content/impact_theory_data.json'

In [61]:
# Load the episode records from the downloaded JSON file (path is relative to
# the current working directory, where the curl cell saved it).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(data_file, encoding='utf-8') as f:
    data = json.load(f)
print(f'Total # of episodes: {len(data)}')

In [64]:
# Inspect the keys of the first record in `data` to see which fields are available
print(data[0].keys())

In [71]:
# Pull the summary text and the video id out of every episode record
contents = list(map(lambda record: record['summary'], data))
video_ids = list(map(lambda record: record['video_id'], data))
# Whitespace-delimited word count per summary
content_lengths = list(map(lambda text: len(text.split()), contents))
# Summary statistics of the word counts (displayed as the cell output)
df = pd.DataFrame({'# Words': content_lengths})
df.describe()

Unnamed: 0,# Words
count,384.0
mean,204.958333
std,65.135916
min,86.0
25%,158.75
50%,191.0
75%,235.25
max,443.0


In [72]:
# Rule-of-thumb conversion factor from words to tokens
token_to_word_ratio = 1.3
# Mean summary length in words, rounded up to a whole word
mean_word_count = ceil(np.mean(content_lengths))
# Rough token estimate for an average summary, rounded up
approx_token_count = ceil(mean_word_count * token_to_word_ratio)
print(
    f'The mean word count for each episode is about {mean_word_count} words, '
    f'which corresponds to a rough token count of {approx_token_count} tokens.'
)

# 🤗 HuggingFace Hub Credentials
Before we can load in Llama2 using a number of tricks, we will first need to accept the License for using Llama2. The steps are as follows:


* Create a HuggingFace account [here](https://huggingface.co)
* Apply for Llama 2 access [here](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
* Get your HuggingFace token [here](https://huggingface.co/settings/tokens)

After doing so, we can login with our HuggingFace credentials so that this environment knows we have permission to download the Llama 2 model that we are interested in.

In [11]:
from huggingface_hub import notebook_login
# Render the Hugging Face login widget (see the cell output) so the access
# token can be entered interactively instead of being hard-coded in the notebook
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 🦙 **Llama 2**

Now comes one of the more interesting components of this tutorial, how to load in a Llama 2 model on a T4-GPU!

We will be focusing on the `'meta-llama/Llama-2-13b-chat-hf'` variant. It is large enough to give interesting and useful results whilst small enough that it can be run on our environment.

We start by defining our model and identifying if our GPU is correctly selected. We expect the output of `device` to show a cuda device:

In [12]:
from torch import cuda

# Hugging Face Hub identifier of the Llama 2 chat variant used in this notebook
model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Prefer the currently selected CUDA device; fall back to the CPU when no GPU is visible
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

print(device)

## **Optimization & Quantization**

In order to load our 13 billion parameter model, we will need to perform some optimization tricks. Since we have limited VRAM and not an A100 GPU, we will need to "condense" the model a bit so that we can run it.

There are a number of tricks that we can use but the main principle is going to be 4-bit quantization.

This process reduces the precision of the model weights from their original 16- or 32-bit floating-point representation to only 4 bits, which reduces the GPU memory that we will need. It is a recent technique, and quite an elegant one at that, for efficient LLM loading and usage. You can find more about that method [here](https://arxiv.org/pdf/2305.14314.pdf) in the QLoRA paper and on the amazing HuggingFace blog [here](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

In [13]:
from torch import bfloat16
import transformers

# Quantization settings that let the large model load with less GPU memory.
# Requires the `bitsandbytes` library to be installed.

bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=bfloat16,  # dtype used for the actual computation
    bnb_4bit_use_double_quant=True,  # quantize a second time after the first pass
    bnb_4bit_quant_type='nf4',  # "normalized float 4" quantization type
    load_in_4bit=True,  # load the weights in 4-bit precision
)

These four parameters that we just set are incredibly important and bring many LLM applications to consumers:
* `load_in_4bit`
  * Allows us to load the model in 4-bit precision compared to the original 32-bit precision
  * This gives us an incredible speed-up and reduces memory usage!
* `bnb_4bit_quant_type`
  * This is the type of 4-bit precision. The paper recommends normalized float 4-bit, so that is what we are going to use!
* `bnb_4bit_use_double_quant`
  * This is a neat trick as it performs a second quantization after the first, which further reduces the necessary bits
* `bnb_4bit_compute_dtype`
  * The compute type used during computation, which further speeds up the model.



Using this configuration, we can start loading in the model as well as the tokenizer:

In [14]:
# Llama 2 tokenizer, loaded from the same Hub repository as the model
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Llama 2 model, quantized on load with the 4-bit configuration defined above.
# `trust_remote_code` is deliberately not enabled: the printed architecture is
# the library's own `LlamaForCausalLM`, so no repository-supplied Python code
# needs to be executed to build the model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',  # let the library decide where to place the weights
)
# Switch to inference mode; the returned module is displayed as the cell output
model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm

Using the model and tokenizer, we will generate a HuggingFace transformers pipeline that allows us to easily generate new text:

In [15]:
# Text-generation pipeline wrapping the quantized model and its tokenizer
generator = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    temperature=0.1,
    repetition_penalty=1.1,
)

In [16]:
# Quick smoke test of the generator with a free-form question.
# The question lives in its own variable so that re-running this cell cannot
# overwrite the `prompt` template that is assembled later in the notebook and
# passed to BERTopic's TextGeneration.
test_prompt = "Do you listen to the podcast impact theory?"
res = generator(test_prompt)
print(res[0]["generated_text"])

## **Prompt Engineering**


Although we can directly prompt the model, there is actually a template that we need to follow. The template looks as follows:

```python
"""
<s>[INST] <<SYS>>

{{ System Prompt }}

<</SYS>>

{{ User Prompt }} [/INST]

{{ Model Answer }}
"""
```

This template consists of two main components, namely the `{{ System Prompt }}` and the `{{ User Prompt }}`:
* The `{{ System Prompt }}` helps us guide the model during a conversation. For example, we can say that it is a helpful assistant that is specialized in labeling topics.
* The  `{{ User Prompt }}` is where we ask it a question.

You might have noticed the `[INST]` tags, these are used to identify the beginning and end of a prompt. We can use these to model the conversation history as we will see more in-depth later on.

Next, let's see how we can use this template to optimize Llama 2 for topic modeling.

### **Prompt Template**

We are going to keep our `system prompt` simple and to the point:

In [17]:
# Preamble shared by all conversations: opens the instruction block and
# tells the model which role it plays
system_prompt = (
    "\n"
    "<s>[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant for labeling topics.\n"
    "<</SYS>>\n"
)

We will tell the model that it is simply a helpful assistant for labeling topics since that is our main goal.

In contrast, our `user prompt` is going to be a bit more involved. It will consist of two components, an **example** and the **main prompt**.

Let's start with the **example**. Most LLMs do a much better job of generating accurate responses if you give them an example to work with. We will show it an accurate example of the kind of output we are expecting.

In [18]:
# One worked example (few-shot) showing the model the exact output we expect:
# the documents and keywords of a topic, followed by the label only.
# The second document says "worst food" (the original text had the typo
# "word food", which garbled the sentence the example is built around).
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

[/INST] Environmental impacts of eating meat
"""

This example, based on a number of keywords and documents primarily about the impact of
meat, helps the model understand the kind of output it should give. We show the model that we are expecting only the label, which is easier for us to extract.

Next, we will create a template that we can use within BERTopic:

In [19]:
# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a very short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""

There are two BERTopic-specific tags that are of interest, namely `[DOCUMENTS]` and `[KEYWORDS]`:

* `[DOCUMENTS]` contain the top 5 most relevant documents to the topic
* `[KEYWORDS]` contain the top 10 most relevant keywords to the topic as generated through c-TF-IDF

This template will be filled in for each topic accordingly. And finally, we can combine this into our final prompt:

In [20]:
# Final template: system preamble, then the worked example, then the per-topic request
prompt = "".join([system_prompt, example_prompt, main_prompt])

# 🗨️ **BERTopic**

Before we can start with topic modeling, we will first need to perform two steps:
* Pre-calculating Embeddings
* Defining Sub-models

## **Preparing Embeddings**

By pre-calculating the embeddings for each document, we can speed-up additional exploration steps and use the embeddings to quickly iterate over BERTopic's hyperparameters if needed.

🔥 **TIP**: You can find a great overview of good embeddings for clustering on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

In [73]:
from sentence_transformers import SentenceTransformer

# Embed every summary once up front so the vectors can be reused in later steps
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(
    contents,
    show_progress_bar=True,
)

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

## **Sub-models**

Next, we will define all sub-models in BERTopic and do some small tweaks to the number of clusters to be created, setting random states, etc.

In [22]:
from umap import UMAP
from hdbscan import HDBSCAN

# NOTE: both names are assigned again in the next cell (with a smaller
# HDBSCAN cluster size), so these values only apply if that cell is skipped.
# Dimensionality reduction to 5 components with a fixed seed
umap_model = UMAP(
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
# Density-based clustering on the reduced vectors
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    cluster_selection_method='eom',
    metric='euclidean',
    prediction_data=True,
)

In [74]:
# Dimensionality reduction to 5 components; the seed is fixed for repeatable runs
umap_model = UMAP(
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
# Clustering with small clusters allowed (at least 5 documents per cluster)
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    min_samples=5,
    cluster_selection_method='eom',
    metric='euclidean',
    prediction_data=True,
)

As a small bonus, we are going to reduce the embeddings we created before to 2-dimensions so that we can use them for visualization purposes when we have created our topics.

In [75]:
# 2-D projection of the embeddings, used only for plotting the documents later
reduced_embeddings = UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

### **Representation Models**

One of the ways we are going to represent the topics is with Llama 2 which should give us a nice label. However, we might want to have additional representations to view a topic from multiple angles.

Here, we will be using c-TF-IDF as our main representation and [KeyBERT](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#keybertinspired), [MMR](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#maximalmarginalrelevance), and [Llama 2](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html) as our additional representations.

In [76]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration, PartOfSpeech

# Label generation with Llama 2, driven by the prompt template built earlier
llama2 = TextGeneration(generator, prompt=prompt)

# KeyBERT-inspired keyword representation
keybert = KeyBERTInspired()

# Extra aspects: part-of-speech keywords, and a KeyBERT -> MMR chain
aspect_model1 = PartOfSpeech("en_core_web_sm")
aspect_model2 = [
    KeyBERTInspired(top_n_words=20),
    MaximalMarginalRelevance(diversity=0.5),
]

# Every additional representation, keyed by the column name it gets in the topic info
representation_model = dict(
    KeyBERT=keybert,
    Aspect1=aspect_model1,
    Aspect2=aspect_model2,
    Llama2=llama2,
)

# 🔥 **Training**

Now that we have our models prepared, we can start training our topic model! We supply BERTopic with the sub-models of interest, run `.fit_transform`, and see what kind of topics we get.

In [77]:
from bertopic import BERTopic

# Assemble BERTopic from the sub-models prepared above
topic_model = BERTopic(
    # Hyperparameters
    top_n_words=20,
    verbose=True,
    # Sub-models
    representation_model=representation_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=embedding_model,
)

# Fit on the summaries, passing the pre-computed embeddings alongside them
topics, probs = topic_model.fit_transform(contents, embeddings=embeddings)


2023-12-16 11:02:08,142 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-16 11:02:11,015 - BERTopic - Dimensionality - Completed ✓
2023-12-16 11:02:11,019 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-16 11:02:11,044 - BERTopic - Cluster - Completed ✓
2023-12-16 11:02:11,051 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 14/14 [04:47<00:00, 20.54s/it]
2023-12-16 11:07:04,410 - BERTopic - Representation - Completed ✓


Now that we are done training our model, let's see what topics were generated:

In [78]:
# Show the topic overview table: one row per topic with its count, name and
# each configured representation (see the cell output)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Aspect1,Aspect2,Llama2,Representative_Docs
0,-1,118,-1_and_the_of_to,"[and, the, of, to, in, he, that, his, with, fo...","[mindset, episode, impact, achieving, insights...","[importance, episode, one, success, individual...","[mindset, episode, impact, insights, interview...","[Personal growth and success strategies, , , ,...","[In this episode of Impact Theory, host Tom Bi..."
1,0,38,0_and_of_the_to,"[and, of, the, to, in, that, he, she, self, ho...","[impact, episode, insights, thoughts, intervie...","[self, episode, importance, life, impact, powe...","[impact, episode, insights, interviews, theory...","[Personal Growth and Self-Care, , , , , , , , , ]","[In this episode of Impact Theory, host Tom Bi..."
2,1,33,1_and_his_the_of,"[and, his, the, of, to, he, in, importance, pe...","[episode, overcoming, achieving, success, achi...","[importance, personal, self, life, journey, ep...","[episode, achieving, fitness, identity, relati...","[Personal Development, , , , , , , , , ]","[In this episode, the guest is David Goggins, ..."
3,2,31,2_the_health_and_of,"[the, health, and, of, sleep, in, on, for, to,...","[dietary, nutrition, metabolism, diet, alzheim...","[health, sleep, diet, fasting, body, impact, a...","[dietary, metabolism, alzheimer, fasting, dr, ...","[Health and Aging, , , , , , , , , ]","[In this episode, Dr. Tim Spector challenges t..."
4,3,31,3_and_to_the_his,"[and, to, the, his, in, her, of, he, she, with...","[mindset, perseverance, impact, resilience, ep...","[success, importance, journey, mindset, impact...","[mindset, perseverance, impact, resilience, ep...","[Success Mindset, , , , , , , , , ]","[In this episode of Impact Theory, the guest i..."
5,4,24,4_to_and_the_of,"[to, and, the, of, he, in, his, their, that, i...","[impact, mindset, insights, episode, entrepren...","[impact, importance, episode, success, success...","[impact, mindset, insights, episode, entrepren...","[Impact Investing and Social Responsibility, ,...","[In this episode of Impact Theory, host Tom Bi..."
6,5,23,5_and_to_the_bilyeu,"[and, to, the, bilyeu, of, he, that, in, for, ...","[mindset, episode, achieving, insights, succes...","[importance, personal, mindset, energy, self, ...","[mindset, episode, insights, success, focus, g...","[Mindset and Personal Growth, , , , , , , , , ]","[In this episode, the guest, Tom Bilyeu, share..."
7,6,19,6_the_of_our_and,"[the, of, our, and, in, that, to, dr, he, cons...","[neuroscientist, consciousness, conscious, epi...","[consciousness, brain, meditation, emotions, r...","[neuroscientist, consciousness, episode, brain...","[Meditation and Consciousness, , , , , , , , , ]","[In this episode of Impact Theory, host Tom Bi..."
8,7,16,7_ai_the_and_of,"[ai, the, and, of, potential, in, to, for, har...","[ai, metaverse, episode, impact, societal, dis...","[potential, need, technology, episode, human, ...","[ai, metaverse, episode, disrupt, implications...","[Future of Work and AI, , , , , , , , , ]","[In this episode, Tom Bilyeu discusses the imp..."
9,8,13,8_the_economic_of_and,"[the, economic, of, and, financial, current, i...","[hyperinflation, bitcoin, inflation, economy, ...","[economic, financial, current, inflation, pote...","[hyperinflation, liquidity, episode, recession...","[Economic Outlook and Financial Strategy, , , ...","[In this episode, Michael Saylor discusses his..."


In [79]:
# For each topic, take the first Llama 2 entry and keep only its first line as the label
llama2_labels = list(map(
    lambda representation: representation[0][0].split("\n")[0],
    topic_model.get_topics(full=True)["Llama2"].values(),
))
# Register those labels as the custom topic labels
topic_model.set_topic_labels(llama2_labels)

# 📊 **Visualize**
We can go through each topic manually, which would take a lot of work, or we can visualize them all in a single interactive graph.
BERTopic has a bunch of [visualization functions](https://medium.com/r/?url=https%3A%2F%2Fmaartengr.github.io%2FBERTopic%2Fgetting_started%2Fvisualization%2Fvisualize_documents.html) that we can use. For now, we are sticking with visualizing the documents.

In [80]:
# Interactive scatter plot of the documents in the 2-D projection, using the custom labels
topic_model.visualize_documents(
    contents,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_annotations=True,
    hide_document_hover=False,
)

## Label each video with topic

In [81]:
# Map each topic id (as a string) to its custom label.
# The labels are paired with the sorted topic ids that actually occur in the
# model rather than with `position - 1`, so the mapping stays correct even
# when the clustering produced no outlier topic (-1).
# NOTE(review): assumes `custom_labels_` is ordered by ascending topic id,
# which is how BERTopic stores a label list - confirm for the installed version.
labels = topic_model.custom_labels_
label_dict = {
    str(topic_id): label
    for topic_id, label in zip(sorted(set(topic_model.topics_)), labels)
}
# Outlier documents get a readable catch-all label
label_dict["-1"] = "Other"
print(label_dict)

In [87]:
import itertools
import pandas as pd

# Define colors for the visualization to iterate over
colors = itertools.cycle(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'])
color_key = {str(topic): next(colors) for topic in set(topic_model.topics_) if topic != -1}

# Build the per-document frame on the full, unfiltered document set so that
# every column stays aligned row-for-row with `contents` and `video_ids`.
df = pd.DataFrame({"x": reduced_embeddings[:, 0], "y": reduced_embeddings[:, 1], "Topic": [str(t) for t in topic_model.topics_]})
df["Length"] = [len(doc) for doc in contents]
# Outliers (topic "-1") are kept; `label_dict` supplies their label
df['topic_name'] = df['Topic'].astype(str).map(label_dict)
df["video_id"] = list(video_ids)

# Drop points outside the (-10, 10) window on either axis. This happens only
# after all per-document columns are attached, so assigning the full-length
# id list can no longer fail with a length mismatch when rows are removed.
# `.copy()` makes the result an independent frame, so the column write below
# is not applied to a slice of the unfiltered frame.
df = df.loc[(df.y > -10) & (df.y < 10) & (df.x < 10) & (df.x > -10), :].copy()
df["Topic"] = df["Topic"].astype("category")



In [88]:
# Display the topic label assigned to each video id
df[["video_id", "topic_name"]]

Unnamed: 0,video_id,topic_name
0,nXJBccSwtB8,Future of Work and AI
1,6KJhM7Pg5EA,Future of Work and AI
2,IQefdkl8PfY,Meditation and Consciousness
3,sl3XhHs6ggs,Future of Work and AI
4,60U-wLfB8iU,Future of Work and AI
...,...,...
379,LCHPSo79rB4,Other
380,Kd06uvinqLI,Impact Investing and Social Responsibility
381,PYzGv6Tfu_0,Impact Investing and Social Responsibility
382,ULPE3_nPNL0,Success Mindset


In [91]:
import json
# video_id -> topic label lookup (a later duplicate id overwrites an earlier one)
json_dict = dict(zip(df["video_id"], df["topic_name"]))

# Write the lookup to disk as indented JSON
with open('output.json', 'w') as file:
    file.write(json.dumps(json_dict, indent=4))