In [1]:
import json

In [2]:
filename = "data/paper_metadata.json"

# Read the paper metadata (a JSON document) from disk.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(filename, "r", encoding="utf-8") as file:
    paper_metadata = json.load(file)

In [3]:
paper_metadata


[{'url': 'https://paperswithcode.com/paper/transparent-image-layer-diffusion-using',
  'title': 'Transparent Image Layer Diffusion using Latent Transparency',
  'arxiv_link': 'https://arxiv.org/pdf/2402.17113v2.pdf',
  'published': '2024-02-28',
  'authors': 'Lvmin Zhang, Maneesh Agrawala',
  'summary': 'We present LayerDiffusion, an approach enabling large-scale pretrained latent\ndiffusion models to generate transparent images. The method allows generation\nof single transparent images or of multiple transparent layers. The method\nlearns a "latent transparency" that encodes alpha channel transparency into the\nlatent manifold of a pretrained latent diffusion model. It preserves the\nproduction-ready quality of the large diffusion model by regulating the added\ntransparency as a latent offset with minimal changes to the original latent\ndistribution of the pretrained model. In this way, any latent diffusion model\ncan be converted into a transparent image generator by finetuning it w

https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=cIu9afMo1YYg

In [20]:
# Pull the abstract ('summary') and the title out of every paper record;
# both lists keep the order of paper_metadata.
abstracts = [paper['summary'] for paper in paper_metadata]
titles = [paper['title'] for paper in paper_metadata]

In [21]:
abstracts

['We present LayerDiffusion, an approach enabling large-scale pretrained latent\ndiffusion models to generate transparent images. The method allows generation\nof single transparent images or of multiple transparent layers. The method\nlearns a "latent transparency" that encodes alpha channel transparency into the\nlatent manifold of a pretrained latent diffusion model. It preserves the\nproduction-ready quality of the large diffusion model by regulating the added\ntransparency as a latent offset with minimal changes to the original latent\ndistribution of the pretrained model. In this way, any latent diffusion model\ncan be converted into a transparent image generator by finetuning it with the\nadjusted latent space. We train the model with 1M transparent image layer pairs\ncollected using a human-in-the-loop collection scheme. We show that latent\ntransparency can be applied to different open source image generators, or be\nadapted to various conditional control systems to achieve ap

In [22]:
titles

['Transparent Image Layer Diffusion using Latent Transparency',
 'Intent-based Prompt Calibration: Enhancing prompt optimization with synthetic boundary cases',
 'Sora: A Review on Background, Technology, Limitations, and Opportunities of Large Vision Models',
 'YOLOv9: Learning What You Want to Learn Using Programmable Gradient Information',
 'Datasets for Large Language Models: A Comprehensive Survey',
 'Learning to Generate Instruction Tuning Datasets for Zero-Shot Task Adaptation',
 'MobiLlama: Towards Accurate and Lightweight Fully Transparent GPT',
 'Training-Free Long-Context Scaling of Large Language Models',
 'BitNet: Scaling 1-bit Transformers for Large Language Models',
 'The First Place Solution of WSDM Cup 2024: Leveraging Large Language Models for Conversational Multi-Doc QA']

In [23]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Split every abstract into sentences and collect them in one flat list
# (abstract order first, then sentence order within each abstract).
sentences = [
    sentence
    for abstract in abstracts
    for sentence in sent_tokenize(abstract)
]

### Pre-calculate embeddings

In [63]:
from sentence_transformers import SentenceTransformer

# Encode all abstracts once up front so the vectors can be reused later
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
)

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.52it/s]


### Prevent stochastic behavior

In [180]:
from umap import UMAP

# A fixed random_state keeps the dimensionality reduction repeatable between runs
umap_model = UMAP(
    n_neighbors=2,
    n_components=1,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

### Controlling Number of Topics

Use MLflow to track the best input parameters (n_neighbors, n_components, min_df, min_cluster_size, etc.) and the resulting number of topics.

In [181]:
from hdbscan import HDBSCAN

# TODO: should this metric be aligned with the one the UMAP model uses ('cosine')?
hdbscan_model = HDBSCAN(
    min_cluster_size=2,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

### Improving Default Representation

In [170]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words settings: English stop words dropped, unigrams and bigrams kept
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=1,
    ngram_range=(1, 2),
)

### Additional Representations

In [None]:
import os
import dotenv

dotenv.load_dotenv()


In [188]:
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# Build the extra topic representations that are handed to BERTopic as
# `representation_model`; each dictionary key below names one representation.

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech (disabled)
# pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5 labelling prompt.
# NOTE(review): [DOCUMENTS] and [KEYWORDS] look like template placeholders that
# BERTopic substitutes per topic - confirm against the BERTopic OpenAI docs.
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""

# The key is read from the environment; `os` is imported in the dotenv cell
# that loads the .env file, so that cell has to run first.
api_key = os.getenv("OPENAI_API_KEY")
client = openai.OpenAI(api_key=api_key)
openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,  # active: needs OPENAI_API_KEY to be set in the environment
    "MMR": mmr_model,
    # "POS": pos_model
}

### Training

In [189]:
from bertopic import BERTopic

# Assemble the pipeline from the components configured in the earlier cells
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=10,
    verbose=True,
)

# Fit on the abstracts, handing over the embeddings computed earlier
topics, probs = topic_model.fit_transform(abstracts, embeddings=embeddings)

2024-03-05 17:55:58,773 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2024-03-05 17:55:59,695 - BERTopic - Dimensionality - Completed ✓
2024-03-05 17:55:59,697 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-05 17:55:59,724 - BERTopic - Cluster - Completed ✓
2024-03-05 17:55:59,731 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 2/2 [00:01<00:00,  1.11it/s]
2024-03-05 17:56:11,340 - BERTopic - Representation - Completed ✓


In [183]:
topics

[0, 0, 0, 1, 0, 0, 1, 1, 1, 0]

In [190]:
topic_model.get_topics()

{0: [('datasets', 0.047192280763216785),
  ('model', 0.03656682618864347),
  ('task', 0.03094053284629336),
  ('prompt', 0.029016748790358354),
  ('generation', 0.029016748790358354),
  ('latent', 0.029016748790358354),
  ('instruction', 0.029016748790358354),
  ('language', 0.028128327837418053),
  ('models', 0.02618493118479418),
  ('transparent', 0.02579266559142965)],
 1: [('models', 0.054457542880977906),
  ('information', 0.041262874629549424),
  ('large', 0.03649971392021519),
  ('results', 0.0334742279690802),
  ('language', 0.031499612271014554),
  ('language models', 0.02969903184180259),
  ('pgi', 0.02884357679044659),
  ('chunk', 0.02884357679044659),
  ('performance', 0.025789296643468386),
  ('data', 0.025789296643468386)]}

In [191]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,0,6,0_datasets_model_task_prompt,"[datasets, model, task, prompt, generation, la...","[language models, video generation, question a...",[Automated Prompt Engineering],"[datasets, model, task, prompt, generation, la...","[We present LayerDiffusion, an approach enabli..."
1,1,4,1_models_information_large_results,"[models, information, large, results, language...","[large language, memory, memory footprint, dee...",[Efficient Small Language Models],"[models, information, large, results, language...",[The ability of Large Language Models (LLMs) t...


In [111]:
topic_model.get_topic(0, full=True)

{'Main': [('datasets', 0.16955208084578507),
  ('model', 0.13226718741478377),
  ('task', 0.11726174577864121),
  ('language', 0.10174399031906442),
  ('transparent', 0.0996209403156825),
  ('llms', 0.09380939662291297),
  ('models', 0.0912251082638667),
  ('text', 0.09017109174198108),
  ('large', 0.08310533484956989),
  ('dataset', 0.08021949231172919)],
 'KeyBERT': [('language models', 0.5080408),
  ('large language', 0.395446),
  ('learning', 0.32574537),
  ('language', 0.30879468),
  ('generate', 0.30399048),
  ('text', 0.29888886),
  ('models llms', 0.27738488),
  ('pre training', 0.27479458),
  ('trained', 0.27347666),
  ('datasets', 0.27270696)],
 'OpenAI': [('Automatic Prompt Engineering', 1)],
 'MMR': [('datasets', 0.16955208084578507),
  ('model', 0.13226718741478377),
  ('task', 0.11726174577864121),
  ('language', 0.10174399031906442),
  ('transparent', 0.0996209403156825),
  ('llms', 0.09380939662291297),
  ('models', 0.0912251082638667),
  ('text', 0.09017109174198108),


In [202]:
# Use the OpenAI-generated representation of each topic as its custom label
openai_aspects = topic_model.topic_aspects_["OpenAI"]
chatgpt_topic_labels = {
    topic_id: " | ".join(list(zip(*pairs))[0])
    for topic_id, pairs in openai_aspects.items()
}
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

In [197]:
topic_model.topic_aspects_

{'KeyBERT': {0: [('language models', 0.47570348),
   ('video generation', 0.45058262),
   ('question answering', 0.3853972),
   ('natural language', 0.38166407),
   ('prompt engineering', 0.37694418),
   ('large language', 0.35815015),
   ('automatic prompt', 0.35813928),
   ('pretrained latent', 0.3552686),
   ('generate', 0.34009475),
   ('pretrained', 0.3373075)],
  1: [('large language', 0.42704934),
   ('memory', 0.41538286),
   ('memory footprint', 0.41059476),
   ('deep networks', 0.40945095),
   ('language models', 0.3942625),
   ('bitnet', 0.3673072),
   ('resource constrained', 0.3126595),
   ('attention', 0.3055657),
   ('slm', 0.30371237),
   ('slms', 0.29135308)]},
 'OpenAI': {0: [('Automated Prompt Engineering', 1)],
  1: [('Efficient Small Language Models', 1)]},
 'MMR': {0: [('datasets', 0.047192280763216785),
   ('model', 0.03656682618864347),
   ('task', 0.03094053284629336),
   ('prompt', 0.029016748790358354),
   ('generation', 0.029016748790358354),
   ('latent', 0

In [203]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,0,6,0_datasets_model_task_prompt,Automated Prompt Engineering,"[datasets, model, task, prompt, generation, la...","[language models, video generation, question a...",[Automated Prompt Engineering],"[datasets, model, task, prompt, generation, la...","[We present LayerDiffusion, an approach enabli..."
1,1,4,1_models_information_large_results,Efficient Small Language Models,"[models, information, large, results, language...","[large language, memory, memory footprint, dee...",[Efficient Small Language Models],"[models, information, large, results, language...",[The ability of Large Language Models (LLMs) t...


In [209]:
topic_model.get_topic_info()['KeyBERT'][1]

['large language',
 'memory',
 'memory footprint',
 'deep networks',
 'language models',
 'bitnet',
 'resource constrained',
 'attention',
 'slm',
 'slms']

In [242]:
# Build the topic overview once (the original rebuilt it on every loop
# iteration) and collect the KeyBERT keyword list of every row, in row order.
keybert_keywords = topic_model.get_topic_info()['KeyBERT']
n_topic_list = len(keybert_keywords)

# One entry per row of the overview table; each entry is that row's keyword list.
topic_lists = keybert_keywords.tolist()


In [243]:
topic_lists

[['language models',
  'video generation',
  'question answering',
  'natural language',
  'prompt engineering',
  'large language',
  'automatic prompt',
  'pretrained latent',
  'generate',
  'pretrained'],
 ['large language',
  'memory',
  'memory footprint',
  'deep networks',
  'language models',
  'bitnet',
  'resource constrained',
  'attention',
  'slm',
  'slms']]

In [237]:
# Print the KeyBERT keywords of the first row of the topic overview, title-cased
keybert_column = topic_model.get_topic_info()['KeyBERT']
n_topic_list = len(keybert_column)
topic_list = keybert_column[0]

for topic in topic_list:
    print(topic.title())


Language Models
Video Generation
Question Answering
Natural Language
Prompt Engineering
Large Language
Automatic Prompt
Pretrained Latent
Generate
Pretrained


In [221]:
# Attach the assigned topic id and that topic's KeyBERT keywords to every paper.
#
# The keywords are looked up through the "Topic" column instead of the row
# label: row labels of the overview table are not guaranteed to equal topic ids
# (e.g. when an outlier topic -1 is present), which would attach the wrong
# keywords or raise a KeyError. The table is also built once, not per paper.
topic_info = topic_model.get_topic_info()
keybert_by_topic = dict(zip(topic_info['Topic'], topic_info['KeyBERT']))

# One topic assignment per paper is expected; fail loudly if they drift apart.
assert len(topics) == len(paper_metadata), "topics and paper_metadata are misaligned"

for paper, topic_id in zip(paper_metadata, topics):
    paper['topic_id'] = topic_id
    # Copy so that papers sharing a topic do not share one mutable list.
    paper['topics'] = list(keybert_by_topic[topic_id])

    

In [222]:
paper_metadata

[{'url': 'https://paperswithcode.com/paper/transparent-image-layer-diffusion-using',
  'title': 'Transparent Image Layer Diffusion using Latent Transparency',
  'arxiv_link': 'https://arxiv.org/pdf/2402.17113v2.pdf',
  'published': '2024-02-28',
  'authors': 'Lvmin Zhang, Maneesh Agrawala',
  'summary': 'We present LayerDiffusion, an approach enabling large-scale pretrained latent\ndiffusion models to generate transparent images. The method allows generation\nof single transparent images or of multiple transparent layers. The method\nlearns a "latent transparency" that encodes alpha channel transparency into the\nlatent manifold of a pretrained latent diffusion model. It preserves the\nproduction-ready quality of the large diffusion model by regulating the added\ntransparency as a latent offset with minimal changes to the original latent\ndistribution of the pretrained model. In this way, any latent diffusion model\ncan be converted into a transparent image generator by finetuning it w

In [122]:
# `topic_distr` contains the distribution of topics in each document
topic_distr, _ = topic_model.approximate_distribution(abstracts)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 61.71it/s]


In [123]:
topic_distr

array([[0.67922139, 0.32077861],
       [0.57147287, 0.42852713],
       [0.64964919, 0.35035081],
       [0.29195086, 0.70804914],
       [0.77898086, 0.22101914],
       [0.63608148, 0.36391852],
       [0.49602461, 0.50397539],
       [0.41885281, 0.58114719],
       [0.3826809 , 0.6173191 ],
       [0.53250429, 0.46749571]])

In [125]:
abstract_id = 5
print(abstracts[abstract_id])

We introduce Bonito, an open-source model for conditional task generation:
the task of converting unannotated text into task-specific training datasets
for instruction tuning. Our goal is to enable zero-shot task adaptation of
large language models on users' specialized, private data. We train Bonito on a
new large-scale dataset with 1.65M examples created by remixing existing
instruction tuning datasets into meta-templates. The meta-templates for a
dataset produce training examples where the input is the unannotated text and
the task attribute and the output consists of the instruction and the response.
We use Bonito to generate synthetic tasks for seven datasets from specialized
domains across three task types -- yes-no question answering, extractive
question answering, and natural language inference -- and adapt language
models. We show that Bonito significantly improves the average performance of
pretrained and instruction tuned models over the de facto self supervised
baseline. Fo

In [126]:
# Visualize the topic-document distribution for a single document
topic_model.visualize_distribution(topic_distr[abstract_id])

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [127]:
# Visualize the topic-document distribution for a single document
topic_model.visualize_distribution(topic_distr[abstract_id], custom_labels=True)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [131]:
# import matplotlib
# # Calculate the topic distributions on a token-level
# topic_distr, topic_token_distr = topic_model.approximate_distribution(abstracts[abstract_id], calculate_tokens=True)

# # Visualize the token-level distributions
# df = topic_model.visualize_approximate_distribution(abstracts[abstract_id], topic_token_distr[0])
# df

In [133]:
topic_distr, _ = topic_model.approximate_distribution(abstracts, use_embedding_model=True)

  0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 60/60 [00:06<00:00,  9.78it/s]
100%|██████████| 1/1 [00:06<00:00,  6.27s/it]


In [134]:
topic_distr

array([[0.6648464 , 0.3351536 ],
       [0.661491  , 0.33850902],
       [0.63655037, 0.36344963],
       [0.48688585, 0.5131142 ],
       [0.5654748 , 0.43452513],
       [0.58394575, 0.41605422],
       [0.43747273, 0.56252724],
       [0.46877176, 0.53122824],
       [0.3860429 , 0.61395705],
       [0.63068044, 0.36931956]], dtype=float32)

In [145]:
topic_model.visualize_barchart()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [148]:
# topic_model.visualize_heatmap()

### Outlier Reduction

In [None]:
# # Reduce outliers
# new_topics = topic_model.reduce_outliers(abstracts, topics)

# # Reduce outliers with pre-calculate embeddings instead
# new_topics = topic_model.reduce_outliers(abstracts, topics, strategy="embeddings", embeddings=embeddings)

### Update Topics with Outlier Reduction 

In [135]:
# topic_model.update_topics(docs, topics=new_topics)

### Visualize Topics

In [139]:
# topic_model.visualize_topics(custom_labels=True)

In [141]:
# topic_model.visualize_hierarchy(custom_labels=True)

### Visualize Documents

In [142]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively.
# random_state is fixed (same seed as the UMAP model used for clustering) so the
# 2-D layout of the documents is reproducible across runs.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

In [143]:
# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
# NOTE: You can hide the hover with `hide_document_hover=True` which is especially helpful if you have a large dataset
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# # We can also hide the annotation to have a more clear overview of the topics
# topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True, hide_annotations=True)

### Serialization

In [144]:
# embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# topic_model.save("my_model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

In [None]:
# from sentence_transformers import SentenceTransformer

# # Define embedding model
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# # Load model and add embedding model
# loaded_model = BERTopic.load("path/to/my/model_dir", embedding_model=embedding_model)

In [None]:
# model.save("new_model")

# model = BERTopic.load("new_model")

### Inference

To speed up the inference, we can leverage a "best practice" that we used before, namely serialization. When you save a model as safetensors and then load it in, we are removing the dimensionality reduction and clustering steps from the pipeline.

Instead, the assignment of topics is done through cosine similarity of document embeddings and topic embeddings. This speeds up inference significantly.