In [None]:
!rm -rf .venv
!python3.10 -m venv .venv
!.venv/bin/python3.10 -m pip install --upgrade --quiet pip
!.venv/bin/python3.10 -m pip install --r requirements.txt

Blog post: https://dylancastillo.co/clustering-documents-with-openai-langchain-hdbscan/

In [3]:
pip install -r requirements.txt

Collecting hdbscan (from -r requirements.txt (line 1))
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting langchain==0.0.194 (from -r requirements.txt (line 2))
  Downloading langchain-0.0.194-py3-none-any.whl.metadata (13 kB)
Collecting openai==0.27.8 (from -r requirements.txt (line 3))
  Downloading openai-0.27.8-py3-none-any.whl.metadata (13 kB)
Collecting pandas==2.0.2 (from -r requirements.txt (line 4))
  Downloading pandas-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting python-dotenv==1.0.0 (from -r requirements.txt (line 5))
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting tiktoken==0.4.0 (from -r req

In [5]:
import os

import hdbscan
import pandas as pd

from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from newsapi import NewsApiClient

from dotenv import load_dotenv

# Load variables from a .env file into the process environment; the NewsAPI key
# is read later via os.getenv("NEWSAPI_API_KEY").
load_dotenv()

True

## Get up to 200 news articles from NewsAPI

In [6]:
# NewsAPI client authenticated with the key taken from the environment.
newsapi = NewsApiClient(api_key=os.getenv("NEWSAPI_API_KEY"))

# Sources are split into two groups; each group is fetched with its own request.
sources_1 = ["the-washington-post", "the-wall-street-journal", "business-insider"]
sources_2 = ["associated-press", "bloomberg"]

recent_articles = []

for source_group in (sources_1, sources_2):
    # One request per group, asking for a page of 100 English-language articles.
    response = newsapi.get_everything(
        sources=",".join(source_group),
        language="en",
        page_size=100,
    )
    recent_articles += response["articles"]

## Generate embeddings from articles

In [7]:
# Build one text document per article (title, blank line, description).
# An article's "title" or "description" can be None (JSON null); fall back to an
# empty string so the concatenation does not raise TypeError. Exactly one
# document is produced per article, so `embeddings` stays index-aligned with
# `recent_articles`.
docs = [
    (a["title"] or "") + "\n\n" + (a["description"] or "")
    for a in recent_articles
]

# Embed all documents with OpenAI embeddings.
embeddings = OpenAIEmbeddings(chunk_size=200).embed_documents(docs)

## Cluster documents and store the results in a DataFrame

In [9]:
# Cluster the embedding vectors with HDBSCAN.
hdb = hdbscan.HDBSCAN(min_samples=3, min_cluster_size=3).fit(embeddings)

# One row per article, labelled with the cluster it was assigned to.
df = pd.DataFrame({
    "title": [article["title"] for article in recent_articles],
    "description": [article["description"] for article in recent_articles],
    "cluster": hdb.labels_,
})
# Remove documents that are not in a cluster (label -1). Take an explicit copy
# so that assigning a new column to the filtered frame later on does not
# trigger pandas' SettingWithCopyWarning.
df = df.query("cluster != -1").copy()

## Create cluster topics from documents in each cluster

In [10]:
def get_prompt():
    """Build the chat prompt used to generate a topic title for a set of articles.

    Returns:
        A ``ChatPromptTemplate`` made of a system message and a human message,
        with ``articles`` as its only input variable.
    """
    system_message = SystemMessagePromptTemplate.from_template(
        "You're an expert journalist. You're helping me write a compelling topic title for news articles."
    )
    human_message = HumanMessagePromptTemplate.from_template(
        "Using the following articles, write a topic title that summarizes them.\n\nARTICLES:{articles}\n\nTOPIC TITLE:"
    )

    return ChatPromptTemplate(
        messages=[system_message, human_message],
        input_variables=["articles"],
    )

# The LLM and the prompt are identical for every cluster, so build the chain
# once instead of re-creating it on every loop iteration.
chain = LLMChain(
    llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"),
    prompt=get_prompt(),
    verbose=False,
)

for c in df.cluster.unique():
    # Select the cluster's rows once; the mask is reused for the write-back.
    in_cluster = df.cluster == c
    # Concatenate the title and description of every article in the cluster.
    articles_str = "\n".join(
        f"{article['title']}\n{article['description']}\n"
        for article in df.loc[in_cluster].to_dict(orient="records")
    )
    result = chain.run({"articles": articles_str})
    # Every row of the cluster receives the same generated topic title.
    df.loc[in_cluster, "topic_title"] = result

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for gpt-3.5-turbo in organization org-lGgYtSylXadLLiSEUsojr7MZ on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for gpt-3.5-turbo in organization org-lGgYtSylXadLLiSEUsojr7MZ on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for gpt-3.5-turbo in organization org-lGgYtSylXadLLiSEUsojr7MZ on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-3.5-turbo in organization org-lGgYtSylXadLLiSEUsojr7MZ on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/

In [12]:
# Inspect a single cluster: print its topic title, then show its first rows
# with untruncated column text.
c = 1
cluster_rows = df.loc[df["cluster"] == c]
with pd.option_context("display.max_colwidth", None):
    print(cluster_rows["topic_title"].values[0])
    display(cluster_rows.head())

"Boeing Faces Scrutiny and Increased Oversight After Series of Safety Issues"


Unnamed: 0,title,description,cluster,topic_title
1,'We have planes all over the world that have issues that nobody has found' — union leader accused Boeing supplier of lack of quality control,Spirit AeroSystems is a Kansas-based company that builds the fuselages and other parts of Boeing planes. The company is under scrutiny after the Alaska Airlines blowout.,1,"""Boeing Faces Scrutiny and Increased Oversight After Series of Safety Issues"""
63,Boeing is adding more 737 airplane inspections as company says it's 'clear that we are not where we need to be',"Boeing Commercial Airplanes CEO Stan Deal said the firm is ""taking a hard look"" at its quality practices after a 737 Max 9 lost a door plug midflight.",1,"""Boeing Faces Scrutiny and Increased Oversight After Series of Safety Issues"""
85,Boeing pledges more scrutiny as FAA plans audit of Max 9 planes,The fallout from a Boeing plane’s midair accident continues as the FAA examines whether a third party should take over some Boeing procedures.,1,"""Boeing Faces Scrutiny and Increased Oversight After Series of Safety Issues"""
138,"FAA to intensify oversight of Boeing, audit 737 Max 9 production",The Federal Aviation Administration says it will increase oversight of Boeing and audit the production of the 737 Max 9 jetliner after a panel blew off an Alaska Airlines plane in midflight last week. It was the latest in a string of mishaps at the troubled a…,1,"""Boeing Faces Scrutiny and Increased Oversight After Series of Safety Issues"""
151,Boeing design under FAA investigation after Alaska Airlines blowout,Federal officials are investigating Boeing's oversight of production of a panel that blew off a jetliner in midflight last week. The Federal Aviation Administration said Thursday that the investigation is focusing on door plugs like the one that came off an A…,1,"""Boeing Faces Scrutiny and Increased Oversight After Series of Safety Issues"""
