In [1]:
import os

import hdbscan
import requests

import numpy as np
import pandas as pd
import plotly.express as px

from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from sklearn.manifold import TSNE
from dotenv import load_dotenv

load_dotenv()

True

## Get 200 news articles from Hacker News 

In [2]:
response = requests.get('https://hacker-news.firebaseio.com/v0/topstories.json')
article_ids = response.json()

recent_articles = []
for id in article_ids:
    article_response = requests.get(f'https://hacker-news.firebaseio.com/v0/item/{id}.json')
    recent_articles.append(article_response.json())

docs = [article['title'] for article in recent_articles if len(article["title"]) > 20]

In [3]:
len(docs)

462

## Generate embeddings from articles

In [4]:
embeddings = OpenAIEmbeddings(chunk_size=1000).embed_documents(docs)

## Cluster documents, plot results, and store them in a dataframe

In [5]:
hdb = hdbscan.HDBSCAN(gen_min_span_tree=True, min_samples=3, min_cluster_size=3).fit(embeddings)

In [6]:
tsne = TSNE(n_components=2, random_state=0)

df_tsne = (
    pd.DataFrame(tsne.fit_transform(np.array(embeddings)), columns=['x', 'y'])
    .assign(cluster=lambda df: hdb.labels_.astype(str))
    .query('cluster != "-1"')
    .sort_values(by='cluster')
)

fig = px.scatter(df_tsne, x='x', y='y', color='cluster')
fig.show()

In [7]:
df = pd.DataFrame({
    "title": docs,
    "cluster": hdb.labels_,
})
df = df.query("cluster != -1")

## Create cluster topics from documents in each cluster

In [8]:
def get_prompt():
    system_template = "You're an expert tech journalist. You're helping me write short (4 or words max) but compelling topic title for groups of news articles."
    human_template = "Using the following articles, write a topic title that summarizes them.\n\nARTICLES:{articles}\n\nTOPIC TITLE:"

    return ChatPromptTemplate(
        messages=[
            SystemMessagePromptTemplate.from_template(system_template),
            HumanMessagePromptTemplate.from_template(human_template),
        ],
        input_variables=["articles"],
    )


articles_str = "\n\n".join(docs) 

prompt = get_prompt()

for c in df.cluster.unique():
    chain = LLMChain(
        llm=ChatOpenAI(temperature=0, model_name="gpt-4"), prompt=prompt, verbose=False
    )
    articles_str = "\n".join(
        [
            f"{article['title']}\n"
            for article in df.query(f"cluster == {c}").to_dict(orient="records")
        ]
    )
    result = chain.run(
        {
            "articles": articles_str,
        }
    )
    df.loc[df.cluster == c, "topic_title"] = result

In [11]:
c = 10
with pd.option_context("display.max_colwidth", None):
    print(df.query(f"cluster == {c}").topic_title.values[0])
    display(df.query(f"cluster == {c}").head())

"FTC Battles Tech Giants"


Unnamed: 0,title,cluster,topic_title
97,Court denies FTC last-ditch attempt to stop Microsoft buying Activision Blizzard,10,"""FTC Battles Tech Giants"""
163,"Disney, Netflix, and more are fighting FTC's 'click to cancel' proposal",10,"""FTC Battles Tech Giants"""
171,FTC Loses Appeals Court Bid to Pause Microsoft-Activision Deal,10,"""FTC Battles Tech Giants"""
224,Microsoft wins FTC fight to buy Activision Blizzard,10,"""FTC Battles Tech Giants"""
413,US FTC asks court to temporarily halt Microsoft's acquisition of Activision,10,"""FTC Battles Tech Giants"""
