In [None]:
import os

import hdbscan
import pandas as pd

from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from newsapi import NewsApiClient

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so that
# later os.getenv(...) lookups (e.g. NEWSAPI_API_KEY) can find them.
load_dotenv()

## Get 200 news articles from relevant sources

In [None]:
# Source identifiers are kept in two separate groups; each group is sent to
# NewsAPI as its own request further down.
sources_1 = ["the-washington-post", "the-wall-street-journal", "business-insider"]
sources_2 = ["associated-press", "bloomberg"]

# The API key is read from the environment rather than hard-coded.
newsapi = NewsApiClient(api_key=os.getenv("NEWSAPI_API_KEY"))

In [None]:
# Issue one request per source group (up to 100 articles each) and collect
# all returned articles in a single flat list.
recent_articles = []
for source_group in (sources_1, sources_2):
    response = newsapi.get_everything(
        sources=",".join(source_group),
        language="en",
        page_size=100,
    )
    recent_articles.extend(response["articles"])

## Generate embeddings from news articles

In [None]:
# One text per article: title, blank line, description.
# A missing or None title/description is replaced by an empty string instead of
# raising TypeError on concatenation. Articles are never skipped, so docs keeps
# the same length and order as recent_articles and the cluster labels computed
# from the embeddings still line up with the articles.
docs = [
    (a.get("title") or "") + "\n\n" + (a.get("description") or "")
    for a in recent_articles
]

In [None]:
# Turn every document into an embedding vector; the result has one entry per
# element of docs.
embedder = OpenAIEmbeddings(chunk_size=1000)
embeddings = embedder.embed_documents(docs)

## Cluster documents and store the results in a dataframe

In [None]:
# Configure the clusterer, then fit it on the article embeddings. The fitted
# object exposes the per-document cluster labels as hdb.labels_.
hdb = hdbscan.HDBSCAN(
    gen_min_span_tree=True,
    min_samples=3,
    min_cluster_size=3,
)
hdb.fit(embeddings)

In [None]:
# Pair every article with the cluster label assigned to its embedding, then
# keep only the articles that belong to a cluster (label -1 means "no cluster").
# The frame is built in a single chain so re-running the cell always starts
# from the full article list, and the trailing .copy() makes df an independent
# frame so that assigning new columns to it later does not raise pandas'
# SettingWithCopyWarning.
df = (
    pd.DataFrame({
        "title": [article["title"] for article in recent_articles],
        "description": [article["description"] for article in recent_articles],
        "cluster": hdb.labels_,
    })
    .query("cluster != -1")  # Remove documents that are not in a cluster
    .copy()
)

## Create cluster topics from documents in each cluster

In [None]:



def get_prompt():
    """Build the chat prompt used to request a topic title for a set of articles.

    Returns:
        A ``ChatPromptTemplate`` consisting of a system message followed by a
        human message. Its single input variable, ``articles``, fills the
        ``{articles}`` placeholder in the human message.
    """
    system_message = SystemMessagePromptTemplate.from_template(
        "You're an expert journalist. You're helping me write a compelling topic title for news articles."
    )
    human_message = HumanMessagePromptTemplate.from_template(
        "Using the following articles, write a topic title that summarizes them.\n\nARTICLES:{articles}\n\nTOPIC TITLE:"
    )

    return ChatPromptTemplate(
        messages=[system_message, human_message],
        input_variables=["articles"],
    )


prompt = get_prompt()

# Build the chain once: nothing inside the loop changes its configuration, so
# re-creating the LLM client and chain for every cluster was wasted work.
chain = LLMChain(
    llm=ChatOpenAI(temperature=0, model_name="gpt-4"), prompt=prompt, verbose=False
)

# Ask the model for one topic title per cluster. Titles are collected in a
# mapping first so that df is not modified while its groups are being iterated.
topic_titles = {}
for c, cluster_df in df.groupby("cluster"):
    # Each article contributes "title\ndescription\n"; articles are separated
    # by an additional newline.
    articles_str = "\n".join(
        [
            f"{article['title']}\n{article['description']}\n"
            for article in cluster_df.to_dict(orient="records")
        ]
    )
    result = chain.run(
        {
            "articles": articles_str,
        }
    )
    topic_titles[c] = result

# Attach the generated title to every row of its cluster.
df["topic_title"] = df["cluster"].map(topic_titles)

In [None]:
# Inspect a single cluster: print its generated topic title and show the first
# few articles assigned to it.
c = 6
cluster_rows = df.query(f"cluster == {c}")
if cluster_rows.empty:
    # Cluster labels come from the clustering run, so a hard-coded label may
    # not exist; report that instead of failing with IndexError on .values[0].
    print(
        f"No articles in cluster {c}; "
        f"available clusters: {sorted(df.cluster.unique().tolist())}"
    )
else:
    with pd.option_context("display.max_colwidth", None):
        print(cluster_rows.topic_title.values[0])
        display(cluster_rows.head())