In [2]:
#https://github.com/Crossme0809/langchain-tutorials/blob/main/Using_OpenAI__LangChain_And_HDBSCAN_Clustering_Documents.ipynb
import os

import hdbscan
import pandas as pd

from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,)
from newsapi.newsapi_client import NewsApiClient

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so that
# os.getenv("NEWSAPI_API_KEY") in the fetch cell can find the key.
# NOTE(review): the OpenAI clients used later presumably read their key from
# the environment as well — confirm it is present in the same .env file.
load_dotenv()

True

In [19]:
# Outlets to pull articles from. They are kept in two groups and fetched with
# one request per group.
sources_1 = [
    "the-washington-post",
    "the-wall-street-journal",
    "business-insider",
]
sources_2 = [
    "associated-press",
    "bloomberg",
]

# Language filter passed to NewsAPI. The outlets above are English-language
# publications, so the filter must be "en"; a "zh" filter is expected to match
# few or no articles from them, leaving nothing to embed and cluster.
ARTICLE_LANGUAGE = "en"

# Fail with a clear message instead of an opaque API error when the key is missing.
newsapi_key = os.getenv("NEWSAPI_API_KEY")
if not newsapi_key:
    raise RuntimeError(
        "NEWSAPI_API_KEY is not set; add it to the environment or the .env file."
    )
newsapi = NewsApiClient(api_key=newsapi_key)

# Collect the "articles" list from each response into one flat list.
recent_articles = []
for source_group in [sources_1, sources_2]:
    response = newsapi.get_everything(
        sources=",".join(source_group),
        language=ARTICLE_LANGUAGE,
        page_size=100,
    )
    recent_articles.extend(response["articles"])

In [13]:
# Show how many articles were fetched and preview only the first few;
# dumping the whole list floods the notebook output.
print(f"{len(recent_articles)} articles fetched")
recent_articles[:3]

[{'source': {'id': 'business-insider', 'name': 'Business Insider'},
  'author': 'Kwan Wei Kevin Tan',
  'title': "Mark Cuban says Elon Musk is a 'genius' for buying X but is also his 'own worst enemy'",
  'description': '"He really has got a unique chance to do some really incredible things with it if he can get out of his own way," Cuban told GQ.',
  'url': 'https://www.businessinsider.com/mark-cuban-says-elon-musk-is-his-own-worst-enemy-2023-9',
  'urlToImage': 'https://i.insider.com/65166ee9c50dce0019136a47?width=1200&format=jpeg',
  'publishedAt': '2023-09-29T07:04:29Z',
  'content': 'Mark Cuban (left) and Elon Musk (right).Christopher Willard/ABC via Getty Images; Alain Jocard/AFP via Getty Images\r\n<ul><li>Mark Cuban thinks Elon Musk is his\xa0"own worst enemy" when it comes to mana… [+2541 chars]'},
 {'source': {'id': 'business-insider', 'name': 'Business Insider'},
  'author': 'kteo@insider.com (Kai Xiang Teo)',
  'title': "Meet the average CEO of the top UK companies: He's wh

In [16]:
# One text per article: title, a blank line, then the description.
# The description was already guarded against None; the title gets the same
# guard so an article with a missing title cannot raise a TypeError here.
docs = [
    (a["title"] or "") + "\n\n" + (a["description"] or "")
    for a in recent_articles
]

# Stop early with a clear message rather than calling the embeddings API
# (and later the clustering step) with nothing to work on.
if not docs:
    raise ValueError("No articles to embed; check the NewsAPI query in the fetch cell.")

# One embedding vector per document, in the same order as `recent_articles`.
embeddings = OpenAIEmbeddings(chunk_size=1000).embed_documents(docs)

KeyboardInterrupt: 

In [None]:
# Cluster the article embeddings; `hdb.labels_` holds one label per article.
hdb = hdbscan.HDBSCAN(min_samples=3, min_cluster_size=3).fit(embeddings)

# One row per article with its cluster label. Built as a single chain so that
# re-running the cell always starts from `recent_articles`, and finished with
# .copy() so the filtered frame is independent: a later cell adds a
# "topic_title" column through df.loc, which would otherwise be an assignment
# into a slice of the unfiltered frame.
df = (
    pd.DataFrame({
        "title": [article["title"] for article in recent_articles],
        "description": [article["description"] for article in recent_articles],
        "cluster": hdb.labels_,
    })
    .query("cluster != -1")  # drop documents that are not in any cluster
    .copy()
)

In [None]:
def get_prompt():
    """Build the chat prompt used to request a topic title for a group of articles.

    The system message (in Chinese) casts the model as an expert journalist who
    writes eye-catching topic titles; the human message (in Chinese) asks for a
    topic title summarising the articles substituted into ``{articles}``.

    Returns:
        A ChatPromptTemplate made of those two messages, with ``articles`` as
        its only input variable.
    """
    system_message = SystemMessagePromptTemplate.from_template(
        "你是一位记者专家。你要帮我为新闻文章写一个引人注目的主题标题。"
    )
    human_message = HumanMessagePromptTemplate.from_template(
        "使用以下文章，写一个能概括这些文章的主题标题。\n\nARTICLES:{articles}\n\nTOPIC TITLE:"
    )
    return ChatPromptTemplate(
        input_variables=["articles"],
        messages=[system_message, human_message],
    )
    
# The chain does not depend on the cluster, so build it once instead of
# re-creating the LLM client and prompt on every iteration.
chain = LLMChain(
    llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0613"),
    prompt=get_prompt(),
    verbose=False,
)

# Ask the model for one topic title per cluster and store it on every row of
# that cluster in a "topic_title" column.
for c in df.cluster.unique():
    # Missing titles/descriptions become empty strings so the literal text
    # "None" is never sent to the model as part of an article.
    cluster_rows = df.loc[df.cluster == c, ["title", "description"]].fillna("")
    articles_str = "\n".join(
        f"{title}\n{description}\n"
        for title, description in cluster_rows.itertuples(index=False, name=None)
    )
    result = chain.run({"articles": articles_str})
    df.loc[df.cluster == c, "topic_title"] = result

In [None]:
# Inspect a single cluster: print its generated topic title, then show the
# first rows of its articles without truncating long text columns.
c = 1
cluster_view = df[df["cluster"] == c]
with pd.option_context("display.max_colwidth", None):
    print(cluster_view["topic_title"].values[0])
    display(cluster_view.drop(columns=["topic_title"]).head())