In [12]:
import os
from dotenv import load_dotenv
load_dotenv()


True

In [2]:
from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

# Data model
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    # NOTE(review): this model is passed to `llm.with_structured_output(...)`
    # below; the class docstring and the field description are presumably
    # forwarded to the LLM as part of the schema, so treat their wording as
    # prompt text — confirm before rephrasing them.

    # Required field (`...` = no default). `Literal` restricts the value to the
    # three datasource names that `choose_route` later branches on.
    datasource: Literal["python_docs", "js_docs", "golang_docs"] = Field(
        ...,
        description="Given a user question choose which datasource would be most relevant for answering their question",
    )

# Chat model constrained to answer with a `RouteQuery` object.
# temperature=0 is used here, presumably to keep routing decisions stable.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm = llm.with_structured_output(RouteQuery)

# System prompt: tells the model to route based on the programming language
# the question refers to.
system = """You are an expert at routing a user question to the appropriate data source.

Based on the programming language the question is referring to, route it to the relevant data source."""

# Two-message chat prompt; `{question}` is filled from the dict passed to `.invoke`.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# Router chain: prompt -> structured LLM. Its result exposes `.datasource`.
router = prompt | structured_llm


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Sample question containing a Python snippet; used to exercise the router.
# The snippet lives inside the string and is never executed by this cell.
question = """Why doesn't the following code work:

from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages(["human", "speak in {language}"])
prompt.invoke("french")
"""

# The prompt template expects a dict with a "question" key.
result = router.invoke({"question": question})

In [4]:
# Show which datasource the router picked for the sample question.
result.datasource

'python_docs'

In [None]:
def choose_route(result):
    """Map a routing decision to the chain that should handle the question.

    Args:
        result: Object with a string ``datasource`` attribute (the router's
            ``RouteQuery`` output).

    Returns:
        A placeholder label of the form ``"chain for <datasource>"``. Any
        datasource that matches neither ``python_docs`` nor ``js_docs`` falls
        through to the Go branch.
    """
    # Normalise once; matching stays a case-insensitive substring test.
    datasource = result.datasource.lower()

    if "python_docs" in datasource:
        ### Logic here
        return "chain for python_docs"
    if "js_docs" in datasource:
        ### Logic here
        return "chain for js_docs"

    ### Logic here
    # Fallback branch: the label now follows the same "chain for ..." convention
    # as the other two branches (it used to return the bare datasource name).
    return "chain for golang_docs"

from langchain_core.runnables import RunnableLambda

# Pipe the router's structured output into `choose_route`, so invoking the
# chain returns the route label instead of the `RouteQuery` object.
full_chain = router | RunnableLambda(choose_route)

In [6]:
# End-to-end check: route the sample question and display the selected label.
full_chain.invoke({"question": question})

'chain for python_docs'

In [7]:
from langchain.utils.math import cosine_similarity
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Two candidate prompts; each expects a `{query}` placeholder to be filled in.
# The trailing backslashes join the physical lines inside the string literal.
physics_template = """You are a very smart physics professor. \
You are great at answering questions about physics in a concise and easy to understand manner. \
When you don't know the answer to a question you admit that you don't know.

Here is a question:
{query}"""

math_template = """You are a very good mathematician. You are great at answering math questions. \
You are so good because you are able to break down hard problems into their component parts, \
answer the component parts, and then put them together to answer the broader question.

Here is a question:
{query}"""

# Embed both templates once up front; `prompt_router` compares every incoming
# query against these vectors. The order of `prompt_templates` must match the
# order of `prompt_embeddings`, because the router indexes one by the other.
embeddings = OpenAIEmbeddings()
prompt_templates = [physics_template, math_template]
prompt_embeddings = embeddings.embed_documents(prompt_templates)

# Route question to prompt 
def prompt_router(input):
    """Pick the prompt template whose embedding is closest to the query.

    Args:
        input: Mapping with a ``"query"`` key holding the user's question.

    Returns:
        A ``PromptTemplate`` built from the best-matching template string.
        Also prints which template was selected.
    """
    question_vector = embeddings.embed_query(input["query"])

    # One row of scores: the query against every pre-embedded template.
    scores = cosine_similarity([question_vector], prompt_embeddings)[0]
    best_template = prompt_templates[scores.argmax()]

    label = "Using MATH" if best_template == math_template else "Using PHYSICS"
    print(label)

    return PromptTemplate.from_template(best_template)


# Semantic-routing chain, applied left to right.
chain = (
    # Wrap the raw input under the "query" key that `prompt_router` reads.
    {"query": RunnablePassthrough()}
    # Choose the physics or math prompt by embedding similarity.
    | RunnableLambda(prompt_router)
    # Answer with the chat model (library defaults, no explicit model/temperature).
    | ChatOpenAI()
    # Reduce the chat message to a plain string.
    | StrOutputParser()
)

print(chain.invoke("What's a black hole"))

Using PHYSICS
A black hole is a region in space where gravity is so strong that nothing, not even light, can escape from it. Black holes are formed when massive stars collapse under their own gravity. They have a singularity at their center, which is a point of infinite density and gravity. The boundary surrounding a black hole is called the event horizon.


## 

## Query structuring

In [7]:
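# YoutubeLoader does not work in this local environment (cause not yet known),
# so a minimal Bilibili loader is defined below as a stand-in.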

import requests
from langchain_core.documents import Document

class BilibiliLoader:
    """Minimal LangChain-style loader for a single Bilibili video, backed by yt-dlp.

    ``load()`` returns a one-element list holding a ``Document`` whose
    ``page_content`` is the video's subtitle text (or a placeholder when none is
    available) and whose ``metadata`` is the video info dict.

    Args:
        url: Video page URL handed to yt-dlp.
        add_video_info: When true, ``load()`` fetches title, uploader, etc. and
            stores them as the document metadata.
    """

    # Language tags tried first, in this order, when choosing a subtitle track.
    _PREFERRED_LANGS = ('zh', 'zh-Hans', 'zh-CN')

    def __init__(self, url: str, add_video_info: bool = True):
        self.url = url
        self.add_video_info = add_video_info
        self.metadata = {}

    def _get_video_info(self):
        """Fetch video metadata with yt-dlp and return the fields of interest.

        Returns:
            Dict with title, description, author, upload_date, duration,
            view_count, thumbnail and webpage_url; a value is ``None`` when
            yt-dlp does not report it.
        """
        import yt_dlp

        ydl_opts = {
            'quiet': True,
            'skip_download': True,
            # A cookie file can be supplied here to get past a login wall:
            # 'cookiefile': 'cookies.txt',
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(self.url, download=False)
            return {
                "title": info.get("title"),
                "description": info.get("description"),
                "author": info.get("uploader"),
                "upload_date": info.get("upload_date"),
                "duration": info.get("duration"),  # seconds
                "view_count": info.get("view_count"),
                "thumbnail": info.get("thumbnail"),
                "webpage_url": info.get("webpage_url"),
            }

    @staticmethod
    def _track_text(track, skip_exts=()):
        """Return the inline text of one subtitle track, or "" if it has none.

        yt-dlp describes a track as a list of alternative formats ordered from
        lowest to highest preference, each carrying ``ext`` plus either ``data``
        (the file contents) or ``url``. Only inline content is used here;
        URL-only entries are not downloaded. The legacy ``text`` key is still
        honoured for backward compatibility.
        """
        for entry in reversed(track or []):
            if entry.get('ext') in skip_exts:
                continue
            content = entry.get('data') or entry.get('text')
            if isinstance(content, str) and content.strip():
                return content.strip()
        return ""

    def _get_subtitles(self):
        """Return subtitle text for the video, or ``None`` when none is available.

        Manually provided subtitles are preferred over automatic captions, and
        the Chinese language tags in ``_PREFERRED_LANGS`` are preferred over any
        other track. A track without usable text is skipped rather than ending
        the search with an empty string.
        """
        import yt_dlp

        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,  # also request automatic captions
            'subtitleslangs': ['zh'],   # Chinese
            'skip_download': True,      # consistent with download=False below
            'quiet': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            result = ydl.extract_info(self.url, download=False) or {}

        for tracks in (result.get('subtitles'), result.get('automatic_captions')):
            if not tracks:
                continue

            # Preferred Chinese tracks first; '.ass' variants are skipped here.
            for lang in self._PREFERRED_LANGS:
                text = self._track_text(tracks.get(lang), skip_exts=('ass',))
                if text:
                    return text

            # Otherwise fall back to the first track of any language with text.
            for track in tracks.values():
                text = self._track_text(track)
                if text:
                    return text

        return None

    def load(self):
        """Mimic LangChain's ``load()``: return ``[Document]`` for this video."""
        if self.add_video_info:
            self.metadata = self._get_video_info()

        subtitle_text = self._get_subtitles()
        if not subtitle_text:
            subtitle_text = "(No subtitle available)"

        doc = Document(
            page_content=subtitle_text,
            metadata=self.metadata
        )
        return [doc]

In [8]:
from langchain_community.document_loaders import YoutubeLoader

# Original YoutubeLoader call, kept for reference only (disabled in this notebook):
# docs = YoutubeLoader.from_youtube_url(
#     "https://www.youtube.com/watch?v=pbAd8O1Lvm4", add_video_info=True
# ).load()

# docs[0].metadata

# Load the same kind of document from a Bilibili video instead.
loader = BilibiliLoader("https://www.bilibili.com/video/BV166egzvE9H/", add_video_info=True)
docs = loader.load()

# Print the metadata
print(docs[0].metadata)

# # Print the beginning of the subtitle text
# print(docs[0].page_content[:500])



{'title': '终于知道Transformer 为啥离不开 RoPE了', 'description': '为什么 Transformer 需要位置编码？为什么正弦嵌入不够用？RoPE 又是如何通过“旋转”解决长文本记忆的问题？这一期我会彻底讲清 RoPE 的设计思路。看完你会明白，RoPE 不只是一个小改动，而是让大模型真正具备长序列理解能力的关键。', 'author': '机器学习算法应用实战', 'upload_date': '20250820', 'duration': 364.042, 'view_count': 7005, 'thumbnail': 'http://i2.hdslb.com/bfs/archive/bace2a7bba801c6262d63ce69b74e39b65e63f53.jpg', 'webpage_url': 'https://www.bilibili.com/video/BV166egzvE9H/'}


In [9]:
import datetime
from typing import Literal, Optional, Tuple
from langchain_core.pydantic_v1 import BaseModel, Field

class TutorialSearch(BaseModel):
    """Search over a database of tutorial videos about a software library."""

    # --- Required free-text queries -------------------------------------
    content_search: str = Field(
        ..., description="Similarity search query applied to video transcripts."
    )
    title_search: str = Field(
        ...,
        description=(
            "Alternate version of the content search query to apply to video titles. "
            "Should be succinct and only include key words that could be in a video "
            "title."
        ),
    )

    # --- Optional range filters (None means "no filter") ------------------
    # Lower bounds are documented as inclusive, upper bounds as exclusive.
    min_view_count: Optional[int] = Field(
        None, description="Minimum view count filter, inclusive. Only use if explicitly specified."
    )
    max_view_count: Optional[int] = Field(
        None, description="Maximum view count filter, exclusive. Only use if explicitly specified."
    )
    earliest_publish_date: Optional[datetime.date] = Field(
        None, description="Earliest publish date filter, inclusive. Only use if explicitly specified."
    )
    latest_publish_date: Optional[datetime.date] = Field(
        None, description="Latest publish date filter, exclusive. Only use if explicitly specified."
    )
    min_length_sec: Optional[int] = Field(
        None, description="Minimum video length in seconds, inclusive. Only use if explicitly specified."
    )
    max_length_sec: Optional[int] = Field(
        None, description="Maximum video length in seconds, exclusive. Only use if explicitly specified."
    )

    def pretty_print(self) -> None:
        """Print ``name: value`` for every field that is set to a non-default value.

        Fields whose value is ``None`` or equal to the field's declared default
        are omitted.
        """
        for name, model_field in self.__fields__.items():
            value = getattr(self, name)
            if value is None:
                continue
            if value != getattr(model_field, "default", None):
                print(f"{name}: {value}")


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
from datetime import date

# Hand-built example of a structured query: two text queries plus three filters.
# Filters that are left out stay None and are omitted by `pretty_print`.
query = TutorialSearch(
    content_search="how to load documents in LangChain",
    title_search="LangChain document loaders",
    min_view_count=1000,
    earliest_publish_date=date(2023, 1, 1),
    max_length_sec=1200  # < 20 minutes
)

query.pretty_print()

content_search: how to load documents in LangChain
title_search: LangChain document loaders
min_view_count: 1000
earliest_publish_date: 2023-01-01
max_length_sec: 1200


In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# NOTE: this cell rebinds `system`, `prompt`, `llm` and `structured_llm`, which
# the routing section also defined. `router` was already built from the earlier
# objects, so it is unaffected as long as the cells run top to bottom.
system = """You are an expert at converting user questions into database queries. \
You have access to a database of tutorial videos about a software library for building LLM-powered applications. \
Given a question, return a database query optimized to retrieve the most relevant results.

If there are acronyms or words you are not familiar with, do not try to rephrase them."""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
# Use function calling: constrain the model's reply to the TutorialSearch schema.
structured_llm = llm.with_structured_output(TutorialSearch)
# Query analyzer: question -> TutorialSearch instance.
query_analyzer = prompt | structured_llm



In [14]:
# Plain topical question: only the two text queries are expected, no filters.
query_analyzer.invoke({"question": "rag from scratch"}).pretty_print()

content_search: rag from scratch
title_search: rag from scratch


In [15]:
# A question that names a year: the recorded output translates it into an
# inclusive earliest date and an exclusive latest date.
query_analyzer.invoke(
    {"question": "videos on chat langchain published in 2023"}
).pretty_print()

content_search: chat langchain
title_search: chat langchain
earliest_publish_date: 2023-01-01
latest_publish_date: 2024-01-01


In [16]:
# NOTE(review): the recorded output for this cell sets
# earliest_publish_date=2024-01-01, although "published before 2024" suggests
# latest_publish_date should carry that bound — confirm the model's output and
# consider tightening the prompt or field descriptions.
query_analyzer.invoke(
    {"question": "videos that are focused on the topic of chat langchain that are published before 2024"}
).pretty_print()

content_search: chat langchain
title_search: chat langchain
earliest_publish_date: 2024-01-01
