# Query Structuring 4 Metadata filtering
![metadata-filtering](./metadata-filtering.png)

https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/self_query/  
https://python.langchain.com/docs/integrations/vectorstores/upstash/#metadata-filtering  
Many vectorstores contain metadata fields.

This makes it possible to filter for specific chunks based on metadata.

Let's look at some example metadata we might see in a database of YouTube transcripts.

In [1]:
# !pip install --upgrade youtube-transcript-api pytube



In [2]:
from youtube_transcript_api import YouTubeTranscriptApi

video_id = "pbAd8O1Lvm4"
transcript = YouTubeTranscriptApi.get_transcript(video_id)

In [18]:
from langchain_community.document_loaders import YoutubeLoader
from pytube import YouTube
url = "https://www.youtube.com/watch?v=pbAd8O1Lvm4"
try:
    docs = YoutubeLoader.from_youtube_url(
        url, add_video_info=False
    ).load()
    # docs = loader.load()

    # Manually fetch metadata with pytube
    yt = YouTube(url)
    metadata = {
        "title": yt.title,
        "description": yt.description,
        "publish_date": yt.publish_date,
        "channel_url": yt.channel_url,
        "thumbnail_url": yt.thumbnail_url,
    }

    # Attach metadata to each doc
    for doc in docs:
        doc.metadata.update(metadata)

except Exception as e:
    print(f"An error occurred: {e}")

metadata = {'source': 'pbAd8O1Lvm4',
 'title': 'Self-reflective RAG with LangGraph: Self-RAG and CRAG',
 'description': 'Unknown',
 'view_count': 11922,
 'thumbnail_url': 'https://i.ytimg.com/vi/pbAd8O1Lvm4/hq720.jpg',
 'publish_date': '2024-02-07 00:00:00',
 'length': 1058,
 'author': 'LangChain'}

An error occurred: HTTP Error 400: Bad Request


In [19]:
len(transcript)

407

In [20]:
yt

<pytube.__main__.YouTube object: videoId=pbAd8O1Lvm4>

In [21]:
import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# If modifying these scopes, delete the file token.json.
SCOPES = ["https://www.googleapis.com/auth/drive.metadata.readonly"]


def auth():
  """Shows basic usage of the Drive v3 API.
  Prints the names and ids of the first 10 files the user has access to.
  """
  creds = None
  # The file token.json stores the user's access and refresh tokens, and is
  # created automatically when the authorization flow completes for the first
  # time.
  if os.path.exists("keyy.json"):
    creds = Credentials.from_authorized_user_file("keyy.json", SCOPES)
  # If there are no (valid) credentials available, let the user log in.
  if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
      creds.refresh(Request())
    else:
      flow = InstalledAppFlow.from_client_secrets_file(
          "credentials.json", SCOPES
      )
      creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open("key.json", "w") as token:
      token.write(creds.to_json())

  try:
    service = build("drive", "v3", credentials=creds)

    # Call the Drive v3 API
    results = (
        service.files()
        .list(pageSize=10, fields="nextPageToken, files(id, name)")
        .execute()
    )
    items = results.get("files", [])

    if not items:
      print("No files found.")
      return
    print("Files:")
    for item in items:
      print(f"{item['name']} ({item['id']})")
  except HttpError as error:
    # TODO(developer) - Handle errors from drive API.
    print(f"An error occurred: {error}")
#auth()

In [22]:
import datetime
from typing import Literal, Optional, Tuple
from pydantic import BaseModel, Field

class TutorialSearch(BaseModel):
    """Search over a database of tutorial videos about a software library."""

    content_search: str = Field(
        ...,
        description="Similarity search query applied to video transcripts.",
    )
    title_search: str = Field(
        ...,
        description=(
            "Alternate version of the content search query to apply to video titles. "
            "Should be succinct and only include key words that could be in a video "
            "title."
        ),
    )
    min_view_count: Optional[int] = Field(
        None,
        description="Minimum view count filter, inclusive. Only use if explicitly specified.",
    )
    max_view_count: Optional[int] = Field(
        None,
        description="Maximum view count filter, exclusive. Only use if explicitly specified.",
    )
    earliest_publish_date: Optional[datetime.date] = Field(
        None,
        description="Earliest publish date filter, inclusive. Only use if explicitly specified.",
    )
    latest_publish_date: Optional[datetime.date] = Field(
        None,
        description="Latest publish date filter, exclusive. Only use if explicitly specified.",
    )
    min_length_sec: Optional[int] = Field(
        None,
        description="Minimum video length in seconds, inclusive. Only use if explicitly specified.",
    )
    max_length_sec: Optional[int] = Field(
        None,
        description="Maximum video length in seconds, exclusive. Only use if explicitly specified.",
    )

    def pretty_print(self) -> None:
        for field in self.__fields__:
            if getattr(self, field) is not None and getattr(self, field) != getattr(
                self.__fields__[field], "default", None
            ):
                print(f"{field}: {getattr(self, field)}")

In [23]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

system = """You are an expert at converting user questions into database queries. \
You have access to a database of tutorial videos about a software library for building LLM-powered applications. \
Given a question, return a database query optimized to retrieve the most relevant results.

If there are acronyms or words you are not familiar with, do not try to rephrase them."""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
structured_llm = llm.with_structured_output(TutorialSearch)
query_analyzer = prompt | structured_llm

In [24]:
query_analyzer.invoke({"question": "rag from scratch"}).pretty_print()


content_search: rag from scratch
title_search: rag scratch
min_view_count: 100


/var/folders/3h/1wlbxc196jgb9hdyjcjcyn500000gn/T/ipykernel_89310/3806102375.py:46: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  for field in self.__fields__:
/var/folders/3h/1wlbxc196jgb9hdyjcjcyn500000gn/T/ipykernel_89310/3806102375.py:48: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  self.__fields__[field], "default", None


In [25]:
query_analyzer.invoke(
    {"question": "videos on chat langchain published in 2023"}
).pretty_print()

content_search: chat langchain
title_search: chat langchain
earliest_publish_date: 2023-01-01
latest_publish_date: 2023-12-31


/var/folders/3h/1wlbxc196jgb9hdyjcjcyn500000gn/T/ipykernel_89310/3806102375.py:46: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  for field in self.__fields__:
/var/folders/3h/1wlbxc196jgb9hdyjcjcyn500000gn/T/ipykernel_89310/3806102375.py:48: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  self.__fields__[field], "default", None


In [26]:
query_analyzer.invoke(
    {"question": "videos that are focused on the topic of chat langchain that are published before 2024"}
).pretty_print()

content_search: chat langchain
title_search: chat langchain
latest_publish_date: 2024-01-01


/var/folders/3h/1wlbxc196jgb9hdyjcjcyn500000gn/T/ipykernel_89310/3806102375.py:46: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  for field in self.__fields__:
/var/folders/3h/1wlbxc196jgb9hdyjcjcyn500000gn/T/ipykernel_89310/3806102375.py:48: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  self.__fields__[field], "default", None


In [27]:
query_analyzer.invoke(
    {
        "question": "how to use multi-modal models in an agent, only videos under 5 minutes"
    }
).pretty_print()

content_search: how to use multi-modal models in an agent
title_search: multi-modal models agent
min_length_sec: 0
max_length_sec: 300


/var/folders/3h/1wlbxc196jgb9hdyjcjcyn500000gn/T/ipykernel_89310/3806102375.py:46: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  for field in self.__fields__:
/var/folders/3h/1wlbxc196jgb9hdyjcjcyn500000gn/T/ipykernel_89310/3806102375.py:48: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  self.__fields__[field], "default", None
