# LangChain 加载 YouTube 音频

In [None]:
# Uncomment to install the packages listed below
# (presumably required by the audio download/transcription cells — confirm).
# !pip -q install yt_dlp pydub ffmpeg ffprobe

In [1]:
import os
import sys

import openai

# Extend the module search path before the remaining import, in the same
# position as before.
sys.path.append('../..')

from dotenv import find_dotenv, load_dotenv

# find_dotenv() looks up a .env file and load_dotenv() reads it; the return
# value is assigned to `_` so the cell shows no output.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Hand the key to the openai module. A missing OPENAI_API_KEY variable raises
# KeyError right here.
openai.api_key = os.environ['OPENAI_API_KEY']

## 1 加载 YouTube 音频文档

### 1.1 加载 YouTube 音频相关包

In [None]:
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [None]:
# Qiusheng Wu's 2023 geemap video
url = "https://www.youtube.com/watch?v=qd5alEojY50"
# Alternative video:
# url = "https://www.youtube.com/watch?v=84i5v1nyJHI"
save_dir = "data/youtube"

# First part: downloads the audio of the video(s) in the URL list into save_dir.
audio_blobs = YoutubeAudioLoader([url], save_dir)

# Second part: the OpenAI Whisper parser that converts the audio into text.
whisper_parser = OpenAIWhisperParser()

# Combine both parts into a GenericLoader instance.
loader = GenericLoader(audio_blobs, whisper_parser)

# Load the audio of the video through the GenericLoader's load() method.
docs = loader.load()

### 1.2 查看加载的数据

In [None]:
# Report the type of `docs` and how many entries it holds, one line each.
for label, value in (("文本类型:", type(docs)), ("文本页数:", len(docs))):
    print(label, value)

In [None]:
# Take the first loaded entry and show its type.
doc = docs[0]
print("类型:", type(doc))

In [None]:
# Show the first 500 characters of the document text.
print("开头部分内容:", doc.page_content[0:500])

# Show the metadata stored alongside the text.
print("视频元数据信息:", doc.metadata)

# Print the complete text of the first document.
print(docs[0].page_content)

### 1.3 存储文本

将 docs 内容存放在 txt 中

In [None]:
# Derive the transcript path from the audio path stored in the metadata by
# swapping the file extension for ".txt". splitext is used instead of replacing
# the literal ".m4a": if the audio file has any other extension, a literal
# replace would leave the path unchanged and opening it in "w" mode would
# truncate the audio file itself.
source_stem, _source_ext = os.path.splitext(docs[0].metadata['source'])
filename = source_stem + '.txt'

# NOTE(review): the platform default text encoding is used here, matching the
# TextLoader cell that reads this file back; confirm before changing either side.
# The context manager closes the file even if a write fails part-way through.
with open(filename, "w") as f:
    # Write the text of each Document on its own line.
    for doc in docs:
        f.write(doc.page_content + "\n")

## 2 分割文本

### 2.1 加载视频脚本文件

In [2]:
from langchain.document_loaders import DirectoryLoader, TextLoader

In [3]:
# Load every .txt file matched by the glob under `path`, each through
# TextLoader, with a progress bar.
path = "./data/youtube"
loader = DirectoryLoader(
    path,
    glob="**/*.txt",
    loader_cls=TextLoader,
    show_progress=True,
)

In [4]:
# Run the directory loader; the result is kept in `pages` for the splitting step.
pages = loader.load()

100%|██████████| 1/1 [00:00<?, ?it/s]


### 2.2 基于字符的文本分割

In [6]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
chunk_size = 450 # chunk size handed to the text splitter
chunk_overlap = 50 # chunk overlap handed to the text splitter

In [8]:
# Initialise the text splitter.
# Separator list, in order: blank line, newline, the position right after ". "
# (a regex lookbehind), a single space, and the empty string.
# The lookbehind is written as a raw string: in a normal string literal "\." is
# an invalid escape sequence, which Python reports with a warning. The string
# value itself is unchanged.
# NOTE(review): whether this entry is treated as a regex or as literal text
# depends on the installed langchain version (newer releases expose an
# `is_separator_regex` option) — confirm against the version in use.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)

In [9]:
# Split the loaded pages into chunks with the splitter configured above.
splits_docs = r_splitter.split_documents(pages)

# Compare the number of documents before and after splitting.
print("分割前:", len(pages), "\n分割后:", len(splits_docs))

分割前: 1 
分割后: 77


In [None]:
# Inspect the text of the first chunk.
splits_docs[0].page_content

## 3 创建向量数据库

### 3.1 加载 Embedding 模型

In [5]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model instance created with default arguments; passed to Chroma below.
embedding = OpenAIEmbeddings()

### 3.2 创建向量数据库

In [None]:
from langchain.vectorstores import Chroma

# Directory handed to Chroma as `persist_directory`.
persist_directory = 'data/chroma/SciPy2023_Presentation_Geemap/'
# Build the vector store from the split chunks using the embedding model.
vectordb = Chroma.from_documents(
    documents=splits_docs,
    embedding=embedding,
    persist_directory=persist_directory
)

# NOTE(review): `_collection` is a private attribute of the store; this count
# may break with other library versions.
print("向量个数: ", vectordb._collection.count())

In [None]:
# Persist the vector database
vectordb.persist()

### 3.3 加载已有向量数据库

In [19]:
from langchain.vectorstores import Chroma

# Open the store from the same directory path used when it was created,
# passing the embedding object as the embedding function.
persist_directory = 'data/chroma/SciPy2023_Presentation_Geemap/'
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

## 4 提出问题并查询结果

In [13]:
# Question used by the cells below; the commented-out lines are alternative questions.
# question = "I want to know how Geemap is used in research?"
question = "我想知道 Geemap 能实现哪些功能?"

# question = "有没有青岡有關的資訊?"

In [20]:
# Similarity search for the question, requesting k=3 results
answer = vectordb.similarity_search(question, k=3)

In [22]:
# Show the text of the second returned result (index 1).
print(answer[1].page_content)

So it's not just for within the interface. Sometimes you want some high-quality figures. You can do that as well. And lastly, you can also, once you create something like this, you want to make it accessible to other people. You can also utilize gmap to create a web app. So the web app allows you to basically be on top of gmap and iPad widget and others.


## 5 基于语义检索内容的 QA 问答

### 5.1 加载模型

In [23]:
from langchain.chat_models import ChatOpenAI

# Chat model passed to the chains below; temperature is set to 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

### 5.2 基于 Prompt 模板的 QA 问答链

In [24]:
from langchain.prompts import PromptTemplate

# Prompt text (in Chinese). It tells the model to answer the final question from
# the supplied context, to say it does not know rather than invent an answer,
# to use at most three sentences, and to end with a fixed thank-you sentence.
# {context} and {question} are the two template placeholders.
template = """使用以下上下文片段来回答最后的问题。
如果你不知道答案，只需说不知道，不要试图编造答案。
答案最多使用三个句子。尽量简明扼要地回答。
在回答的最后一定要说"感谢您的提问！"
{context}
问题：{question}
有用的回答："""
# Wrap the text in a PromptTemplate that declares both placeholders.
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template
)

In [25]:
from langchain.chains import RetrievalQA

# Retrieval QA chain over the vector store that uses the custom prompt and
# also returns the source documents.
prompt_kwargs = dict(prompt=QA_CHAIN_PROMPT)
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type_kwargs=prompt_kwargs,
    return_source_documents=True,
    retriever=vectordb.as_retriever(),
)

In [None]:
# Run the chain on the question and print the "result" entry of its output.
result = qa_chain({"query": question})
print(result["result"])

### 5.3 基于 MapReduce 的检索式问答链

In [26]:
# Retrieval QA chain over the same vector store, using the "map_reduce" chain type.
mr_retriever = vectordb.as_retriever()
qa_chain_mr = RetrievalQA.from_chain_type(llm, chain_type="map_reduce", retriever=mr_retriever)

In [None]:
# Run the map_reduce chain on the question and print the "result" entry.
result = qa_chain_mr({"query": question})
print(result["result"])

### 5.4 基于 Refine 的检索式问答链

In [27]:
# Retrieval QA chain over the same vector store, using the "refine" chain type.
refine_retriever = vectordb.as_retriever()
qa_chain_r = RetrievalQA.from_chain_type(llm, chain_type="refine", retriever=refine_retriever)

In [34]:
# Run the refine chain on the question and print the "result" entry.
result = qa_chain_r({"query": question})
print(result["result"])

Based on the additional context provided, the original answer remains relevant and can be refined to better address the question. Here's the refined answer:

Yes, geemap can be applied to agricultural remote sensing research. Geemap is a Python library based on Google Earth Engine that provides powerful tools and functions for processing and analyzing satellite imagery data. It can be used to create interactive web apps for generating satellite time-lapse animations using various types of satellite data from anywhere around the globe. This capability can be particularly useful in agricultural remote sensing research, allowing researchers to visualize and analyze the changes in agricultural areas over time. Additionally, geemap can be integrated with desktop GIS software such as ArcGIS and QGIS, enabling researchers to further enhance their analysis of agricultural remote sensing data. By combining geemap with these desktop GIS tools, researchers can perform more advanced analysis, such

## 6 实现上下文 QA 问答

In [36]:
# Add a ConversationBufferMemory to hold the conversation
from langchain.memory import ConversationBufferMemory
# The memory is stored under the key "chat_history", with return_messages enabled.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [37]:
from langchain.chains import ConversationalRetrievalChain

# Conversational chain: the LLM plus a retriever over the vector store,
# with the memory object attached.
retriever = vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(llm, memory=memory, retriever=retriever)

In [39]:
# First question put to the conversational chain.
question = "geemap 可以应用于农业遥感的研究吗?"

# The chain takes the text under the "question" key and returns it under "answer".
result = qa({"question": question})
print(result["answer"])

是的，geemap可以应用于农业遥感的研究。通过geemap，您可以使用各种卫星数据来创建农业遥感图像，并进行分析和可视化。您可以提取农田的特征，比如植被指数、土壤湿度等，并进行监测和评估。此外，geemap还可以与其他开源应用程序（如ArcGIS、QGIS）集成，以便更全面地进行农业遥感研究。


In [41]:
# Second question, sent through the same chain object (which has the memory attached).
question = "请总结一下本次 SciPy 2023 报告的主要内容。"

result = qa({"question": question})
print(result["answer"])

我不知道SciPy 2023报告的具体内容，因为上下文中并没有提到这个信息。
