# 解决大模型数据新鲜度低的问题
* 导入文本
* 使用搜索引擎/外部工具/API
* 向量数据库

In [None]:
import openai
import os


# Credentials and endpoint come from environment variables, so no secret is
# written into the notebook itself.
openai.api_key = os.getenv('MY_API_KEY')
# Only override the endpoint when MY_API_BASE is actually set: assigning None
# here would discard the default base URL the openai module already carries.
openai.api_base = os.getenv('MY_API_BASE') or openai.api_base

# Shared conversation buffer: the message helpers append to this list and
# predict() sends it as the request's `messages`.
chat_history = []

# 定义辅助函数

In [None]:
def user_message(msg):
    """Append `msg` to the shared chat history as a message with role "user"."""
    chat_history.append({"role": "user", "content": msg})

In [None]:
def assistant_message(msg):
    """Append `msg` to the shared chat history as a message with role "assistant"."""
    chat_history.append({"role": "assistant", "content": msg})

In [None]:
def system_message(msg):
    """Append `msg` to the shared chat history as a message with role "system"."""
    chat_history.append({"role": "system", "content": msg})

In [None]:
def clear_history():
    """Empty the shared chat history in place (the list object itself is kept)."""
    del chat_history[:]

In [None]:
import json
def dump_chat_history():
    """Print the whole chat history as a single JSON line.

    `ensure_ascii=False` keeps non-ASCII text (e.g. Chinese) readable instead
    of emitting \\uXXXX escapes.
    """
    serialized = json.dumps(chat_history, ensure_ascii=False)
    print(serialized)

In [None]:
# Helper: one chat-completion round trip over the shared history
def predict(model = 'gpt-3.5-turbo'):
    """Send the shared chat history to the chat-completion API and record the reply.

    Args:
        model: Name of the chat model to call.

    Returns:
        The text content of the first returned choice. The same text is also
        appended to the chat history as an assistant message.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=chat_history,
        user="llm_cource2",
        # temperature=0 asks for the least random sampling; larger values make
        # the output less consistent between runs. OpenAI documents the
        # accepted range as 0 to 2.
        temperature=0,
    )
    reply = completion.choices[0].message['content']
    assistant_message(reply)
    return reply

# 导入文本

In [None]:
# Load knowledge from a plain-text file
def load_knowledge_from_txt(path):
    """Read a UTF-8 text file and return its full content as one string.

    Args:
        path: Path of the text file to read.

    Returns:
        The complete file content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The with-statement closes the file on exit, so no explicit close() is needed.
    with open(path, 'r', encoding='utf8') as file:
        return file.read()

In [None]:
# Smoke test: display the raw text loaded from the sample file.
load_knowledge_from_txt("./example.txt")

In [None]:
!pip3 install PyPDF2

In [None]:
import PyPDF2

# Load knowledge from a PDF file
def load_knowledge_from_pdf(path):
    """Extract the text of every page of a PDF and return it as one string.

    Pages are processed in document order and their text is concatenated with
    no separator between pages.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages ("" for a PDF with no pages).
    """
    with open(path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction stays inside the with-block, as in the original, so the
        # file is still open while pages are read. The with-statement closes
        # the file afterwards; no explicit close() is needed.
        return "".join(page.extract_text() for page in pdf_reader.pages)

In [None]:
# Smoke test: display the text extracted from the sample PDF.
load_knowledge_from_pdf("./example.pdf")

In [None]:
# Baseline: ask the question with an empty history and no extra knowledge supplied.
clear_history()
user_message("Sam Altman 为什么被董事会罢免？")
predict()

In [None]:
# Start from an empty conversation.
clear_history()

# Load the text file whose content is injected into the system prompt.
knowledge = load_knowledge_from_txt("./example.txt")

# System prompt (Chinese): tells the model to answer from the fenced knowledge
# first and to fall back on its own understanding when that is insufficient.
prompt = f"""
你现在是一个问答助手，你需要优先根据以下的知识回答用户问题，如果以下提供的知识不足以回答用户问题，你可以根据自己的理解回答。
```
{knowledge}
```
"""
system_message(prompt)
user_message("Sam Altman 为什么被董事会罢免？")
# Last expression of the cell: the reply is shown as the cell output.
predict()


In [None]:
# Print the full conversation (system prompt, question, reply) as JSON.
dump_chat_history()

In [None]:
# Start from an empty conversation.
clear_history()

# Load the PDF text that is injected into the system prompt.
knowledge = load_knowledge_from_pdf("./example.pdf")

# System prompt (Chinese): tells the model to answer from the fenced knowledge
# first and to fall back on its own understanding when that is insufficient.
prompt = f"""
你现在是一个问答助手，你需要优先根据以下的知识回答用户问题，如果以下提供的知识不足以回答用户问题，你可以根据自己的理解回答。
```
{knowledge}
```
"""
system_message(prompt)
user_message("除了Sam Altman，还有谁离开了openai")
# Last expression of the cell: the reply is shown as the cell output.
predict()


# 问题
* 如果文本太长，超出了 ChatGPT 的 token 限制怎么办？

# 使用搜索引擎

In [None]:
import requests

def search_with_bing(keyword):
    """Query the Bing Web Search v7 API and return the result snippets as one string.

    The subscription key is read from the BING_API_KEY environment variable.

    Args:
        keyword: Search query text.

    Returns:
        The `snippet` fields of the returned web pages concatenated with no
        separator, or "" when the response contains no web-page results.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    headers = {"Ocp-Apim-Subscription-Key": os.getenv('BING_API_KEY')}
    # The market parameter is spelled 'mkt' and takes a hyphenated code such as
    # 'zh-CN'; the previous 'mtk': 'zh_CN' was not a recognised parameter.
    params = {"q": keyword, 'mkt': 'zh-CN', 'count': 15}
    response = requests.get(
        'https://api.bing.microsoft.com/v7.0/search',
        headers=headers,
        params=params,
        timeout=10,  # do not let a stalled connection hang the notebook
    )
    response.raise_for_status()
    search_results = response.json()
    # 'webPages' is missing when the query has no web results; treat as empty.
    web_pages = search_results.get('webPages', {}).get('value', [])
    return "".join(item['snippet'] for item in web_pages)

In [None]:
# Smoke test: display the concatenated snippets returned for a sample query.
search_with_bing("北京今天天气")

In [None]:
# Baseline without search. Reset first, like the other demo cells: otherwise
# the request would also carry whatever earlier cells left in chat_history.
clear_history()
user_message("北京今天天气怎么样？")
predict()

In [None]:
import re

# Start from an empty conversation.
clear_history()

# System prompt (Chinese): when the model does not know the answer it should
# emit "[search('keyword')]", where keyword is a short, search-friendly
# summary of the question; one few-shot example follows.
# NOTE(review): the example spells the role label "assitant" — confirm whether
# that is intentional before changing it, since it is part of the prompt text.
prompt = f"""
你现在是一个AI助手，你需要耐心解答用户问题，如果你不知道，你可以输出'[search('keyword')]'，其中'keyword'是对用户问题的总结，总结需要尽量简洁且对搜索引擎友好，我会使用搜索引擎来协助你回答用户问题。
下面是一个服务例子：
user: 今天天气如何？
assitant: [search('今日天气')]
"""
system_message(prompt)
user_message("北京今天天气怎么样？")
response = predict()

# Matches the tool-call marker and captures the keyword between the single quotes.
pattern = r"\[search\('(.+?)'\)\]"

match = re.search(pattern,response)
if match:
    keyword = match.group(1)
    print(keyword)
    # Run the search and feed the snippets back as a system message, then ask
    # the model again with the enriched history.
    search_result = search_with_bing(keyword)
    system_message(search_result)
    response = predict()
    print(response)

# 向量数据库
使用向量数据库可以更方便地管理你的私有知识库。

这里使用 chromadb。

In [None]:
!pip3 install chromadb

定义几个数据库辅助函数
* 新建数据集
* 插入数据
* 检索数据

In [None]:
import chromadb
from chromadb.utils import embedding_functions

client = chromadb.Client()
# Embeddings are computed through the OpenAI embedding endpoint, using the same
# credentials and endpoint variables as the chat calls.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv('MY_API_KEY'),
    api_base=os.getenv('MY_API_BASE'),
    model_name="text-embedding-ada-002",
)
# Start from an empty collection on every run. On a first run the collection
# does not exist yet and deleting a missing collection raises, so make sure it
# exists before dropping it.
client.get_or_create_collection('my_collection', embedding_function=openai_ef)
client.delete_collection('my_collection')
# The misspelled name `colleciton` is kept because other cells reference it.
colleciton = client.create_collection('my_collection', embedding_function=openai_ef)

In [None]:
# 插入数据
import hashlib
def insertion(doc):
    """Add one document to the vector collection, keyed by its MD5 digest.

    The hex MD5 of the UTF-8 encoded text is used both as the record id and as
    the `md5` metadata value, so identical texts map to the same id.

    Args:
        doc: Document text to store.
    """
    digest = hashlib.md5(doc.encode('utf8')).hexdigest()
    record = {
        "documents": [doc],
        "metadatas": [{'md5': digest}],
        "ids": [digest],
    }
    colleciton.add(**record)

In [None]:
# Retrieve documents from the vector collection
def query_from_vec_db(keyword, n_results=10):
    """Query the vector collection for the documents closest to `keyword`.

    Args:
        keyword: Query text; it is passed to the collection as the only query.
        n_results: Maximum number of matches to request (defaults to 10, the
            value that used to be hard-coded).

    Returns:
        The result object returned by the collection's `query` call. It is
        also printed before being returned.
    """
    results = colleciton.query(
        query_texts=[keyword],
        n_results=n_results,
    )
    print(results)
    return results

插入几条数据

In [22]:
import time

content = load_knowledge_from_txt("./example.txt")
# Split into sentences on the Chinese full stop. Splitting leaves an empty or
# whitespace-only piece after the last "。"; skip those so no blank document
# is embedded and stored.
docs = [doc for doc in content.split("。") if doc.strip()]
for doc in docs:
    insertion(doc)
    # Pause between inserts — presumably to stay under the embedding API's
    # rate limit; confirm before shortening.
    time.sleep(10)
    

Add of existing embedding ID: 23669b5430f11d6766a933fc8c39f04e
Insert of existing embedding ID: 23669b5430f11d6766a933fc8c39f04e
Add of existing embedding ID: 117bb3155c55e8c047eac2991f223a9d
Insert of existing embedding ID: 117bb3155c55e8c047eac2991f223a9d
Add of existing embedding ID: f138d4c353d3a128df6dc59503cec778
Insert of existing embedding ID: f138d4c353d3a128df6dc59503cec778
Add of existing embedding ID: ab9a70148ae795335a209ca3a779f935
Insert of existing embedding ID: ab9a70148ae795335a209ca3a779f935
Add of existing embedding ID: a480b3d721c27662ec801eb2d689f688
Insert of existing embedding ID: a480b3d721c27662ec801eb2d689f688
Add of existing embedding ID: 542cef2a6ac6b2d3825bb59a85ac4d2a
Insert of existing embedding ID: 542cef2a6ac6b2d3825bb59a85ac4d2a
Add of existing embedding ID: a83ff3a688edb373268c7f6a4535658f
Insert of existing embedding ID: a83ff3a688edb373268c7f6a4535658f
Add of existing embedding ID: 1327ac8938fda398d07081c97197790d
Insert of existing embedding ID: 1

In [24]:
# Smoke test: the function prints the result and the cell also displays the
# returned value, so the result appears twice in the output.
query_from_vec_db("Sam Altman 为什么被董事会罢免？")

{'ids': [['117bb3155c55e8c047eac2991f223a9d', '64f966f1121f6f9ca9ce063c45ed041c', 'e58cca1e4c8a41dd649c56fc2633e441', 'f138d4c353d3a128df6dc59503cec778', '31caee0ad1aa8b854a0a53e1dca116d0', '370ad0a4f00f2d76c62260f103fd00db', 'd9f3952b0c2d8dd486aa9dd83ae8ce99', 'afbf1d4740f869f6df4160bae2b56599', 'd81638dc38b0f9b935afa542b1b546f1', 'fc5e3b3dff275d9407215790714a0515']], 'distances': [[0.2700326144695282, 0.270065575838089, 0.3001212179660797, 0.3281254470348358, 0.3289717435836792, 0.3323637545108795, 0.33528652787208557, 0.337248831987381, 0.3567003011703491, 0.3608716130256653]], 'metadatas': [[{'md5': '117bb3155c55e8c047eac2991f223a9d'}, {'md5': '64f966f1121f6f9ca9ce063c45ed041c'}, {'md5': 'e58cca1e4c8a41dd649c56fc2633e441'}, {'md5': 'f138d4c353d3a128df6dc59503cec778'}, {'md5': '31caee0ad1aa8b854a0a53e1dca116d0'}, {'md5': '370ad0a4f00f2d76c62260f103fd00db'}, {'md5': 'd9f3952b0c2d8dd486aa9dd83ae8ce99'}, {'md5': 'afbf1d4740f869f6df4160bae2b56599'}, {'md5': 'd81638dc38b0f9b935afa542b1b5

{'ids': [['117bb3155c55e8c047eac2991f223a9d',
   '64f966f1121f6f9ca9ce063c45ed041c',
   'e58cca1e4c8a41dd649c56fc2633e441',
   'f138d4c353d3a128df6dc59503cec778',
   '31caee0ad1aa8b854a0a53e1dca116d0',
   '370ad0a4f00f2d76c62260f103fd00db',
   'd9f3952b0c2d8dd486aa9dd83ae8ce99',
   'afbf1d4740f869f6df4160bae2b56599',
   'd81638dc38b0f9b935afa542b1b546f1',
   'fc5e3b3dff275d9407215790714a0515']],
 'distances': [[0.2700326144695282,
   0.270065575838089,
   0.3001212179660797,
   0.3281254470348358,
   0.3289717435836792,
   0.3323637545108795,
   0.33528652787208557,
   0.337248831987381,
   0.3567003011703491,
   0.3608716130256653]],
 'metadatas': [[{'md5': '117bb3155c55e8c047eac2991f223a9d'},
   {'md5': '64f966f1121f6f9ca9ce063c45ed041c'},
   {'md5': 'e58cca1e4c8a41dd649c56fc2633e441'},
   {'md5': 'f138d4c353d3a128df6dc59503cec778'},
   {'md5': '31caee0ad1aa8b854a0a53e1dca116d0'},
   {'md5': '370ad0a4f00f2d76c62260f103fd00db'},
   {'md5': 'd9f3952b0c2d8dd486aa9dd83ae8ce99'},
   {'md5

In [27]:
def get_similarest_result(results):
    """Return the document with the smallest distance in a query result.

    Only the first query's results (index 0 of each field) are inspected.
    The closest match is chosen without any distance cutoff; the previous
    implementation started from a threshold of 1 and returned "" whenever
    every distance was 1 or larger.

    Args:
        results: Mapping with parallel 'distances' and 'documents' lists, each
            nested one level per query (e.g. results['distances'][0][i]).

    Returns:
        The document text with the lowest distance (the first one on ties),
        or "" when the result holds no matches.
    """
    distances = results['distances'][0]
    if not distances:
        return ""
    # min() keeps the first index on ties, matching the old strict '<' scan.
    best_idx = min(range(len(distances)), key=distances.__getitem__)
    return results['documents'][0][best_idx]


最后就是例子

In [28]:
# Start from an empty conversation.
clear_history()

question = "Sam Altman为什么被开除？"
# Retrieve candidates from the vector collection and keep the closest document.
similar = get_similarest_result(query_from_vec_db(question))
# Prompt (Chinese): asks the model to summarise the question/answer pair in
# the fenced block.
prompt = f"""
对下面的信息做总结
```
Q:{question}
A:{similar}
```
"""
user_message(prompt)
response = predict()
print(response)

{'ids': [['117bb3155c55e8c047eac2991f223a9d', '64f966f1121f6f9ca9ce063c45ed041c', 'e58cca1e4c8a41dd649c56fc2633e441', 'f138d4c353d3a128df6dc59503cec778', '370ad0a4f00f2d76c62260f103fd00db', 'd9f3952b0c2d8dd486aa9dd83ae8ce99', 'afbf1d4740f869f6df4160bae2b56599', '31caee0ad1aa8b854a0a53e1dca116d0', '23669b5430f11d6766a933fc8c39f04e', 'fc5e3b3dff275d9407215790714a0515']], 'distances': [[0.264531672000885, 0.2734794020652771, 0.2768864333629608, 0.30970245599746704, 0.3169591724872589, 0.32905107736587524, 0.3493810296058655, 0.352994441986084, 0.3698384761810303, 0.3870091140270233]], 'metadatas': [[{'md5': '117bb3155c55e8c047eac2991f223a9d'}, {'md5': '64f966f1121f6f9ca9ce063c45ed041c'}, {'md5': 'e58cca1e4c8a41dd649c56fc2633e441'}, {'md5': 'f138d4c353d3a128df6dc59503cec778'}, {'md5': '370ad0a4f00f2d76c62260f103fd00db'}, {'md5': 'd9f3952b0c2d8dd486aa9dd83ae8ce99'}, {'md5': 'afbf1d4740f869f6df4160bae2b56599'}, {'md5': '31caee0ad1aa8b854a0a53e1dca116d0'}, {'md5': '23669b5430f11d6766a933fc8c3

In [29]:
# Print the full conversation (prompt and reply) as JSON.
dump_chat_history()

[{"role": "user", "content": "\n对下面的信息做总结\n```\nQ:Sam Altman为什么被开除？\nA:\n关于离职原因，OpenAI 方面给出的理由是：Sam Altman 先生的离职是在董事会经过审议后得出的结论，他在与董事会的沟通中始终不坦诚，阻碍了董事会履行职责的能力\n```\n"}, {"role": "assistant", "content": "Sam Altman被开除的原因是他在与董事会的沟通中不坦诚，阻碍了董事会履行职责的能力。"}]


# 大模型数据新鲜度低的问题
* 导入文本，chatpdf
* 使用搜索引擎，外部工具，API
* 向量数据库

下一期看看 LangChain。