In [1]:
import gradio as gr
from typing import Optional, List, Any
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer, AutoProcessor
# from modelscope import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.prompts import PromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain_core.outputs.generation import GenerationChunk
from torch import device
import time
from langchain.memory import ConversationBufferMemory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.chains.history_aware_retriever import create_history_aware_retriever

from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.runnables import RunnableWithMessageHistory
from langchain.chains.combine_documents import create_stuff_documents_chain

from threading import Thread
import warnings
warnings.filterwarnings("ignore")

In [2]:
# 构建模型
class QianWenLLM(LLM):
    # 基于本地的QianWen7B-Chat模型自定义LLM类
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None
    processor: AutoProcessor = None
    
    def __init__(self, model_dir: str):
        # 从本地加载模型
        super().__init__()
        print('正从本地加载模型。。。。。')

        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_dir,
            trust_remote_code=True,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=model_dir,
            device_map='auto',
            trust_remote_code=True,
            torch_dtype=torch.bfloat16
            )
        self.model = self.model.eval()
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_dir,
            trust_remote_code=True
        )
        # 可指定不同的生成长度、top_p等相关超参
        self.processor = AutoProcessor.from_pretrained(model_dir)
        print('模型加载完成！')

    def _call(self, prompt: str,
              stop: Optional[List[str]] = None,
              run_manager: Optional[CallbackManagerForLLMRun] = None,
              **kwargs: Any):
        # print('_call函数内查看prompt', prompt)
        response, history = self.model.chat(self.tokenizer, prompt, history=[])
        return response
        
    @property
    def _llm_type(self) -> str:
        return "QwenLM"

In [3]:
# 构建改写问题proompt
def contextualize_question_prompt():
    system_prompt = """\
    请根据聊天历史和最后用户的问题，改写用户最终剔除的问题。
    你只需要改写用户最终的问题，请不要回答问题。
    没有聊天历史则将用户问题直接返回，有聊天历史则改写。
    """
    contextualize_question_prompt = ChatPromptTemplate(
    [
        ('system', system_prompt),
        MessagesPlaceholder('chat_history'),
        ('human', '{input}')
    ]
    )
    return contextualize_question_prompt

# 构建正常问答prompt
def answer_prompt():
    system_prompt = """\
    使用上下文来回答最后的问题。如果你不知道答案，就说你不知道，不要试图编造答案。
    {context}
    """
    # 问题: {question}
    # 有用的回答:
    
    qa_prompt = ChatPromptTemplate(
    [
    ('system', system_prompt),
        MessagesPlaceholder('chat_history'),
        ('human', '{input}')
    ]
    )
    return qa_prompt

# 构造存储函数
def get_session_history(session_id: str):
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

In [4]:
def qa_history_chain():
    """
    构建问答链
    :param persist_directory: 知识库本地保存路径，这里初始化政策信息知识库
    :return: 返回调用LLM回答
    """
    persist_directory = r'vectordb/chroma'
    embeddings_model_cache_path = r'autodl-tmp/embedding_model/Ceceliachenen/paraphrase-multilingual-MiniLM-L12-v2'
    # 加载词向量模型
    embeddings = HuggingFaceEmbeddings(
        model_name=embeddings_model_cache_path)
    # 加载缓存知识库
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
    # 初始化模型
    model_cache_path = r'autodl-tmp/Qwen/Qwen-7B-Chat'
    llm = QianWenLLM(model_dir=model_cache_path)

    
    question_prompt = contextualize_question_prompt()
    history_aware_retriever = create_history_aware_retriever(llm, vectordb.as_retriever(), question_prompt)
    
    qa_prompt = answer_prompt()
    qa_chain = create_stuff_documents_chain(llm, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, qa_chain)
    
    # 包装
    conversational_rag_chain = RunnableWithMessageHistory(
        runnable=rag_chain,
        get_session_history=get_session_history,
        input_messages_key='input',
        history_messages_key='chat_history',
        output_messages_key='answer'
    )

    return conversational_rag_chain

In [5]:
store= {}

In [6]:
# 链接前面的函数
class Model_center():
    """
      存储问答 Chain 的对象
    """

    def __init__(self, ):
        print('初始化知识库问答链。。。。')
        self.qa_chain = qa_history_chain()

    def qa_chain_self_answer(self, question: str, chat_history: list = []):
        print('调用问答链')
        # print('打印用户问题', question)
        if question == None or len(question) < 1:
            print('问答为空。。。。')
            return '', chat_history
        # try:
        print('调用检索问答链。。。。')
        # 结果调用下流式输出
        # response = self.qa_chain.invoke({'query': question})['result']
        
        # 检索问答链+历史聊天组件
        response = self.qa_chain.invoke(
            {'input': question},
            config={'configurable':{'session_id':'test123'}}
        )['answer']

        chat_history.append([question, response])
        # print(chat_history)
        return '', chat_history
                
        # except Exception as e:
        #     print('问答链报错', e)
        #     return e, chat_history

    def clear_history(self):
        self.qa_chain.clear_history()

In [7]:
model_center = Model_center()
def update_chatbot(question, chat_history):
    for char in model_center.qa_chain_self_answer(question, chat_history):
        gr.update(value=chat_history)
        chat_history.append((question, char))
    return chat_history
# def demo():
block = gr.Blocks()
with block as demo:
    with gr.Row(equal_height=True):  # 水平排列子组件
        with gr.Column(scale=15):  # 垂直排列子组件
            gr.Markdown("""<h1><center>QwenLM7B-Chat</center></h1><center>科大讯飞实践-招中标政策智能问答助手</center>""")

    with gr.Row():
        with gr.Column(scale=4):
            # 创建聊天界面的组件。height=450 参数设置了聊天界面的高度为 450 像素。
            # show_copy_button=True参数表示在聊天界面中显示一个复制按钮，允许用户复制聊天内容
            chatbot = gr.Chatbot(height=450, show_copy_button=True)
            # 创建一个文本框组件，用于输入 prompts。
            msg = gr.Textbox(label='Prompt/问题')

            with gr.Row():
                # 创建提交按钮
                db_wo_his_btn = gr.Button('Chat')
            with gr.Row():
                # 创建一个清除按钮，用于清除聊天机器人组件的内容。
                clear_btn = gr.ClearButton(components=[chatbot], value='Clear console')

        # 设置按钮的点击事件。当点击时，调用上面定义的 qa_chain_self_answer 函数，并传入用户的消息和聊天历史记录，然后更新文本框和聊天机器人组件。
        print('进度1')
        # 设置流式输出
        def bot(question, history):
            # print('bot_question',question)
            # print('bot_history',history)
            curr, response = model_center.qa_chain_self_answer(question, history)
            # print('response', response)
            # print('curr', curr)
            history = response
            bot_message = history[-1][1]
            # print('bot_message', bot_message)
            history[-1][1] = ''
            for character in bot_message:
                history[-1][1] += character
                # print(f'累计中：{history}')
                time.sleep(0.1)
                yield '', history
        
        db_wo_his_btn.click(bot, inputs=[msg, chatbot], outputs=[msg, chatbot])

        print('进度2')
        # 点击后清空后端存储的聊天记录
        clear_btn.click(model_center.clear_history)

    # 填写注意事项
    gr.Markdown(
        """
        提醒：<br>
        1. 初始化数据库实践可能较长，请耐心等待。
        2. 使用中如果出现异常，将会在文本输入框进行展示，请不要惊慌。 <br>
        """
    )
# gr.close_all()
# 直接启动
demo.queue()
demo.launch(server_name='127.0.0.1', server_port=6006)

初始化知识库问答链。。。。
正从本地加载模型。。。。。


The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Try importing flash-attention for faster inference...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

The repository for autodl-tmp/Qwen/Qwen-7B-Chat contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/autodl-tmp/Qwen/Qwen-7B-Chat.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y
The repository for autodl-tmp/Qwen/Qwen-7B-Chat contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/autodl-tmp/Qwen/Qwen-7B-Chat.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


模型加载完成！
进度1
进度2
Running on local URL:  http://127.0.0.1:6006

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 3.50.2, however version 4.44.1 is available, please upgrade.
--------
调用问答链
调用检索问答链。。。。


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


调用问答链
调用检索问答链。。。。
调用问答链
调用检索问答链。。。。
调用问答链
调用检索问答链。。。。


In [8]:
store

{'test123': InMemoryChatMessageHistory(messages=[HumanMessage(content='你好'), AIMessage(content='您好！有什么我能帮助您的吗？'), HumanMessage(content='王二有个哥哥叫王大'), AIMessage(content='这是一个假设性的场景，没有实际背景或上下文可供参考。请提供更多信息以便我可以更好地回答您的问题。'), HumanMessage(content='王二的哥哥叫什么名字'), AIMessage(content='王大的名字。'), HumanMessage(content='王二的哥哥叫什么'), AIMessage(content='王大的名字是王二的哥哥。')])}