In [1]:
import gradio as gr
from typing import Optional, List, Any
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer, AutoProcessor
# from modelscope import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.prompts import PromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain_core.outputs.generation import GenerationChunk
from torch import device
import time
from langchain.memory import ConversationBufferMemory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.chains.history_aware_retriever import create_history_aware_retriever

from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.runnables import RunnableWithMessageHistory
from langchain.chains.combine_documents import create_stuff_documents_chain
from transformers import pipeline

# from threading import Thread
import warnings
warnings.filterwarnings("ignore")

In [2]:
device = torch.device("cuda")

In [3]:
# 构建模型
class GemmaLLM(LLM):
    # 基于本地的QianWen7B-Chat模型自定义LLM类
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None
    processor: AutoProcessor = None
    
    def __init__(self, model_dir: str):
        # 从本地加载模型
        super().__init__()
        print('正从本地加载模型。。。。。')

        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_dir,
            trust_remote_code=True,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=model_dir,
            # device_map='auto',
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            # temperature=0.1
            ).to(device)
        self.model = self.model.eval()
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_dir,
            trust_remote_code=True
        )
        # 可指定不同的生成长度、top_p等相关超参
        self.processor = AutoProcessor.from_pretrained(model_dir)
        print('模型加载完成！')

    def _call(self, messages,
              stop: Optional[List[str]] = None,
              run_manager: Optional[CallbackManagerForLLMRun] = None,
              **kwargs: Any):
        messages = [
            {"role": "user", "content": messages}
        ]
        text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = self.tokenizer([text], return_tensors='pt').to(device)
        # print('input_ids:', input_ids)
        # input_ = torch.tensor([input_ids])
        # outputs = self.model.generate(**input_ids,max_new_tokens=1024)
        # response = outputs[0][input_ids.shape[-1] :]
        # res = self.tokenizer.decode(response, skip_special_tokens=True)
        generated_ids = self.model.generate(
            **model_inputs,
            max_new_tokens=1024
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        
        # print('模型输出：',response)
        return response
        
    @property
    def _llm_type(self) -> str:
        return "Gemma2LM"

In [4]:
def init_db():
    persist_directory = r'vectordb/chroma/m3e_base'
    embeddings_model_cache_path = r'autodl-tmp/embedding_model/m3e/AI-ModelScope/m3e-base'
    # 加载词向量模型
    embeddings = HuggingFaceEmbeddings(
        model_name=embeddings_model_cache_path)
    # 加载缓存知识库
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
    return vectordb

In [5]:
def init_model(vectordb):
    # 初始化模型
    # model_cache_path = r'autodl-tmp/Qwen/Qwen-7B-Chat'
    model_cache_path = r'autodl-tmp/LLM-Research/gemma-2-9b-it'
    llm = GemmaLLM(model_dir=model_cache_path)
    template = """
        使用上下文来回答最后的问题。如果你不知道答案，就说你不知道，不要试图编造答案。总是在回答的最后说“谢谢你的提问！”。
        {context}
        问题：{question}
        """
    QA_CHAIN_PROMPT = PromptTemplate(input_variables=['context', 'question'], template=template)

    # 构造检索问答链
    qa_chain_ = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectordb.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={'prompt': QA_CHAIN_PROMPT},
        # chain_type='stuff'
    )
    return qa_chain_

In [6]:
# 加载数据
import pandas as pd
import csv
import os
import re
from tqdm import tqdm
# 加载数据文本，提取问题和真实数据源
test_res = []
qa_data = pd.read_excel('qa_policy_new.xlsx').iloc[:, 2:]
length = len(qa_data.iloc[0, :])
history = []

# 初始化模型
vectordb = init_db()
model = init_model(vectordb)

file = 'gemma2模型问答测试结果.csv'
# data_res = pd.DataFrame([], columns=['Question','Answer', 'reference', 'Metadata', 'Text', '回答', 'RAG召回'])
if os.path.exists(file):
    data_tested = pd.read_csv(file, encoding='utf-8')['Question'].tolist()
else:
    data_tested = []
for q in tqdm(range(len(qa_data))):
    question = qa_data.iloc[q, 0]
    
    if question not in data_tested:
        # print('打印问题:', question)
        response = model.invoke({'query': question})#['result']
        # print('打印问答链结果：', response)
        answer = response['result']
        call = response['source_documents']
        # target = model._call(question)
        # print('response参数：', response)
        data_ = qa_data.iloc[q].values.tolist() + [answer] + [call]
        with open(file, 'a', newline='', encoding='utf-8') as f:
            fw = csv.writer(f)
            if not os.path.getsize(file):
                header = ['Question', 'Answer', 'reference', 'Metadata', 'Text', '回答', 'RAG召回']
                fw.writerow(header)
            fw.writerow(data_)
    #         # 写入标题行
    # writer.writerow(headers)
    # data_.to_excel('qwen2.5模型测试结果.xlsx', index=False)



正从本地加载模型。。。。。


OSError: Incorrect path_or_model_id: 'autodl-tmp/LLM-Research/gemma-2-9b-it'. Please provide either the path to a local folder or the repo_id of a model on the Hub.