In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch

# import
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings

from langchain import LLMChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModel, AutoConfig
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
from torch.mps import empty_cache
import torch
from langchain.chains import RetrievalQA

torch.manual_seed(1234)

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from sentence_transformers import SentenceTransformer, util
from typing import Any, List, Optional
from pydantic import BaseModel


from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer, util
from langchain.llms.base import BaseLLM
from typing import List, Optional, Any

In [2]:
class QwenRunnable(LLM, BaseModel):
    """LangChain ``LLM`` adapter that answers prompts with a Qwen chat model.

    Every call is forwarded to ``model.chat`` with ``history=None``, i.e. as a
    fresh single-turn conversation, so no state is carried between calls.
    """

    # Loaded chat model; must expose ``chat(tokenizer, query=..., history=...)``.
    model: Any
    # Tokenizer handed through to ``model.chat`` on every call.
    tokenizer: Any
    # Device label kept alongside the model; this class never moves the model itself.
    device: str = "cuda:7"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text sent to the model as the user query.
            stop: Optional stop sequences. The model's ``chat`` call is not
                given these, so they are applied afterwards by cutting the
                reply at the earliest occurrence of any of them.
            run_manager: Accepted for compatibility with LangChain versions
                that pass a callback manager; not used.
            **kwargs: Accepted for compatibility with LangChain versions that
                forward extra call options; ignored.

        Returns:
            The model's reply, truncated before the first stop sequence found.
        """
        response, _ = self.model.chat(self.tokenizer, query=prompt, history=None)

        if stop:
            # Empty stop strings would match at index 0 and erase the reply.
            cut_points = [response.find(marker) for marker in stop if marker]
            cut_points = [pos for pos in cut_points if pos != -1]
            if cut_points:
                response = response[:min(cut_points)]
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses to label this LLM implementation."""
        return "qwen"

class Qwen:
    """Convenience wrapper around a Qwen chat model plus an optional retriever.

    Typical use: ``load_model()`` first, then either ``generate_response()``
    for direct chat, or ``load_retriever()`` followed by ``run_qa()`` for
    retrieval-augmented question answering.
    """

    def __init__(self, model_path: str, device: str = "cuda:7"):
        """Record where the model lives; nothing is loaded until ``load_model()``.

        Args:
            model_path: Path or identifier passed to ``from_pretrained``.
            device: Device the model is moved to by ``load_model()``.
        """
        self.model_path = model_path
        self.device = device
        self.tokenizer = None
        self.model = None
        self.llm_runnable = None
        self.retriever = None

    def load_model(self):
        """Load tokenizer, model and generation config from ``self.model_path``.

        The model is moved to ``self.device`` and put in eval mode, and a
        ``QwenRunnable`` wrapping it is stored in ``self.llm_runnable``.

        Raises:
            Exception: Any loading error is printed and then re-raised.
        """
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True)
            self.model.to(self.device)
            self.model.eval()
            self.model.generation_config = GenerationConfig.from_pretrained(self.model_path, trust_remote_code=True)
            self.llm_runnable = QwenRunnable(model=self.model, tokenizer=self.tokenizer, device=self.device)
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def generate_response(self, prompt: str, history: Optional[list] = None):
        """Send ``prompt`` to the chat model, continuing from ``history``.

        Args:
            prompt: User query for this turn.
            history: Conversation history as previously returned by this
                method, or ``None`` to start a new conversation.

        Returns:
            ``(response, new_history)`` on success. On any error the error is
            printed and ``(None, history)`` is returned with the history that
            was passed in, so callers can carry on with the next prompt.
        """
        try:
            response, history = self.model.chat(self.tokenizer, query=prompt, history=history)
            return response, history
        except Exception as e:
            print(f"Error generating response: {e}")
            return None, history

    def load_retriever(
        self,
        doc_path: str,
        embedding_model_path: str,
        embedding_device: str = "cuda:0",
        chunk_size: int = 100,
        chunk_overlap: int = 0,
        separator: str = "\n\n",
    ):
        """Index a UTF-8 text file in Chroma and store a retriever over it.

        Args:
            doc_path: Text file to load.
            embedding_model_path: Model name/path for ``HuggingFaceEmbeddings``.
            embedding_device: Device the embedding model runs on.
            chunk_size: Target chunk size given to the splitter.
            chunk_overlap: Overlap between consecutive chunks.
            separator: String the text is split on before chunks are merged.
                ``CharacterTextSplitter`` only splits at this separator, so a
                piece without it stays whole even when longer than
                ``chunk_size``; pass e.g. ``"\\n"`` for line-oriented files.

        Raises:
            ValueError: If the chunk settings are inconsistent.
            Exception: Any loading/indexing error is printed and re-raised.
        """
        # Validate up front so a bad setting fails fast instead of surfacing
        # from deep inside the splitter.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if chunk_overlap < 0 or chunk_overlap > chunk_size:
            raise ValueError(
                f"chunk_overlap must be between 0 and chunk_size ({chunk_size}), got {chunk_overlap}"
            )

        try:
            # Load documents
            loader = TextLoader(doc_path, encoding="utf-8")
            documents = loader.load()

            # Split documents into chunks
            text_splitter = CharacterTextSplitter(
                separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
            docs = text_splitter.split_documents(documents)

            # Create the embedding function
            model_kwargs = {'device': embedding_device}
            embedding_function = HuggingFaceEmbeddings(model_name=embedding_model_path, model_kwargs=model_kwargs)

            # Load into Chroma
            db = Chroma.from_documents(docs, embedding_function)
            self.retriever = db.as_retriever()
        except Exception as e:
            print(f"Error loading retriever: {e}")
            raise

    def run_qa(self, query: str):
        """Answer ``query`` with a "stuff" RetrievalQA chain over the retriever.

        Returns:
            The chain's answer, or ``None`` if the model or retriever has not
            been loaded yet or the chain raised (the error is printed).
        """
        # Without this guard the failure would come from inside LangChain with
        # a message that does not point at the missing setup step.
        if self.llm_runnable is None or self.retriever is None:
            print("Error running QA: call load_model() and load_retriever() before run_qa()")
            return None

        try:
            qa = RetrievalQA.from_chain_type(llm=self.llm_runnable, chain_type="stuff", retriever=self.retriever)
            return qa.run(query)
        except Exception as e:
            print(f"Error running QA: {e}")
            return None

class TopicGPTWithQwen(Qwen):
    """Topic-modelling pipeline driven by the Qwen chat model.

    The four stages (generate, refine, assign, self-correct) each keep one
    running chat ``history`` so that later prompts in the same stage see the
    earlier exchanges.
    """

    def generate_topics(self, documents, example_topics):
        """Ask the model for a topic per document, starting from example topics.

        Args:
            documents: Iterable of document texts.
            example_topics: Seed topics shown in every prompt.

        Returns:
            A new list: the example topics followed by every non-empty model
            response, in document order. ``example_topics`` is not modified.
        """
        history = []
        topics = list(example_topics)
        for doc in documents:
            prompt = f"Document: {doc}\nExample Topics: {example_topics}\nGenerate a new topic if the document doesn't fit existing topics."
            response, history = self.generate_response(prompt, history)
            if response:
                topics.append(response)
        return topics

    @staticmethod
    def _is_rejection(response):
        """Return True when the reply's first word is "no" or "disagree".

        Comparing the whole reply would miss answers such as "No." or
        "No, because ...", so only the first word is inspected, ignoring case
        and surrounding punctuation.
        """
        words = response.strip().lower().split(maxsplit=1)
        if not words:
            return False
        return words[0].strip(".,!?;:'\"()[]") in ("no", "disagree")

    def refine_topics(
        self,
        topics,
        embedding_model_path='/data1/dxw_data/llm/paraphrase-multilingual-MiniLM-L12-v2',
        similarity_threshold=0.5,
    ):
        """Drop near-duplicate topics, then let the model veto the survivors.

        A topic is discarded when any *later* topic has cosine similarity
        ``>= similarity_threshold`` with it, so the last member of a group of
        similar topics is the one retained. Each remaining topic is then shown
        to the model and kept unless the reply starts with "no"/"disagree".
        A topic is also dropped when the model returns no reply for it.

        Args:
            topics: Topic strings to refine.
            embedding_model_path: SentenceTransformer model used for the
                similarity comparison.
            similarity_threshold: Cosine similarity at or above which two
                topics count as duplicates.

        Returns:
            List of topics that survived both steps (empty for empty input).
        """
        topics = list(topics)
        if not topics:
            # Nothing to compare; also avoids loading the encoder for no work.
            return []

        model = SentenceTransformer(embedding_model_path)
        topic_embeddings = model.encode(topics, convert_to_tensor=True)
        # One matrix of all pairwise similarities instead of one call per pair.
        similarity = util.cos_sim(topic_embeddings, topic_embeddings)

        refined_topics = []
        for i, topic in enumerate(topics):
            if topic in refined_topics:
                continue
            later = similarity[i, i + 1:]
            if later.numel() and bool((later >= similarity_threshold).any()):
                continue
            refined_topics.append(topic)

        final_topics = []
        history = []
        for topic in refined_topics:
            prompt = f"Topic: {topic}\nRefined Topics: {refined_topics}\nDo you agree this topic should be kept?"
            response, history = self.generate_response(prompt, history)
            if response and not self._is_rejection(response):
                final_topics.append(topic)

        return final_topics

    def assign_topics(self, documents, topics):
        """Ask the model to pick the most relevant topic for each document.

        Returns:
            Dict mapping each document text to the model's reply. Documents
            for which the model produced no reply are omitted; identical
            document texts share one entry.
        """
        history = []
        assignments = {}

        for doc in documents:
            prompt = f"Document: {doc}\nTopics: {topics}\nAssign the most relevant topic to the document and provide a quote."
            response, history = self.generate_response(prompt, history)
            if response:
                assignments[doc] = response

        return assignments

    def self_correct(self, assignments):
        """Re-query the model for assignments that look invalid.

        An assignment is treated as invalid when it contains the substring
        "None" or "Error". If the re-query yields no reply, the original
        assignment is kept so the document is never silently dropped.

        Args:
            assignments: Dict of document text -> assignment text.

        Returns:
            New dict with one entry per input document.
        """
        history = []
        corrected_assignments = {}

        for doc, assignment in assignments.items():
            if "None" in assignment or "Error" in assignment:
                prompt = f"Document: {doc}\nError: {assignment}\nPlease reassign a valid topic."
                response, history = self.generate_response(prompt, history)
                corrected_assignments[doc] = response if response else assignment
            else:
                corrected_assignments[doc] = assignment

        return corrected_assignments

# Usage example

# Path to the model directory
model_path = "/data1/dxw_data/llm/Qwen-VL-Chat"
# CUDA device the chat model is moved to by load_model()
device = 'cuda:7'

# Instantiate and load the model
# (the constructor only stores the settings; load_model() reads the tokenizer,
# weights and generation config from model_path)
qwen_model = TopicGPTWithQwen(model_path, device)
qwen_model.load_model()

2024-06-20 13:03:11.978561: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-20 13:03:12.099278: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-20 13:03:12.696315: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2024-06-20 13:03:12.696386: W tensorflow/compiler/xla/stream_exec

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

In [3]:
# Sample data
# Short English documents used by the topic generation / assignment stages.
documents = [
    "The stock market saw a significant increase in value due to positive economic policies.",
    "New agricultural techniques have improved crop yields significantly.",
    "Tech companies are investing heavily in artificial intelligence research.",
    "Economic growth is expected to continue with new trade agreements.",
    "Farmers are adopting new technologies to boost production."
]

# Seed topics in "Name: description" form, shown to the model in every
# generate_topics prompt.
example_topics = [
    "Economy: Mentions policies, growth, and financial markets.",
    "Agriculture: Discusses farming techniques, crop yields, and agricultural policies."
]

# Load the retriever
# Note: the retriever indexes the text file at doc_path, not the `documents`
# list above; it is only used by run_qa().
doc_path = "/data1/dxw_data/llm/mkt_llm/starbuck/starbuck_comments_1.txt"
embedding_model_path = "/data1/dxw_data/llm/text2vec-large-chinese"
qwen_model.load_retriever(doc_path, embedding_model_path, embedding_device='cuda:6')



Created a chunk of size 127, which is longer than the specified 100
Created a chunk of size 119, which is longer than the specified 100
Created a chunk of size 101, which is longer than the specified 100
Created a chunk of size 229, which is longer than the specified 100
Created a chunk of size 105, which is longer than the specified 100
Created a chunk of size 152, which is longer than the specified 100
Created a chunk of size 101, which is longer than the specified 100
Created a chunk of size 115, which is longer than the specified 100
Created a chunk of size 182, which is longer than the specified 100
Created a chunk of size 104, which is longer than the specified 100
Created a chunk of size 185, which is longer than the specified 100
Created a chunk of size 199, which is longer than the specified 100
Created a chunk of size 104, which is longer than the specified 100
Created a chunk of size 404, which is longer than the specified 100
No sentence-transformers model found with name /

In [None]:
# ------------- Closer to the AgentVerse paper

# Explanation
# generate_topics: uses the agent to process each document and generate new topics. The history is saved and passed back in, so every call includes the preceding context.
# refine_topics: encodes the topics with a SentenceTransformer model, then removes similar topics based on cosine similarity. The agent then reviews the topics one by one, keeping context through history, to decide the final topics.
# assign_topics: the agent processes the documents one by one, assigns the most relevant topic and provides a quote. Context is kept through history.
# self_correct: checks and fixes invalid assignments. The agent processes the affected documents one by one, keeping context through history while providing the correction.

In [4]:
# Generate topics
# Result is the example topics followed by one model reply per document.
generated_topics = qwen_model.generate_topics(documents, example_topics)
print("Generated Topics:")
print(generated_topics)

# Refine topics
# Removes near-duplicates by embedding similarity, then asks the model
# whether each remaining topic should be kept.
refined_topics = qwen_model.refine_topics(generated_topics)
print("Refined Topics:")
print(refined_topics)

# Assign topics
# Maps each document text to the model's free-text assignment.
assignments = qwen_model.assign_topics(documents, refined_topics)
print("Topic Assignments:")
for doc, assignment in assignments.items():
    print(f"Document: {doc}\nAssignment: {assignment}\n")

# Self-correct
# Re-queries the model for assignments containing "None" or "Error".
corrected_assignments = qwen_model.self_correct(assignments)
print("Corrected Topic Assignments:")
for doc, assignment in corrected_assignments.items():
    print(f"Document: {doc}\nCorrected Assignment: {assignment}\n")

# Example QA usage
# The query asks, in Chinese, which topics the user comments fall into; it is
# answered from the file indexed by load_retriever(), not from `documents`.
query = "根据文档内容,请说明有哪些这些用户评论分为哪些主题"
qa_response = qwen_model.run_qa(query)
print(qa_response)

Generated Topics:
['Economy: Mentions policies, growth, and financial markets.', 'Agriculture: Discusses farming techniques, crop yields, and agricultural policies.', "['Investment: Analyzes the impact of positive economic policies on the stock market and financial markets as a whole']", "['Agriculture: Examines the benefits of new agricultural techniques on crop yields and their potential impact on the economy and financial markets']", "['Tech Industry: Analyzes the potential impact of increased investment in artificial intelligence research on the tech industry and the economy as a whole']", "['Trade: Examines the potential benefits and risks of new trade agreements on economic growth and the financial markets']", "['Agriculture: Analyzes the impact of new technologies on crop yields and the potential benefits for farmers and the economy as a whole']"]
Refined Topics:
["['Tech Industry: Analyzes the potential impact of increased investment in artificial intelligence research on the t

  warn_deprecated(


根据文档内容，用户评论主要分为以下主题：
1. 老婆饼：有用户表示御蝶坊的老婆饼是最有名的，味道好，但也有用户表示购买的老婆饼口味发酸，怀疑是之前未卖完的重新装袋打日期卖的。
2. 生日蛋糕：有用户表示御蝶坊的生日蛋糕非常受欢迎，性价比高，是经常购买的款式之一。
3. 面包：有用户表示御蝶坊的面包是真材实料，买给孩子吃放心，每周都会多次购买作为早餐，且办理了会员卡。
4. 蛋挞：有用户表示御蝶坊的蛋挞非常好吃，特别是海盐蛋糕和肉松小贝，香芋那个也很好吃。
5. 价格：有用户表示御蝶坊的产品价格越来越高，性价比有所下降。
6. 味道：有用户表示御蝶坊的产品味道一直不错，特别是老婆饼和蛋挞，但也有用户表示价格越来越高，感觉不值。
7. 服务：有用户表示御蝶坊的服务态度很好，卫生环境也很好。
