In [1]:
import json
from pydantic import BaseModel
from typing import Any, Optional, List
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.schema import Document
from sentence_transformers import SentenceTransformer, util
import torch
from torch.cuda.amp import autocast

class QwenRunnable(BaseModel):
    """Pydantic container exposing a Qwen chat model through an LLM-style ``_call``.

    The fields are typed ``Any`` because pydantic is not asked to validate the
    model or tokenizer objects; they are simply stored and used as-is.
    """

    # Chat-capable model object; ``_call`` invokes its ``chat`` method.
    model: Any
    # Tokenizer passed to ``model.chat`` on every call.
    tokenizer: Any
    # Device string associated with the model (stored, not read by this class).
    device: str

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Run a single-turn chat (no history) and return the model's reply.

        Args:
            prompt: Text sent to the model as the query.
            stop: Optional stop sequences. When given, the reply is cut just
                before the earliest occurrence of any of them. ``None`` or an
                empty list returns the reply unchanged.

        Returns:
            The (possibly truncated) response text.
        """
        response, _ = self.model.chat(self.tokenizer, query=prompt, history=None)
        if stop:
            # ``model.chat`` is not given the stop list, so enforce it on the
            # finished text. Empty strings are skipped because ``find("")``
            # is always 0 and would erase the whole reply.
            cut_points = [response.find(token) for token in stop if token]
            cut_points = [index for index in cut_points if index != -1]
            if cut_points:
                response = response[:min(cut_points)]
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM wrapper."""
        return "qwen"

class Qwen:
    """Local Qwen chat model bundled with a Chroma vector store and retriever.

    ``load_model`` must be called before ``generate_response``, and
    ``load_retriever`` before ``update_vector_db``. ``retrieve_knowledge``
    returns ``None`` until a retriever has been created.
    """

    def __init__(self, model_path: str, device: str):
        """Record configuration only; nothing is loaded until the ``load_*`` calls.

        Args:
            model_path: Path or identifier passed to ``from_pretrained``.
            device: Device string the model is moved to in ``load_model``.
        """
        self.model_path = model_path
        self.device = device
        self.tokenizer = None
        self.model = None
        self.llm_runnable = None
        self.retriever = None
        self.vector_db = None

    @staticmethod
    def _split_documents(documents):
        """Split ``Document`` objects into chunks with the one splitter config used here."""
        text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
        return text_splitter.split_documents(documents)

    def load_model(self):
        """Load tokenizer, model and generation config, then build the runnable wrapper."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True)
        self.model.to(self.device)
        self.model.eval()
        self.model.generation_config = GenerationConfig.from_pretrained(self.model_path, trust_remote_code=True)
        self.llm_runnable = QwenRunnable(model=self.model, tokenizer=self.tokenizer, device=self.device)

    def load_retriever(self, doc_paths: List[str], embedding_model_path: str, embedding_device: str):
        """Build the Chroma store and retriever from JSON knowledge files.

        Args:
            doc_paths: Files containing JSON objects with a ``documents`` list
                and an ``example_topics`` list.
            embedding_model_path: Model name/path given to ``HuggingFaceEmbeddings``.
            embedding_device: Device string for the embedding model.

        Returns:
            Tuple ``(documents, example_topics)``: the unsplit ``Document``
            objects and the concatenated example topics from all files.
        """
        documents = []
        example_topics = []

        for doc_path in doc_paths:
            with open(doc_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            documents.extend(Document(page_content=doc) for doc in data['documents'])
            example_topics.extend(data['example_topics'])

        # Chunk the documents before embedding them
        docs = self._split_documents(documents)

        # Create the embedding function
        model_kwargs = {'device': embedding_device}
        embedding_function = HuggingFaceEmbeddings(model_name=embedding_model_path, model_kwargs=model_kwargs)

        # Load into Chroma
        db = Chroma.from_documents(docs, embedding_function)
        self.retriever = db.as_retriever()
        self.vector_db = db

        return documents, example_topics

    def update_vector_db(self, new_documents: List[str]):
        """Chunk raw text documents and add them to the existing vector store.

        Args:
            new_documents: Plain-text documents to add.

        Raises:
            ValueError: If ``load_retriever`` has not created the store yet.
        """
        # Check identity rather than truthiness, and fail before doing any work.
        if self.vector_db is None:
            raise ValueError("Vector database not initialized.")

        doc_objects = [Document(page_content=doc) for doc in new_documents]
        docs = self._split_documents(doc_objects)

        # Nothing to insert; avoid handing the store an empty batch.
        if not docs:
            return
        self.vector_db.add_documents(docs)

    def append_to_expert_db(self, doc_path: str, new_data: dict):
        """Append documents and topics to a JSON knowledge file in place.

        Args:
            doc_path: JSON file with ``documents`` and ``example_topics`` lists.
            new_data: Dict with the same two keys; its lists are appended.
        """
        with open(doc_path, 'r+', encoding='utf-8') as file:
            data = json.load(file)
            data['documents'].extend(new_data['documents'])
            data['example_topics'].extend(new_data['example_topics'])
            file.seek(0)
            json.dump(data, file, ensure_ascii=False, indent=4)
            # The new serialisation can be shorter than the old content (e.g.
            # the file was written with different indentation or escaping);
            # drop the stale tail so the file remains valid JSON.
            file.truncate()

    def generate_response(self, prompt: str):
        """Run a single-turn chat (no history) under autocast and return the reply text."""
        with autocast():
            response, _ = self.model.chat(self.tokenizer, query=prompt, history=None)
        return response

    def retrieve_knowledge(self, query: str):
        """Return documents relevant to ``query``, or ``None`` if no retriever is loaded."""
        if self.retriever is None:
            return None
        # LangChain retrievers expose ``invoke`` (current releases) or
        # ``get_relevant_documents`` (older releases); there is no ``retrieve``.
        fetch = getattr(self.retriever, "invoke", None) or self.retriever.get_relevant_documents
        return fetch(query)

class TopicGPTWithQwen(Qwen):
    """Topic generation, refinement, assignment and self-correction on top of ``Qwen``."""

    # Sentence-embedding model used by ``refine_topics`` when none is passed in.
    DEFAULT_REFINE_MODEL = '/data1/dxw_data/llm/paraphrase-multilingual-MiniLM-L12-v2'

    def __init__(self, model_path: str, device: str, embedding_device: str, refine_device: str):
        """Store device choices for the chat, embedding and refinement models.

        Args:
            model_path: Path or identifier of the chat model.
            device: Device string for the chat model.
            embedding_device: Device string kept for the retriever embeddings.
            refine_device: Device string for the model used in ``refine_topics``.
        """
        super().__init__(model_path, device)
        self.embedding_device = embedding_device
        self.refine_device = refine_device

    def generate_topics(self, documents, example_topics, expert_dbs: List[str]):
        """Ask the model for a topic per document, seeded with example topics.

        Args:
            documents: Objects exposing ``page_content``.
            example_topics: Seed topics; copied, never mutated.
            expert_dbs: Paths of JSON expert databases to add to the vector store.

        Returns:
            A new list: the example topics followed by every model response.
        """
        topics = example_topics.copy()

        # Ingest each expert database exactly once. Doing this inside the
        # document loop re-read every file and re-embedded the same texts for
        # every document, filling the vector store with duplicates.
        for expert_db in expert_dbs:
            expert_data = self.load_single_expert_db(expert_db)
            self.update_vector_db(expert_data['documents'])

        for doc in documents:
            prompt = f"Document: {doc.page_content}\nExample Topics: {example_topics}\nGenerate a new topic if the document doesn't fit existing topics."
            # NOTE(review): one response per expert DB is kept so the output
            # size matches earlier runs, although the prompt does not depend
            # on the expert DB -- confirm whether one call per document is meant.
            for _ in expert_dbs:
                topics.append(self.generate_response(prompt))
        return topics

    @staticmethod
    def _is_rejection(response: str) -> bool:
        """Return True when a model reply starts with a clear "no" / "disagree"."""
        answer = response.strip().lower()
        if not answer:
            return False
        if answer.startswith("i disagree"):
            return True
        # Compare only the first word, minus punctuation, so "No." and
        # "No, because ..." count while "Not sure" or "None" do not.
        first_word = answer.split(maxsplit=1)[0].strip(".,;:!?\"'")
        return first_word in {"no", "disagree"}

    def refine_topics(self, topics, similarity_threshold: float = 0.5,
                      embedding_model_path: Optional[str] = None):
        """Drop near-duplicate topics, then let the chat model veto the rest.

        A topic is dropped when any *later* topic has cosine similarity at or
        above ``similarity_threshold``, so the last member of each similar
        group survives.

        Args:
            topics: Topic strings to refine.
            similarity_threshold: Cosine similarity at which two topics are
                considered redundant.
            embedding_model_path: Sentence-transformers model to use; defaults
                to ``DEFAULT_REFINE_MODEL``.

        Returns:
            The surviving topics, in their original order.
        """
        if not topics:
            # Nothing to compare; skip loading the embedding model.
            return []

        model = SentenceTransformer(embedding_model_path or self.DEFAULT_REFINE_MODEL,
                                    device=self.refine_device)
        topic_embeddings = model.encode(topics, convert_to_tensor=True)
        # One similarity matrix instead of a cos_sim call per topic pair.
        similarity = util.cos_sim(topic_embeddings, topic_embeddings)

        refined_topics = []
        for i, topic in enumerate(topics):
            if topic in refined_topics:
                continue
            # The slice is empty for the last topic, and ``any`` of an empty
            # tensor is False, so the last topic is always kept.
            if not bool((similarity[i, i + 1:] >= similarity_threshold).any()):
                refined_topics.append(topic)

        final_topics = []
        for topic in refined_topics:
            prompt = f"Topic: {topic}\nRefined Topics: {refined_topics}\nDo you agree this topic should be kept?"
            response = self.generate_response(prompt)
            if not self._is_rejection(response):
                final_topics.append(topic)

        return final_topics

    def assign_topics(self, documents, topics):
        """Ask the model to assign a topic (with a supporting quote) to each document.

        Returns:
            Dict mapping each document's ``page_content`` to the raw model
            response. Documents with identical text share one entry.
        """
        assignments = {}

        for doc in documents:
            prompt = f"Document: {doc.page_content}\nTopics: {topics}\nAssign the most relevant topic to the document and provide a quote."
            assignments[doc.page_content] = self.generate_response(prompt)

        return assignments

    def self_correct(self, assignments, topics: Optional[List[str]] = None):
        """Re-query the model for assignments whose text contains "None" or "Error".

        Args:
            assignments: Dict of document text to assignment text.
            topics: Optional list of valid topics. When given it is included
                in the correction prompt so the model can choose from it; when
                omitted the prompt is the same as before.

        Returns:
            A new dict with flagged entries replaced by the new response.
        """
        corrected_assignments = {}

        for doc, assignment in assignments.items():
            if "None" in assignment or "Error" in assignment:
                if topics is None:
                    prompt = f"Document: {doc}\nError: {assignment}\nPlease reassign a valid topic."
                else:
                    prompt = f"Document: {doc}\nError: {assignment}\nTopics: {topics}\nPlease reassign a valid topic."
                corrected_assignments[doc] = self.generate_response(prompt)
            else:
                corrected_assignments[doc] = assignment

        return corrected_assignments

    def load_single_expert_db(self, doc_path: str):
        """Load and return the parsed JSON content of one expert database file."""
        with open(doc_path, 'r', encoding='utf-8') as file:
            return json.load(file)

# Path settings
model_path = "/data1/dxw_data/llm/Qwen-VL-Chat"

# Instantiate and load model
qwen_model = TopicGPTWithQwen(model_path, device='cuda:0', embedding_device='cuda:4', refine_device='cuda:5')
qwen_model.load_model()

# Load retriever and vector database
doc_paths = [
    "/data1/dxw_data/llm/MKT_data_mining/LLM/agent/memory_step1.txt",
    "/data1/dxw_data/llm/MKT_data_mining/LLM/agent/memory_step2.txt",
    "/data1/dxw_data/llm/MKT_data_mining/LLM/agent/memory_step3.txt"
]
embedding_model_path = "/data1/dxw_data/llm/text2vec-large-chinese"
# Reuse the device stored on the instance instead of repeating the literal.
documents, example_topics = qwen_model.load_retriever(
    doc_paths, embedding_model_path, embedding_device=qwen_model.embedding_device
)

# Generate topics
generated_topics = qwen_model.generate_topics(documents, example_topics, doc_paths)
print("Generated Topics:")
print(generated_topics)

# Dynamically update expert database and vector database.
# The payload is the same for every file, so build it once.
# NOTE(review): `documents` were loaded from these same files, so appending
# them back grows every file on each run -- confirm this is intended.
new_data = {"documents": [doc.page_content for doc in documents], "example_topics": generated_topics}
for expert_db in doc_paths:
    qwen_model.append_to_expert_db(expert_db, new_data)
# Embed the documents once; doing it per file inserted len(doc_paths) copies
# of identical vectors into the store.
qwen_model.update_vector_db(new_data['documents'])

# Refine topics
refined_topics = qwen_model.refine_topics(generated_topics)
print("Refined Topics:")
print(refined_topics)

# Assign topics
assignments = qwen_model.assign_topics(documents, refined_topics)
print("Topic Assignments:")
for doc, assignment in assignments.items():
    print(f"Document: {doc}\nAssignment: {assignment}\n")

# Self-correct assignments
corrected_assignments = qwen_model.self_correct(assignments)
print("Corrected Topic Assignments:")
for doc, assignment in corrected_assignments.items():
    print(f"Document: {doc}\nCorrected Assignment: {assignment}\n")
    
# Helper that persists the assignment results
def save_results_to_file(file_path, assignments):
    """Write every document/assignment pair to ``file_path`` as UTF-8 text.

    Each entry is written as a ``Document:`` line, a ``Corrected Assignment:``
    line and a blank separator line. An existing file is overwritten.

    Args:
        file_path: Destination path of the text file.
        assignments: Mapping of document text to its assignment text.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(
            f"Document: {text}\nCorrected Assignment: {label}\n\n"
            for text, label in assignments.items()
        )
            
# Save the corrected assignments to a txt file
save_results_to_file("/data1/dxw_data/llm/MKT_data_mining/LLM/agent/results.txt", corrected_assignments)

2024-06-20 19:05:28.274040: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-20 19:05:28.427149: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-20 19:05:29.134282: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2024-06-20 19:05:29.134380: W tensorflow/compiler/xla/stream_exec

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

No sentence-transformers model found with name /data1/dxw_data/llm/text2vec-large-chinese. Creating a new one with MEAN pooling.


Generated Topics:
['Economy: Mentions policies, growth, and financial markets.', 'Agriculture: Discusses farming techniques, crop yields, and agricultural policies.', 'Technology: Focuses on tech companies, investments, and innovations.', 'Renewable Energy: Discusses investments and tech company focus.', 'Sustainable Farming: Mentions agricultural policies and practices.', 'Artificial Intelligence: Focuses on AI research and technological innovations.', 'Financial Markets: Analyzes reactions to economic policies.', 'AI Adoption: Details on tech firms adopting AI advancements.', 'Sustainable Farming Techniques: Explores modern sustainable farming methods.', 'Stock Market Response: Analysis of stock market reactions to policies.', 'Global Trade Agreements: Overview of new international trade agreements.', 'Technological Farming: Describes technology use in agriculture.', "'Financial Market Reactions to Economic Policies: A Comprehensive Analysis of the Stock Market's Response to Positive

In [None]:
    # +----------------------------+
    # | Load Data from memory.txt  |
    # | (Load JSON Data)           |
    # +------------+---------------+
    #              |
    #              v
    # +----------------------------+
    # | Convert to Document Object |
    # +------------+---------------+
    #              |
    #              v
    # +----------------------------+
    # | Split Documents into Chunks|
    # +------------+---------------+
    #              |
    #              v
    # +----------------------------+
    # | Create Embeddings          |
    # | (HuggingFaceEmbeddings)    |
    # +------------+---------------+
    #              |
    #              v
    # +----------------------------+
    # | Store in Chroma Database   |
    # +------------+---------------+
    #              |
    #              v
    # +----------------------------+
    # | Create Retriever           |
    # +----------------------------+
    #              |
    #              v
    # +-------------------------------------------------------------+
    # | Generate Topics                                              |
    # | (Iterate through documents, use prompt to generate topics)  |
    # +------------+------------------------------------------------+
    #              |
    #              v
    # +-------------------------------------------------------------+
    # | Refine Topics                                                |
    # | (Convert topics to embeddings, calculate cosine similarity, |
    # | remove redundant topics)                                     |
    # +------------+------------------------------------------------+
    #              |
    #              v
    # +-------------------------------------------------------------+
    # | Assign Topics                                                |
    # | (Iterate through documents, use prompt to assign topics)    |
    # +------------+------------------------------------------------+
    #              |
    #              v
    # +-------------------------------------------------------------+
    # | Self-Correct Assignments                                     |
    # | (Check for errors or None, reassign valid topics)           |
    # +------------+------------------------------------------------+
