In [None]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel, AutoConfig
from typing import List, Optional

# Define the GLM class
class GLM:
    """Wrapper around a chat LLM used to generate, refine and assign topics.

    The wrapped model is loaded through ``transformers.AutoModel`` with
    ``trust_remote_code=True`` and must expose a
    ``chat(tokenizer, prompt, history=..., max_length=..., temperature=...,
    top_p=...)`` method that returns a ``(response, history)`` pair.
    """

    # Generation settings forwarded to ``model.chat`` on every call.
    max_token: int = 2048
    temperature: float = 0.8
    top_p: float = 0.9
    # Populated by ``load_model``.
    tokenizer: object = None
    model: object = None
    # Number of trailing history entries forwarded to the model (<= 0 disables).
    history_len: int = 1024

    def __init__(self):
        pass

    @property
    def _llm_type(self) -> str:
        """Return the identifier string of this LLM wrapper."""
        return "GLM"

    def load_model(self, llm_device="gpu", model_name_or_path=None):
        """Load the tokenizer and the half-precision model from a path.

        Args:
            llm_device: Accepted for interface compatibility only.
                NOTE(review): this value is not used; the model is always
                placed on ``'cuda:5'``. Confirm the intended device handling
                before wiring this parameter through.
            model_name_or_path: Model identifier or local directory passed to
                the ``from_pretrained`` loaders.

        Raises:
            ValueError: If ``model_name_or_path`` is not provided.
        """
        if model_name_or_path is None:
            # Fail early with a clear message instead of an opaque loader error.
            raise ValueError("model_name_or_path must be provided")
        model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(
            model_name_or_path,
            config=model_config,
            trust_remote_code=True,
            device='cuda:5',
        ).half()

    def _call(self, prompt: str, history: Optional[List[str]] = None, stop: Optional[List[str]] = None):
        """Send ``prompt`` to the model and return its response.

        Args:
            prompt: Text sent to the model.
            history: Prior conversation entries; only the last
                ``history_len`` entries are forwarded. Defaults to no history.
            stop: Accepted for interface compatibility; currently unused.

        Returns:
            The first element of the pair returned by ``model.chat``.
        """
        # Avoid a shared mutable default: build a fresh list per call.
        if history is None:
            history = []
        recent_history = history[-self.history_len:] if self.history_len > 0 else []
        response, _ = self.model.chat(
            self.tokenizer, prompt,
            history=recent_history,
            max_length=self.max_token, temperature=self.temperature,
            top_p=self.top_p
        )
        return response

    def generate_topics(self, representative_texts, example_topics, all_vectors, all_texts,
                        representative_vectors=None, k=5):
        """Ask the model for one topic summary per representative text.

        For each representative, the ``k`` entries of ``all_vectors`` with the
        highest cosine similarity to its vector are looked up, the texts at
        the same positions in ``all_texts`` are joined with spaces, and the
        model is asked to summarise them.

        Args:
            representative_texts: Representative texts; only their count is
                used (it bounds the number of topics generated).
            example_topics: Accepted for interface compatibility; currently
                unused.
            all_vectors: Corpus vectors that are searched for neighbours.
                Must be aligned index-for-index with ``all_texts``.
            all_texts: Texts corresponding to ``all_vectors``.
            representative_vectors: Query vectors, one per representative
                text. When omitted, the leading rows of ``all_vectors`` are
                used as the queries (legacy behaviour).
            k: Number of nearest texts included in each prompt.

        Returns:
            A list with one model response per processed representative.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            # ``[-0:]`` would silently select every entry instead of none.
            raise ValueError("k must be at least 1")

        query_vectors = all_vectors if representative_vectors is None else representative_vectors

        topics = []
        # zip() keeps the original bound: at most one topic per representative text.
        for _, query_vector in zip(representative_texts, query_vectors):
            similarities = cosine_similarity([query_vector], all_vectors)[0]
            # argsort is ascending: take the last k and reverse for best-first.
            top_indices = similarities.argsort()[-k:][::-1]
            top_texts = [all_texts[i] for i in top_indices]
            # Prompt (Chinese): "Please summarise the topic of these review data".
            prompt = f"请概括这些评论数据的主题: {' '.join(top_texts)}"
            topics.append(self._call(prompt))
        return topics

    def refine_topics(self, topics, threshold=0.5,
                      model_path='/data1/dxw_data/llm/paraphrase-multilingual-MiniLM-L12-v2'):
        """Drop topics that are near-duplicates of a later topic.

        A topic is kept only if it is not already in the result and no topic
        after it in ``topics`` has cosine similarity >= ``threshold`` with it,
        so the last member of each group of similar topics survives.

        Args:
            topics: Topic strings to de-duplicate.
            threshold: Cosine similarity at or above which two topics are
                treated as duplicates.
            model_path: Path of the SentenceTransformer used for embeddings.

        Returns:
            The retained topics, in their original order.
        """
        if not topics:
            # Nothing to compare; also avoids loading the encoder needlessly.
            return []

        model = SentenceTransformer(model_path)
        topic_embeddings = model.encode(topics, convert_to_tensor=True)
        # One pairwise similarity matrix instead of a cos_sim call per pair.
        similarity = util.cos_sim(topic_embeddings, topic_embeddings)

        refined_topics = []
        topic_count = len(topics)
        for i, topic in enumerate(topics):
            if topic in refined_topics:
                continue
            has_similar_successor = any(
                similarity[i][j] >= threshold for j in range(i + 1, topic_count)
            )
            if not has_similar_successor:
                refined_topics.append(topic)
        return refined_topics

    def assign_topics(self, documents, topics):
        """Ask the model to pick the most relevant topic for each document.

        Args:
            documents: Iterable of document strings; each is used as a key of
                the result, so repeated documents overwrite earlier entries.
            topics: Candidate topics, interpolated into the prompt as-is.

        Returns:
            A dict mapping each document to the model's response.
        """
        assignments = {}
        for doc in documents:
            prompt = f"Document: {doc}\nTopics: {topics}\nAssign the most relevant topic to the document and provide a quote."
            assignments[doc] = self._call(prompt)
        return assignments

    def self_correct(self, assignments):
        """Re-query the model for assignments that look invalid.

        An assignment is considered invalid when its text contains the
        substring ``"None"`` or ``"Error"``; all others are passed through.

        Args:
            assignments: Dict mapping document to assignment text.

        Returns:
            A new dict with the same keys and corrected assignment texts.
        """
        corrected_assignments = {}
        for doc, assignment in assignments.items():
            if "None" in assignment or "Error" in assignment:
                prompt = f"Document: {doc}\nError: {assignment}\nPlease reassign a valid topic."
                corrected_assignments[doc] = self._call(prompt)
            else:
                corrected_assignments[doc] = assignment
        return corrected_assignments

# Load representative texts and clustering results from their JSON files.
representative_file_path = "/data1/dxw_data/llm/RAG-mkt-kmeans/data/representative_texts.json"
clustering_results_file_path = "/data1/dxw_data/llm/RAG-mkt-kmeans/data/clustering_results.json"

with open(representative_file_path, 'r', encoding='utf-8') as f:
    representative_texts = json.load(f)

with open(clustering_results_file_path, 'r', encoding='utf-8') as f:
    clustering_results = json.load(f)

# Load the pre-trained sentence-embedding model used to vectorize the texts.
model_path = '/data1/dxw_data/llm/text2vec-large-chinese'
model = SentenceTransformer(model_path)

# Encode the representative texts and every clustered text into vectors.
# The clustering results are expected to provide the texts under the 'text' key.
representative_vectors = model.encode(representative_texts)
all_texts = clustering_results['text']
all_vectors = model.encode(all_texts)

# Initialize GLM and load the chat model.
# The model directory is appended to sys.path before loading
# (presumably so the model's bundled code can be imported — TODO confirm).
import sys
modelpath = "/data1/dxw_data/llm/chatglm3-6b"
sys.path.append(modelpath)
llm = GLM()
llm.load_model(model_name_or_path=modelpath)

# Example topics (Chinese): "clean environment", "attentive service",
# "fresh dishes", "expensive".
example_topics = ["环境干净", "服务周到", "菜品新鲜", "价格贵"]

# Generate topics based on top 5 closest texts for each representative text.
# NOTE(review): `representative_vectors` is passed in the `all_vectors`
# position, so generate_topics searches for neighbours among the
# representative vectors while using the resulting indices to look up
# `all_texts`; the `all_vectors` computed above is never used. Confirm whether
# the neighbour search was meant to run against `all_vectors`.
generated_topics = llm.generate_topics(representative_texts, example_topics, representative_vectors, all_texts)

# Refine the generated topics by removing near-duplicates.
refined_topics = llm.refine_topics(generated_topics)

# Assign one of the refined topics to each representative text.
assignments = llm.assign_topics(representative_texts, refined_topics)

# Self-correct assignments whose text contains "None" or "Error".
corrected_assignments = llm.self_correct(assignments)

# Save every intermediate and final result to a single JSON file.
result = {
    "generated_topics": generated_topics,
    "refined_topics": refined_topics,
    "assignments": assignments,
    "corrected_assignments": corrected_assignments
}
result_file_path = "/data1/dxw_data/llm/RAG-mkt-kmeans/data/refine_results.json"
with open(result_file_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    json.dump(result, f, ensure_ascii=False, indent=4)

print("Processing complete. refine_results saved to:", result_file_path)
