In [1]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from typing import List, Optional

# Define the GLM class
class GLM:
    """Minimal wrapper around a chat model loaded through ``transformers``.

    The wrapped model must expose ``chat(tokenizer, prompt, history=...,
    max_length=..., temperature=..., top_p=...)`` and return a two-item
    result whose first item is the response; the second item is discarded.
    """

    # Generation settings forwarded to ``model.chat`` on every call.
    max_token: int = 2048  # passed to ``chat`` as ``max_length``
    temperature: float = 0.8
    top_p = 0.9
    # Both are populated by ``load_model``.
    tokenizer: object = None
    model: object = None
    # Number of trailing history entries forwarded to the model; a value
    # <= 0 means no history is sent at all.
    history_len: int = 1024
    # Device string handed to ``AutoModel.from_pretrained``. Override it on
    # the class or on an instance before calling ``load_model`` to target a
    # different device.
    device: str = 'cuda:5'

    def __init__(self):
        pass

    @property
    def _llm_type(self) -> str:
        """Identifier string for this wrapper."""
        return "GLM"

    def load_model(self, llm_device="gpu", model_name_or_path=None):
        """Load the tokenizer and the half-precision model.

        Args:
            llm_device: Accepted for backward compatibility but not
                consulted; placement is controlled by ``self.device``.
            model_name_or_path: Location passed to every ``from_pretrained``
                call (config, tokenizer and model).
        """
        from transformers import AutoConfig, AutoTokenizer, AutoModel
        model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(
            model_name_or_path,
            config=model_config,
            trust_remote_code=True,
            device=self.device,
        ).half()

    def _call(self, prompt: str, history: Optional[List[str]] = None, stop: Optional[List[str]] = None):
        """Send ``prompt`` to the loaded model and return its response.

        Args:
            prompt: Text passed to ``model.chat``.
            history: Prior conversation entries; only the last
                ``history_len`` of them are forwarded. ``None`` (the
                default) means an empty history.
            stop: Accepted for interface compatibility; currently unused.

        Returns:
            The first item of the result returned by ``model.chat``.
        """
        # A ``None`` sentinel avoids sharing one default list between calls.
        if history is None:
            history = []
        recent_history = history[-self.history_len:] if self.history_len > 0 else []
        response, _ = self.model.chat(
            self.tokenizer, prompt,
            history=recent_history,
            max_length=self.max_token, temperature=self.temperature,
            top_p=self.top_p
        )
        return response

# Load representative texts and clustering results
# Both input files live in one dataset directory. Change only this line
# (e.g. to ".../data1" or ".../data2") to process another dataset, so the two
# paths below can never point at different datasets.
dataset_dir = "/data1/dxw_data/llm/RAG-mkt-kmeans/data3"
representative_file_path = f"{dataset_dir}/representative_texts.json"
clustering_results_file_path = f"{dataset_dir}/clustering_results.json"

with open(representative_file_path, 'r', encoding='utf-8') as f:
    representative_texts = json.load(f)

with open(clustering_results_file_path, 'r', encoding='utf-8') as f:
    clustering_results = json.load(f)

# Load the sentence-embedding model from a local directory.
# NOTE(review): the recorded cell output shows sentence-transformers reporting
# that no native model was found at this path and that it created one with
# MEAN pooling — confirm that this pooling is what the model expects.
model_path = '/data1/dxw_data/llm/text2vec-large-chinese'
model = SentenceTransformer(model_path)

# Embed the representative texts and every text from the clustering results.
representative_vectors = model.encode(representative_texts)
# presumably 'text' holds the list of all clustered texts — verify against the file schema
all_texts = clustering_results['text']
all_vectors = model.encode(all_texts)

# Function to find the k closest texts (default 5) for one representative text
def find_top_k_closest(representative_vector, all_vectors, k=5):
    """Return indices of the ``k`` rows of ``all_vectors`` most similar to a vector.

    Similarity is cosine similarity, computed with
    ``sklearn.metrics.pairwise.cosine_similarity``.

    Args:
        representative_vector: 1-D query vector.
        all_vectors: 2-D collection of candidate vectors, one per row.
        k: Number of indices to return. If ``k`` exceeds the number of rows,
            all row indices are returned. ``k <= 0`` returns an empty array.

    Returns:
        A NumPy array of row indices ordered from most to least similar.
    """
    # Without this guard ``[-0:]`` would select the *whole* array, so k=0
    # would return every index instead of none.
    if k <= 0:
        return np.empty(0, dtype=np.intp)
    similarities = cosine_similarity([representative_vector], all_vectors)[0]
    # argsort is ascending: take the last k entries and reverse them.
    top_k_indices = similarities.argsort()[-k:][::-1]
    return top_k_indices

# Initialize GLM and load model
import sys
modelpath = "/data1/dxw_data/llm/chatglm3-6b"
# Re-running this cell must not keep appending the same entry to sys.path.
if modelpath not in sys.path:
    sys.path.append(modelpath)
llm = GLM()
llm.load_model(model_name_or_path=modelpath)

# Function to summarize themes using GLM
def summarize_themes(texts, llm, prompt_prefix="请概括这些评论数据的主题: "):
    """Ask the LLM about a group of texts and return its reply.

    Args:
        texts: Iterable of strings; they are joined with single spaces and
            appended to ``prompt_prefix``.
        llm: Object exposing ``_call(query)``.
        prompt_prefix: Instruction placed before the joined texts. The
            default (Chinese) asks the model to summarize the themes of
            these review comments. Pass a different prefix to reuse this
            function for other questions instead of copying it.

    Returns:
        Whatever ``llm._call`` returns for the assembled query.
    """
    query = prompt_prefix + " ".join(texts)
    return llm._call(query)

# For every representative text: look up its nearest neighbours among all
# texts, have the LLM summarize them, and keep one record per representative.
summaries = []
for rep_text, rep_vector in zip(representative_texts, representative_vectors):
    top_indices = find_top_k_closest(rep_vector, all_vectors)
    top_texts = [all_texts[idx] for idx in top_indices]
    summary = summarize_themes(top_texts, llm)
    summaries.append(
        dict(representative_text=rep_text, summary=summary, top_texts=top_texts)
    )

# Save summaries to a JSON file
import os

# Write next to the inputs, so the output always lands in the same dataset
# directory that was read instead of relying on a separately edited path.
summary_file_path = os.path.join(os.path.dirname(representative_file_path), "summaries.json")
with open(summary_file_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(summaries, f, ensure_ascii=False, indent=4)

print("Clustering and summarization complete. Summaries saved to:", summary_file_path)


No sentence-transformers model found with name /data1/dxw_data/llm/text2vec-large-chinese. Creating a new one with MEAN pooling.
2024-06-22 18:19:15.577810: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-22 18:19:15.725971: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-22 18:19:16.313424: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Clustering and summarization complete. Summaries saved to: /data1/dxw_data/llm/RAG-mkt-kmeans/data3/summaries.json


In [None]:
# -------------- Alternative prompt: ask the model for improvement suggestions

In [1]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from typing import List, Optional

# Define the GLM class
class GLM:
    """Minimal wrapper around a chat model loaded through ``transformers``.

    The wrapped model must expose ``chat(tokenizer, prompt, history=...,
    max_length=..., temperature=..., top_p=...)`` and return a two-item
    result whose first item is the response; the second item is discarded.
    """

    # Generation settings forwarded to ``model.chat`` on every call.
    max_token: int = 2048  # passed to ``chat`` as ``max_length``
    temperature: float = 0.8
    top_p = 0.9
    # Both are populated by ``load_model``.
    tokenizer: object = None
    model: object = None
    # Number of trailing history entries forwarded to the model; a value
    # <= 0 means no history is sent at all.
    history_len: int = 1024
    # Device string handed to ``AutoModel.from_pretrained``. Override it on
    # the class or on an instance before calling ``load_model`` to target a
    # different device.
    device: str = 'cuda:5'

    def __init__(self):
        pass

    @property
    def _llm_type(self) -> str:
        """Identifier string for this wrapper."""
        return "GLM"

    def load_model(self, llm_device="gpu", model_name_or_path=None):
        """Load the tokenizer and the half-precision model.

        Args:
            llm_device: Accepted for backward compatibility but not
                consulted; placement is controlled by ``self.device``.
            model_name_or_path: Location passed to every ``from_pretrained``
                call (config, tokenizer and model).
        """
        from transformers import AutoConfig, AutoTokenizer, AutoModel
        model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(
            model_name_or_path,
            config=model_config,
            trust_remote_code=True,
            device=self.device,
        ).half()

    def _call(self, prompt: str, history: Optional[List[str]] = None, stop: Optional[List[str]] = None):
        """Send ``prompt`` to the loaded model and return its response.

        Args:
            prompt: Text passed to ``model.chat``.
            history: Prior conversation entries; only the last
                ``history_len`` of them are forwarded. ``None`` (the
                default) means an empty history.
            stop: Accepted for interface compatibility; currently unused.

        Returns:
            The first item of the result returned by ``model.chat``.
        """
        # A ``None`` sentinel avoids sharing one default list between calls.
        if history is None:
            history = []
        recent_history = history[-self.history_len:] if self.history_len > 0 else []
        response, _ = self.model.chat(
            self.tokenizer, prompt,
            history=recent_history,
            max_length=self.max_token, temperature=self.temperature,
            top_p=self.top_p
        )
        return response

# Load representative texts and clustering results
# Both input files live in one dataset directory. Change only this line
# (e.g. to ".../data1" or ".../data2") to process another dataset, so the two
# paths below can never point at different datasets.
dataset_dir = "/data1/dxw_data/llm/RAG-mkt-kmeans/data3"
representative_file_path = f"{dataset_dir}/representative_texts.json"
clustering_results_file_path = f"{dataset_dir}/clustering_results.json"

with open(representative_file_path, 'r', encoding='utf-8') as f:
    representative_texts = json.load(f)

with open(clustering_results_file_path, 'r', encoding='utf-8') as f:
    clustering_results = json.load(f)

# Load the sentence-embedding model from a local directory.
# NOTE(review): the recorded cell output shows sentence-transformers reporting
# that no native model was found at this path and that it created one with
# MEAN pooling — confirm that this pooling is what the model expects.
model_path = '/data1/dxw_data/llm/text2vec-large-chinese'
model = SentenceTransformer(model_path)

# Embed the representative texts and every text from the clustering results.
representative_vectors = model.encode(representative_texts)
# presumably 'text' holds the list of all clustered texts — verify against the file schema
all_texts = clustering_results['text']
all_vectors = model.encode(all_texts)

# Function to find the k closest texts (default 5) for one representative text
def find_top_k_closest(representative_vector, all_vectors, k=5):
    """Return indices of the ``k`` rows of ``all_vectors`` most similar to a vector.

    Similarity is cosine similarity, computed with
    ``sklearn.metrics.pairwise.cosine_similarity``.

    Args:
        representative_vector: 1-D query vector.
        all_vectors: 2-D collection of candidate vectors, one per row.
        k: Number of indices to return. If ``k`` exceeds the number of rows,
            all row indices are returned. ``k <= 0`` returns an empty array.

    Returns:
        A NumPy array of row indices ordered from most to least similar.
    """
    # Without this guard ``[-0:]`` would select the *whole* array, so k=0
    # would return every index instead of none.
    if k <= 0:
        return np.empty(0, dtype=np.intp)
    similarities = cosine_similarity([representative_vector], all_vectors)[0]
    # argsort is ascending: take the last k entries and reverse them.
    top_k_indices = similarities.argsort()[-k:][::-1]
    return top_k_indices

# Initialize GLM and load model
import sys
modelpath = "/data1/dxw_data/llm/chatglm3-6b"
# Re-running this cell must not keep appending the same entry to sys.path.
if modelpath not in sys.path:
    sys.path.append(modelpath)
llm = GLM()
llm.load_model(model_name_or_path=modelpath)

# Function to ask GLM for improvement suggestions based on a group of texts
def summarize_themes(texts, llm, prompt_prefix="请根据这些评论主题，说明还可以改进哪些地方: "):
    """Ask the LLM about a group of texts and return its reply.

    Args:
        texts: Iterable of strings; they are joined with single spaces and
            appended to ``prompt_prefix``.
        llm: Object exposing ``_call(query)``.
        prompt_prefix: Instruction placed before the joined texts. The
            default (Chinese) asks the model to explain, based on these
            review themes, what could still be improved. Pass a different
            prefix to reuse this function for other questions instead of
            copying it.

    Returns:
        Whatever ``llm._call`` returns for the assembled query.
    """
    query = prompt_prefix + " ".join(texts)
    return llm._call(query)

# For every representative text: look up its nearest neighbours among all
# texts, send them to the LLM, and keep one record per representative.
summaries = []
for rep_text, rep_vector in zip(representative_texts, representative_vectors):
    top_indices = find_top_k_closest(rep_vector, all_vectors)
    top_texts = [all_texts[idx] for idx in top_indices]
    summary = summarize_themes(top_texts, llm)
    summaries.append(
        dict(representative_text=rep_text, summary=summary, top_texts=top_texts)
    )

# Save the improvement suggestions to a JSON file
import os

# Write next to the inputs, so the output always lands in the same dataset
# directory that was read instead of relying on a separately edited path.
summary_file_path = os.path.join(os.path.dirname(representative_file_path), "improve.json")
with open(summary_file_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(summaries, f, ensure_ascii=False, indent=4)

print("Clustering and summarization complete. Summaries saved to:", summary_file_path)


No sentence-transformers model found with name /data1/dxw_data/llm/text2vec-large-chinese. Creating a new one with MEAN pooling.
2024-06-22 18:21:25.341912: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-22 18:21:25.492432: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-22 18:21:26.105625: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Clustering and summarization complete. Summaries saved to: /data1/dxw_data/llm/RAG-mkt-kmeans/data3/improve.json
