In [1]:

# import
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings

from langchain import LLMChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModel, AutoConfig
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
from torch.mps import empty_cache
import torch
from langchain.chains import RetrievalQA

from langchain.document_loaders import UnstructuredFileLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.docstore.document import Document  # Ensure this is the correct import for your document class


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class GLM(LLM):
    """LangChain ``LLM`` wrapper around a locally loaded chat model.

    The model and tokenizer are not loaded at construction time; call
    :meth:`load_model` before using the instance in a chain. Generation is
    delegated to the underlying model's ``chat(tokenizer, prompt, ...)`` method.
    """

    # Passed to ``model.chat`` as ``max_length``.
    max_token: int = 2048
    # Sampling parameters forwarded to ``model.chat``.
    temperature: float = 0.5
    top_p: float = 0.9  # annotated like its sibling fields so pydantic treats it as a field
    # Populated by ``load_model``.
    tokenizer: object = None
    model: object = None
    # Number of trailing history entries forwarded to ``model.chat``; <= 0 disables history.
    history_len: int = 1024

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses to label this LLM implementation."""
        return "GLM"

    def load_model(self, llm_device="gpu", model_name_or_path=None):
        """Load the tokenizer and the half-precision model from a local path or hub id.

        Args:
            llm_device: Device string handed to ``AutoModel.from_pretrained``.
                The legacy default ``"gpu"`` keeps the notebook's original
                placement (``cuda:5``); any other value is used as given.
            model_name_or_path: Checkpoint directory or model id. Required.

        Raises:
            ValueError: If ``model_name_or_path`` is empty or ``None``.
        """
        if not model_name_or_path:
            raise ValueError("model_name_or_path must be provided to load_model()")

        # "gpu" is not a valid torch device string; map it to the device this
        # notebook has always used so existing calls behave exactly as before.
        device = "cuda:5" if llm_device == "gpu" else llm_device

        # trust_remote_code=True executes Python shipped with the checkpoint;
        # only point this at checkpoints you trust.
        model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(
            model_name_or_path,
            config=model_config,
            trust_remote_code=True,
            device=device,
        ).half()

    def _call(
        self,
        prompt: str,
        history: Optional[List[Any]] = None,
        stop: Optional[List[str]] = None,
        run_manager: Any = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt`` with the loaded chat model.

        Args:
            prompt: Text sent to the model.
            history: Prior conversation entries; only the last ``history_len``
                entries are forwarded. ``None`` means no history.
            stop: Optional stop sequences; the response is cut at the earliest
                occurrence of any of them.
            run_manager: Accepted for compatibility with LangChain's callback
                plumbing; not used here.
            **kwargs: Accepted for compatibility with the base-class signature;
                ignored.

        Returns:
            The model's response text.

        Raises:
            RuntimeError: If ``load_model`` has not been called yet.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("GLM model is not loaded; call load_model() first.")

        # ``None`` default instead of a shared mutable ``[]`` default.
        past_turns = history if history is not None else []
        recent_turns = past_turns[-self.history_len:] if self.history_len > 0 else []

        response, _ = self.model.chat(
            self.tokenizer,
            prompt,
            history=recent_turns,
            max_length=self.max_token,
            temperature=self.temperature,
            top_p=self.top_p,
        )

        if stop:
            # Honour stop sequences: keep only the text before the first match.
            cut_points = [pos for pos in (response.find(token) for token in stop if token) if pos >= 0]
            if cut_points:
                response = response[:min(cut_points)]
        return response

In [3]:
import sys

# Local checkpoint directory for the model wrapped by GLM.
modelpath = "/data1/dxw_data/llm/chatglm3-6b-128k"

# Presumably added so the checkpoint's bundled Python code is importable
# (TODO confirm). Guarded so re-running this cell does not keep appending
# duplicate entries to sys.path.
if modelpath not in sys.path:
    sys.path.append(modelpath)

llm = GLM()
llm.load_model(model_name_or_path = modelpath)
# --------------------------- At this point the model has been loaded successfully.

2024-06-06 15:05:26.281635: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-06 15:05:26.434876: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-06 15:05:26.992666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2024-06-06 15:05:26.992744: W tensorflow/compiler/xla/stream_exec

In [5]:
import os
import sys
import pandas as pd
from langchain.docstore.document import Document 

import re  # used only by this cell, to pull the two numeric labels out of the model's answer

# Prompt template: asks the model for a sentiment label (1-3) and the single
# best-matching topic label (1-11), answered as "[x, y]" with digits only.
# The format hint previously read "emtion"; the typo is corrected to "emotion".
prompt_template = """
请总结用户的评论的情感是“1-积极，2-消极，还是3-中性”。
并说明这个评论属于符合“[1-疾病与治疗，2-生命经历与生活态度，3-家庭关系与支持，4-情绪与心理状态，5-医疗检查与复查，6-经济负担与医疗费用，7-饮食与健康，8-症状与副作用，9-旅行经历与生活琐事，10-坚强与希望，11-社会与文化]”之中最符合的1个主题。


输出格式为：[“emotion” ,“topic”]。两部分组成。输出对应的数字序号即可。请直接输出上述[]，不用无关内容，不要汉字。举例输出的具体格式：[x, y]

TEXT:
{text}

"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMPT)


def _parse_labels(output_text):
    """Extract ``(emotion, topic)`` integers from a model answer such as ``"[1, 3]"``.

    Returns ``None`` for a label that is missing or outside the ranges the
    prompt defines (emotion 1-3, topic 1-11), so malformed answers show up as
    blanks in the spreadsheet rather than as bogus numbers.
    """
    numbers = [int(token) for token in re.findall(r"\d+", output_text or "")]
    if len(numbers) < 2:
        return None, None
    emotion, topic = numbers[0], numbers[1]
    return (
        emotion if 1 <= emotion <= 3 else None,
        topic if 1 <= topic <= 11 else None,
    )


# Read the comments from Excel.
input_excel_path = '/data1/dxw_data/llm/mkt_llm/mkt_medicine/data_肺癌.xlsx'  # replace with your input Excel path
df = pd.read_excel(input_excel_path)

# The comment text lives in the 'content' column.
comments = df['content'].tolist()

# Raw chain outputs, one entry per row (None for rows that were skipped).
results = []

# Classify every comment.
for i, comment in enumerate(comments):
    # Empty / NaN cells cannot be wrapped in a Document; skip them but keep
    # the list aligned with the DataFrame rows.
    if pd.isna(comment) or not str(comment).strip():
        print(f"skipping row {i}: empty comment")
        results.append(None)
        continue

    # Build the chain input; str() guards against numeric cells.
    input_data = {"input_documents": [Document(page_content=str(comment))]}

    # Run the model.
    result = chain(input_data, return_only_outputs=True)
    print("result:", result)
    results.append(result)
    print(f"------------------{i}次--------------------")

# One result per input row, otherwise the column assignment below would misalign.
assert len(results) == len(df)

# Excel writers cannot serialize dict cell values, so store the answer text
# (plus the parsed labels) rather than the raw result dicts.
output_texts = [res.get("output_text", "") if res else "" for res in results]
parsed_labels = [_parse_labels(text) for text in output_texts]

# Write the results back into the DataFrame.
df['results'] = output_texts
df['emotion'] = [labels[0] for labels in parsed_labels]
df['topic'] = [labels[1] for labels in parsed_labels]

# Save to a new Excel file.
output_excel_path = '/data1/dxw_data/llm/mkt_llm/mkt_medicine/comments_with_analysis2.xlsx'  # replace with your output Excel path
df.to_excel(output_excel_path, index=False)

result: {'output_text': '[1, 3]'}
------------------0次--------------------
result: {'output_text': '[1, 9]'}
------------------1次--------------------
result: {'output_text': '[1, 3]'}
------------------2次--------------------
result: {'output_text': '[2, 10]'}
------------------3次--------------------
result: {'output_text': '[1, 5]'}
------------------4次--------------------
result: {'output_text': '[1, 8]'}
------------------5次--------------------
result: {'output_text': '[1, 8]'}
------------------6次--------------------
result: {'output_text': '[1, 9]'}
------------------7次--------------------
result: {'output_text': '[1, 8]'}
------------------8次--------------------
result: {'output_text': '[1, 2]'}
------------------9次--------------------
result: {'output_text': '[2, 10]'}
------------------10次--------------------
result: {'output_text': '[1, 3]'}
------------------11次--------------------
result: {'output_text': '[1, 8]'}
------------------12次--------------------
result: {'output_tex