In [1]:
import os
import json
import pandas as pd
from typing import List, Optional
from transformers import AutoConfig, AutoTokenizer, AutoModel
from tqdm import tqdm
import sys

class GLM:
    max_token: int = 2048
    temperature: float = 0.8
    top_p: float = 0.9
    tokenizer: object = None
    model: object = None
    history_len: int = 1024

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "GLM"

    def load_model(self, llm_device="gpu", model_name_or_path=None):
        model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(model_name_or_path, config=model_config, trust_remote_code=True, device='cuda:2').half()

    def _call(self, prompt: str, history: List[str] = [], stop: Optional[List[str]] = None):
        response, _ = self.model.chat(
            self.tokenizer, prompt,
            history=history[-self.history_len:] if self.history_len > 0 else [],
            max_length=self.max_token, temperature=self.temperature,
            top_p=self.top_p)
        return response

modelpath = "/data1/dxw_data/llm/chatglm3-6b-128k"
sys.path.append(modelpath)
llm = GLM()
llm.load_model(model_name_or_path=modelpath)

2024-07-14 16:51:48.867604: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-14 16:51:49.095334: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-14 16:51:50.072919: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2024-07-14 16:51:50.073008: W tensorflow/compiler/xla/stream_exec

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [33]:
# 读取CSV文件
file_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/filtered_companies.csv'
df = pd.read_csv(file_path)

# 创建Prompt模板
def create_prompt(incident_category, matched_summary):
    return (
        f"""<Incident Category>: {incident_category}
        <Matched Summary>: {matched_summary}
        
        First, summarize the main events described in the <matched summary>. Then analyze the correlation between the <incident category> and the <matched summary>. If all keywords in the incident category are fully addressed in the summary, return [3.0]. If some keywords are addressed but others are not, return [2.0]. If only a few or no keywords are addressed, return [1.0]. The analysis must strictly require all keywords to be addressed to return [3.0]. ### Please remember to give a final score!!!

        Examples:

        <Incident Category>: corruption, bribery, extortion and money laundering
        <Matched Summary>: The CEO of the company was implicated in fraudulent financial activities and unethical dealings. Several top executives were also found to be involved in these schemes. The investigation is currently underway.
        Output: The main events described in the summary are the involvement of the CEO and several top executives in fraudulent financial activities and unethical dealings, and the ongoing investigation. All keywords in the incident category are addressed in the summary. 
        ## Therefore, the final score is [3.0].


        <Incident Category>: environmental issues, supply chain issues
        <Matched Summary>: The company has been reported for contributing to significant environmental damage through its supply chain practices. Multiple sources have confirmed that the company's facilities are releasing harmful pollutants into nearby water sources.
        Output: The main event described in the summary is the company's role in causing environmental damage through its supply chain practices. Some keywords in the incident category are addressed in the summary, but others are not.
        ## Therefore, the final score is [2.0].


        <Incident Category>: occupational health and safety issues
        <Matched Summary>: The company reported a significant increase in quarterly profits, with revenue rising by 20% compared to the previous year.
        Output: The main event described in the summary is the company's financial performance. None of the keywords in the incident category are addressed in the summary. 
        ## Therefore, the final score is [1.0].
        """
    )

# 设置进度文件路径
progress_file_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/filtered_companies_with_judge.json'

# 结果输出文件路径
output_txt_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/results.txt'

# 读取进度文件
if os.path.exists(progress_file_path):
    with open(progress_file_path, 'r') as progress_file:
        progress = json.load(progress_file)
else:
    progress = {'last_processed_index': -1}

# 对每一行数据进行处理
results = []
start_index = progress['last_processed_index'] + 1
save_interval = 10  # 每处理10行保存一次
counter = 0  # 计数器

with open(output_txt_path, 'a', encoding='utf-8') as result_file:  # 以UTF-8编码打开结果输出文件
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows", initial=start_index):
        if index < start_index:
            continue

        incident_category = row['incident_category']
        matched_summary = row['matched_summary_1']
        
        prompt = create_prompt(incident_category, matched_summary)
        result = llm._call(prompt)
        result_file.write(f"Index: {index}\n{result}\n\n")  # 写入结果到文本文件
        
        # 提取模型返回的结果
        if '[3.0]' in result:
            ll_judge = 3.0
        elif '[2.0]' in result:
            ll_judge = 2.0
        elif '[1.0]' in result:
            ll_judge = 1.0
        else:
            ll_judge = None  # 如果模型的返回值不在预期范围内
        
        results.append((index, ll_judge, result))  # 保存分析结果
        counter += 1
        
        # 每处理10行保存一次结果
        if counter >= save_interval:
            for idx, judge, analysis in results:
                df.at[idx, 'll_judge'] = judge
                df.at[idx, 'll_analysis'] = analysis  # 保存分析结果
            
            # 保存结果到CSV文件
            output_file_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/filtered_companies_with_judge.csv'
            df.to_csv(output_file_path, index=False)
            
            # 更新进度
            progress['last_processed_index'] = index
            with open(progress_file_path, 'w') as progress_file:
                json.dump(progress, progress_file)
            
            # 重置计数器和结果列表
            counter = 0
            results = []

    # 处理剩余的结果
    for idx, judge, analysis in results:
        df.at[idx, 'll_judge'] = judge
        df.at[idx, 'll_analysis'] = analysis  # 保存分析结果

    # 保存最终结果到CSV文件
    output_file_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/filtered_companies_with_judge.csv'
    df.to_csv(output_file_path, index=False)

    # 更新进度文件
    progress['last_processed_index'] = df.shape[0] - 1
    with open(progress_file_path, 'w') as progress_file:
        json.dump(progress, progress_file)

print(f"Results saved to {output_file_path}")

Processing rows: 1872it [4:35:46,  8.84s/it]                          

Results saved to /data1/dxw_data/llm/RA/hku_ivy/esg2/llm/filtered_companies_with_judge.csv





In [2]:
# 读取CSV文件
file_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/final_match.csv'
df = pd.read_csv(file_path)

# 创建Prompt模板
def create_prompt(incident_category, matched_summary):
    return (
        f"""<Incident Category>: {incident_category}
        <Matched Summary>: {matched_summary}
        
        First, summarize the main events described in the <matched summary>. Then analyze the correlation between the <incident category> and the <matched summary>. If all keywords in the incident category are fully addressed in the summary, return [3.0]. If some keywords are addressed but others are not, return [2.0]. If only a few or no keywords are addressed, return [1.0]. The analysis must strictly require all keywords to be addressed to return [3.0]. ### Please remember to give a final score!!!

        Examples:

        <Incident Category>: corruption, bribery, extortion and money laundering
        <Matched Summary>: The CEO of the company was implicated in fraudulent financial activities and unethical dealings. Several top executives were also found to be involved in these schemes. The investigation is currently underway.
        Output: The main events described in the summary are the involvement of the CEO and several top executives in fraudulent financial activities and unethical dealings, and the ongoing investigation. All keywords in the incident category are addressed in the summary. 
        ## Therefore, the final score is [3.0].


        <Incident Category>: environmental issues, supply chain issues
        <Matched Summary>: The company has been reported for contributing to significant environmental damage through its supply chain practices. Multiple sources have confirmed that the company's facilities are releasing harmful pollutants into nearby water sources.
        Output: The main event described in the summary is the company's role in causing environmental damage through its supply chain practices. Some keywords in the incident category are addressed in the summary, but others are not.
        ## Therefore, the final score is [2.0].


        <Incident Category>: occupational health and safety issues
        <Matched Summary>: The company reported a significant increase in quarterly profits, with revenue rising by 20% compared to the previous year.
        Output: The main event described in the summary is the company's financial performance. None of the keywords in the incident category are addressed in the summary. 
        ## Therefore, the final score is [1.0].
        """
    )

# 设置进度文件路径
progress_file_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/filtered_companies_with_judge.json'

# 结果输出文件路径
output_txt_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/results2.txt'

# 读取进度文件
if os.path.exists(progress_file_path):
    with open(progress_file_path, 'r') as progress_file:
        progress = json.load(progress_file)
else:
    progress = {'last_processed_index': -1}

# 对每一行数据进行处理
results = []
start_index = progress['last_processed_index'] + 1
save_interval = 10  # 每处理10行保存一次
counter = 0  # 计数器

with open(output_txt_path, 'a', encoding='utf-8') as result_file:  # 以UTF-8编码打开结果输出文件
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows", initial=start_index):
        if index < start_index:
            continue

        incident_category = row['incident_category']
        matched_summary = row['matched_summary_1']
        
        prompt = create_prompt(incident_category, matched_summary)
        result = llm._call(prompt)
        result_file.write(f"Index: {index}\n{result}\n\n")  # 写入结果到文本文件
        
        # 提取模型返回的结果
        if '[3.0]' in result:
            ll_judge = 3.0
        elif '[2.0]' in result:
            ll_judge = 2.0
        elif '[1.0]' in result:
            ll_judge = 1.0
        else:
            ll_judge = None  # 如果模型的返回值不在预期范围内
        
        results.append((index, ll_judge, result))  # 保存分析结果
        counter += 1
        
        # 每处理10行保存一次结果
        if counter >= save_interval:
            for idx, judge, analysis in results:
                df.at[idx, 'll_judge'] = judge
                df.at[idx, 'll_analysis'] = analysis  # 保存分析结果
            
            # 保存结果到CSV文件
            output_file_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/final_match.csv'
            df.to_csv(output_file_path, index=False)
            
            # 更新进度
            progress['last_processed_index'] = index
            with open(progress_file_path, 'w') as progress_file:
                json.dump(progress, progress_file)
            
            # 重置计数器和结果列表
            counter = 0
            results = []

    # 处理剩余的结果
    for idx, judge, analysis in results:
        df.at[idx, 'll_judge'] = judge
        df.at[idx, 'll_analysis'] = analysis  # 保存分析结果

    # 保存最终结果到CSV文件
    output_file_path = '/data1/dxw_data/llm/RA/hku_ivy/esg2/llm/final_match.csv'
    df.to_csv(output_file_path, index=False)

    # 更新进度文件
    progress['last_processed_index'] = df.shape[0] - 1
    with open(progress_file_path, 'w') as progress_file:
        json.dump(progress, progress_file)

print(f"Results saved to {output_file_path}")

Processing rows:   4%|▍         | 25/617 [05:26<1:50:42, 11.22s/it]