In [1]:
import json
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain import PromptTemplate, LLMChain
from langchain.llms.base import LLM
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.mapreduce import MapReduceChain

from transformers import AutoTokenizer, AutoModel, AutoConfig

from sklearn.metrics.pairwise import cosine_similarity

In [6]:
embeddings = HuggingFaceEmbeddings(
            model_name="GanymedeNil/text2vec-large-chinese",
            model_kwargs={'device': 'cuda'})

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese. Creating a new one with MEAN pooling.


In [7]:
files_names = glob.glob('/root/autodl-tmp/CSV/*.csv')

In [8]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 2048,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

In [9]:
for i in range(len(files_names)):
    if '深圳高速公路' in files_names[i] and '2019' in files_names[i]:
        print(i)
        print(files_names[i])

4623
/root/autodl-tmp/CSV/2020-03-19__深圳高速公路集团股份有限公司__600548__深高速__2019年__年度报告.csv


In [10]:
df_ = pd.read_csv(files_names[4623])

In [None]:
# keys = df_['chapter'].unique()
# similarity = []
# vector_b = np.array(embeddings.embed_query(x))
# for i in keys:
#     vector_a = np.array(embeddings.embed_query(i))
#     s = cosine_similarity(vector_a.reshape(1, -1), vector_b.reshape(1, -1))[0][0]
#     similarity.append(s)
# max_value = max(similarity)
# max_index = similarity.index(max_value)
# key = keys[max_index]
# content = [''.join(df_.loc[df_.chapter == key, 'inside'])]

In [None]:
# [''.join(df_.loc[df_.chapter == keys[-3], 'inside'])]

In [None]:
# similarity

In [None]:
# vector_a_1 = np.array(embeddings.embed_query(pd.read_csv(files_names[4623]).loc[0, 'chapter']))
# vector_a_2 = np.array(embeddings.embed_query(pd.read_csv(files_names[4623]).loc[1, 'chapter']))
# vector_a_3 = np.array(embeddings.embed_query(pd.read_csv(files_names[4623]).loc[5, 'chapter']))

# vector_a_4 = np.array(embeddings.embed_query(pd.read_csv(files_names[4623]).loc[8, 'chapter']))

# vector_a_13 = np.array(embeddings.embed_query(pd.read_csv(files_names[4623]).loc[59, 'chapter']))
# vector_b = np.array(embeddings.embed_query(x))

In [None]:
# cosine_sim_sklearn_1 = cosine_similarity(vector_a_1.reshape(1, -1), vector_b.reshape(1, -1))[0][0]
# cosine_sim_sklearn_2 = cosine_similarity(vector_a_2.reshape(1, -1), vector_b.reshape(1, -1))[0][0]
# cosine_sim_sklearn_3 = cosine_similarity(vector_a_3.reshape(1, -1), vector_b.reshape(1, -1))[0][0]
# cosine_sim_sklearn_4 = cosine_similarity(vector_a_4.reshape(1, -1), vector_b.reshape(1, -1))[0][0]

# cosine_sim_sklearn_13 = cosine_similarity(vector_a_13.reshape(1, -1), vector_b.reshape(1, -1))[0][0]

# print(cosine_sim_sklearn_1)
# print(cosine_sim_sklearn_2)
# print(cosine_sim_sklearn_3)
# print(cosine_sim_sklearn_4)
# print(cosine_sim_sklearn_13)

In [68]:
test = []
with open('/root/autodl-tmp/test_questions.json', 'r', encoding='utf-8') as file:
    for line in file.readlines():
        dic = json.loads(line)
        test.append(dic)

In [None]:
# for i in test:
#     if '深圳高速公路' in i['question']:
#         print(i)

In [None]:
x = test[4732]['question'].replace('2019', '').replace('深圳高速公路集团股份有限公司', '')
x

In [None]:
# top_k = 3
# query = test[4732]['question']
# # context = vector_store.similarity_search(query)
# context = vector_store.similarity_search_with_score(query, k=top_k)

In [12]:
prompt_template = """根据上下文回答问题，
            上下文:{context}
            问题:{query}"""
prompt = PromptTemplate.from_template(prompt_template)
# prompt = prompt.format(context = context[0][0], query = query)

In [13]:
class GLM(LLM):
    max_token: int = 2500
    temperature: float = 0.8
    top_p = 0.9
    tokenizer: object = None
    model: object = None
    history_len: int = 1024
    
    def __init__(self):
        super().__init__()
        
    @property
    def _llm_type(self) -> str:
        return "GLM"
            
    def load_model(self, llm_device="gpu",model_name_or_path=None):
        model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,trust_remote_code=True)
        self.model = AutoModel.from_pretrained(model_name_or_path, config=model_config, trust_remote_code=True).half().cuda()

    def _call(self, prompt, history = [],stop = None):
        response, _ = self.model.chat(
                    self.tokenizer, prompt,
                    history=history[-self.history_len:] if self.history_len > 0 else [],
                    max_length=self.max_token, temperature=self.temperature,
                    top_p=self.top_p)
        return response
    
modelpath = "/root/chatglm2-6b/"
chatglm = GLM()
chatglm.load_model(model_name_or_path = modelpath)

You are using a model of type chatglm to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [14]:
def pipeline(query, df, model, text_splitter, company, year):   # for q1
    question = query['question']
    question = question.replace(company, '')
    question = question.replace(year+'年', '')
    if 'chapter' in df.columns and df['chapter'].notnull().all():
        keys = df['chapter'].unique()
        similarity = []
        vector_b = np.array(embeddings.embed_query(question))
        for i in keys:
            vector_a = np.array(embeddings.embed_query(i))
            s = cosine_similarity(vector_a.reshape(1, -1), vector_b.reshape(1, -1))[0][0]
            similarity.append(s)
        # max_value = max(similarity)
        # max_index = similarity.index(max_value)
        # key = keys[max_index]
        sorted_indices = sorted(range(len(similarity)), key=lambda i: similarity[i], reverse=True)
        top_indices = sorted_indices[:int(len(similarity) / 2)]
        content = ''
        for key in top_indices:
            content.join(df.loc[df.chapter == key, 'inside'])
        content = [content]
    else:
        content = [''.join(df.loc[:, 'inside'])]
    texts = text_splitter.create_documents([''.join(df.loc[:, 'inside'])])
    vector_store = FAISS.from_documents(texts, embeddings)
    context = vector_store.similarity_search_with_score(question, k=3)
    prompt_template = """根据上下文回答问题，
                上下文:{context}
                问题:{query}"""
    prompt = PromptTemplate.from_template(prompt_template)
    chain = LLMChain(llm=model, prompt=prompt)
    response_0 = chain({'context':context[0][0], 'query':question})
    # response_1 = chain({'context':context[0][1], 'query':question})
    # response_2 = chain({'context':context[0][2], 'query':question})
    query['answer'] = response_0['text']
    return query

In [15]:
def split_questions(questions):
    q1 = []
    q2 = []
    for q in questions:
        if '年' in q['question']:
            q1.append(q)
        else:
            q2.append(q)
    return q1, q2

q1, q2 = split_questions(test)
q2

[{'id': 3, 'question': '研发费用对公司的技术创新和竞争优势有何影响？'},
 {'id': 19, 'question': '合并财务报表的编制方法有哪些？'},
 {'id': 54, 'question': '什么是存货周转率？'},
 {'id': 106, 'question': '重要会计政策和会计估计的变更是什么？'},
 {'id': 114, 'question': '什么是营运资本？'},
 {'id': 125, 'question': '财务杠杆和经营杠杆有什么区别？'},
 {'id': 133, 'question': '合营安排的分类及共同经营的会计处理方法是什么？'},
 {'id': 141, 'question': '资本公积金和盈余公积金对公司的股东权益和财务灵活性有何影响？'},
 {'id': 180, 'question': '公允价值变动收益对公司财务状况的影响如何衡量？'},
 {'id': 181, 'question': '如何解释财务报表中的“递延所得税资产”和“递延所得税负债”？'},
 {'id': 196, 'question': '公允价值变动收益如何反映公司资产价值的变化和风险暴露？'},
 {'id': 213, 'question': '营业外收支净额对公司的综合收益和盈利能力有何影响？'},
 {'id': 273, 'question': '应收账款的逾期和坏账如何影响公司的财务状况？'},
 {'id': 276, 'question': '预付款项的管理对公司的资金利用效率和供应商关系有何影响？'},
 {'id': 285, 'question': '固定资产的定义是什么？'},
 {'id': 298, 'question': '什么是流动负债？'},
 {'id': 336, 'question': '什么是成本费用法？它在会计中的应用是什么？'},
 {'id': 343, 'question': '什么是财务风险？'},
 {'id': 352, 'question': '什么是资产收益率？'},
 {'id': 354, 'question': '长期借款和短期借款的利息支付如何影响公司的财务状况和债务风险？'},
 {'id': 369, 'questio

In [16]:
def process_q2(questions, model):
    answers = []
    for q in questions:
        response = model(q['question'])
        q['answer'] = response
        answers.append(q)
    return answers

In [45]:
# outliers = []
# 问题很大
def process_q1(q1, files_names):
    output = []
    for f in files_names:
        company = f.split('_')[2]
        code = f.split('_')[4]
        shortname = f.split('_')[6]
        year = f.split('_')[8][:-1]

        for q in q1:
            if year in q['question']:
                try:
                    if shortname in q['question']:
                        df = pd.read_csv(f, header=0)
                        print(df)
                        print(f)
                        response = pipeline(q, df, chatglm, text_splitter, shortname, year)
                        output.append(response)
                    elif company in q['question']:
                        df = pd.read_csv(f, header=0)
                        print(df)
                        response = pipeline(q, df, chatglm, text_splitter, company, year)
                        output.append(response)
                    else:
                        pass     # 有问题
                except Exception as e:
                    print(e)
                    # outliers.append(q)
            else:
                pass     # 有问题
    return output

In [23]:
answer1 = process_q1(q1, files_names)

In [None]:
answer2 = process_q2(q2, chatglm)

In [36]:
outliers

[{'id': 1437, 'question': '湖北共同药业股份有限公司2021年的营业收入增长率是多少？保留2位小数。'},
 {'id': 3542, 'question': '2021年华辰装备资产总计是多少元?'},
 {'id': 4371, 'question': '瑞达期货在2019年的投资收益占营业收入比率保留到小数点后两位是多少？'},
 {'id': 4691, 'question': '瑞达期货股份有限公司在2019年的利息收入是多少元？'},
 {'id': 1573, 'question': '请问，2021年蜀道装备的总资产增长率是多少?请保留2位小数。'},
 {'id': 1710, 'question': '2021年蜀道装备营业利润是多少元?'}]

In [48]:
outliers_answers = process_q2(outliers, chatglm)

In [49]:
outliers_answers

[{'id': 1437,
  'question': '湖北共同药业股份有限公司2021年的营业收入增长率是多少？保留2位小数。',
  'answer': '根据题目所给信息，我们需要计算湖北共同药业股份有限公司2021年的营业收入增长率。根据公式，增长率的计算公式为（增长额-增长前数）/增长前数×100%。\n\n题目中没有给出增长额和增长前的具体数值，但是题目给出了2020年和2021年的营业收入。因此，我们可以计算出增长额为2021年的营业收入减去2020年的营业收入，即：\n\n增长额 = 2021年营业收入 - 2020年营业收入\n\n接下来，我们需要计算增长前数，即2020年的营业收入。题目中没有给出具体的数值，因此我们可以将其视为一个未知数x。\n\n将上述信息代入公式中，可以得到：\n\n增长率 = （2021年营业收入 - x）/ x × 100%\n\n将题目中给出的增长率代入，可以得到：\n\n增长率 = （25.38 - x）/ x × 100%\n\n将上式中的x代入，可以得到：\n\n增长率 = （25.38 - 22.50）/ 22.50 × 100%\n\n计算可得，增长率为25.38%。\n\n最后，根据题目要求，需要保留两位小数。因此，最终答案为25.4%。'},
 {'id': 3542,
  'question': '2021年华辰装备资产总计是多少元?',
  'answer': '我无法提供2021年华辰装备资产总计的具体数值，因为我的训练时间截止到2021年，而且这类数据通常需要经过审计和统计才能得出。建议您查询华辰装备的官方网站或联系相关客服获取确切信息。'},
 {'id': 4371,
  'question': '瑞达期货在2019年的投资收益占营业收入比率保留到小数点后两位是多少？',
  'answer': '抱歉，瑞达期货在2019年的投资收益占营业收入比率数据不详。请提供更多信息或者查询其他相关信息，以便我为您提供帮助。'},
 {'id': 4691,
  'question': '瑞达期货股份有限公司在2019年的利息收入是多少元？',
  'answer': '根据《公司债券发行与交易管理办法》第四十七条，公司债券利息收入的计算公式为：利息收入=利息收入总额-当期应付债券利息-累计应付债券利息

In [47]:
process_q1([outliers[1]], files_names)

Empty DataFrame
Columns: [type, inside, company, code, shortname, year]
Index: []
/root/autodl-tmp/CSV/2022-04-22__华辰精密装备昆山股份有限公司__300809__华辰装备__2021年__年度报告.csv
list index out of range


[]

In [96]:
answer = answer1 + answer2 + outliers_answers
sorted_answer = sorted(answer, key=lambda x: x['id'])

In [97]:
unique_elements = {}
for item in sorted_answer:
    id_value = item["id"]
    if id_value not in unique_elements:
        unique_elements[id_value] = item
filtered_answer = list(unique_elements.values())

In [90]:
l1 = []
l2 = []
l3 = []

for i in range(len(answer1)):
    l1.append(answer1[i]['id'])
for i in range(len(answer2)):
    l2.append(answer2[i]['id'])
for i in range(len(outliers_answers)):
    l3.append(outliers_answers[i]['id'])

In [98]:
l_ = []
for i in range(len(filtered_answer)):
    l_.append(filtered_answer[i]['id'])

In [101]:
unique = list(set([i for i in range(len(test))]) - set(l_))
outliers_answers_ = process_q2([test[i] for i in unique], chatglm)

In [103]:
final_answer = filtered_answer + outliers_answers_
final_answer = sorted(final_answer, key=lambda x: x['id'])
len(final_answer)

5000

In [105]:
output_file_path = 'csl_result.json'
with open(output_file_path, 'w', encoding='utf-8') as json_file:
    # json.dump(sorted_answer, json_file, ensure_ascii=False)
    for i in final_answer:
        x = json.dumps(i, ensure_ascii=False)
        json_file.write(x + '\n')

In [8]:
question_2019 = []
question_2020 = []
question_2021 = []
for p in test:
    question = p["question"]
    if "2019" in question:
        question_2019.append(question)
    if "2020" in question:
        question_2020.append(question)
    if "2021" in question:
        question_2021.append(question)