In [1]:
from openai import OpenAI
import os
from dotenv import dotenv_values
from ReadLoad import read_jsonl, write_jsonl, read_json, write_json
from tqdm import tqdm
from evaluation import get_embedding, cosine_similarity
from prompt_template import get_sys_prompt, get_qa_prompt, get_refine_prompt, get_re_query_prompt
from eval_data import key_word_score
from qwen import qwen_response
import pandas as pd
import time

# Service credentials live in the local `.env` file (entries `qwen_key` and
# `qwen_url`) so that nothing secret is hard-coded in the notebook.
config = dotenv_values('.env')

# OpenAI client pointed at the endpoint configured under `qwen_url`.
client = OpenAI(api_key=config['qwen_key'], base_url=config['qwen_url'])


def get_response(prompt, sys_prompt=get_sys_prompt(), model="qwen2-7b-instruct"):
    """Send one system + user message pair to the chat model and return its reply.

    Args:
        prompt: Content of the user message.
        sys_prompt: Content of the system message. The default is computed
            once, when this function is defined, by calling ``get_sys_prompt()``.
        model: Name of the model to query. Defaults to the model that was
            previously hard-coded here, so existing callers are unaffected.

    Returns:
        The message content of the first choice in the completion.
        If the request raises, the error is printed and the string
        ``"An unexpected error occurred: <error>"`` is returned instead of
        raising. Callers that must tell failures apart from real answers
        have to check for that prefix.
    """
    # Short pause before every request -- presumably to stay under the
    # service's rate limit (TODO confirm).
    time.sleep(0.1)
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {'role': 'system', 'content': sys_prompt},
                {'role': 'user', 'content': prompt},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Best-effort by design: one failed request must not abort a long
        # batch loop, so the error text is reported and returned in place of
        # the answer.
        error_message = f"An unexpected error occurred: {e}"
        print(error_message)
        return error_message


2024-07-24 22:59:40.162223: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-24 22:59:40.163242: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-24 22:59:40.166132: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-24 22:59:40.174803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-24 22:59:40.189350: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
def re_query(data):
    """Attach a model-generated restatement of each record's question.

    For every record a prompt is built with ``get_re_query_prompt`` from its
    '产品名', '条款' and '问题' fields and sent to ``qwen_response``; the reply
    is stored on the record under '重述问题'.

    Args:
        data: Iterable of dict records carrying the three fields above.

    Returns:
        The same ``data`` object; its records are modified in place.
    """
    for record in tqdm(data):
        question, clause_text, product_name = (
            record['问题'],
            record['条款'],
            record['产品名'],
        )
        rewrite_prompt = get_re_query_prompt(product_name, clause_text, question)
        record['重述问题'] = qwen_response(rewrite_prompt)
    return data

In [3]:
def completion(data):
    """Generate an answer for every record and store the intermediate prompts.

    For each record the following keys are written in place:

    * 'prompt'          -- QA prompt built by ``get_qa_prompt`` from the
                           record's '产品名', '条款' and '问题' fields.
    * 'original_answer' -- reply returned by ``get_response`` for that prompt.
    * 'refine_prompt'   -- prompt built by ``get_refine_prompt`` from the
                           question, the original answer, the product name
                           and the clause.
    * 'answer'          -- final answer; currently identical to
                           'original_answer' because the refinement request
                           is disabled.

    Args:
        data: Iterable of dict records carrying '问题', '条款' and '产品名'.

    Returns:
        The same ``data`` object; its records are modified in place.
    """
    for d in tqdm(data):
        query = d['问题']
        clause = d['条款']
        name = d['产品名']
        prompt = get_qa_prompt(name, clause, query)
        d['prompt'] = prompt
        # Fix: a stray trailing comma used to wrap the reply in a 1-tuple,
        # which then leaked into the refine prompt and the final answer.
        original_answer = get_response(prompt)
        d['original_answer'] = original_answer
        d['refine_prompt'] = get_refine_prompt(query, original_answer, name, clause)
        # Refinement step (qwen_response(d['refine_prompt'])) is switched off,
        # so the first-pass answer is used as the final one.
        d['answer'] = original_answer
    return data

In [4]:
def evaluation(data, name="测试数据"):
    """Score generated answers against the references and export the results.

    Every record receives two new keys:

    * 'similarity'          -- ``cosine_similarity`` between the vector stored
                               in the global ``dev_data_embd`` under the
                               record's 'ID' and ``get_embedding`` of its
                               'answer'.
    * 'key_word_similarity' -- ``key_word_score(d['答案'], d['answer'])``.

    The averages of both scores are printed and all records are written to
    ``"{name}_score_{score}.xlsx"``.

    Note: this function reads the module-level ``dev_data_embd`` mapping,
    which must be defined before calling it.

    Args:
        data: Sized collection of dict records carrying 'ID', '答案' and
            'answer'.
        name: Prefix of the exported Excel file name.

    Returns:
        The same ``data`` object; its records are modified in place. For empty
        input nothing is scored or written and ``data`` is returned unchanged.
    """
    # Guard: the averages below divide by len(data), so an empty data set
    # used to crash with ZeroDivisionError.
    if len(data) == 0:
        print('测评数据集为空，跳过评测')
        return data

    similarities = []
    keyword_scores = []
    for d in tqdm(data):
        vec1 = dev_data_embd[d['ID']]
        vec2 = get_embedding(d['answer'])
        similarity = cosine_similarity(vec1, vec2)
        kw_score = key_word_score(d['答案'], d['answer'])
        d['similarity'] = similarity
        d['key_word_similarity'] = kw_score
        similarities.append(similarity)
        keyword_scores.append(kw_score)

    length = len(data)
    score = sum(similarities) / length
    # Separate name for the average so the per-record list is not overwritten.
    keyword_score = sum(keyword_scores) / length
    print(f'测评数据集：{length} \n相识度得分：{score} \n关键词得分：{keyword_score}')
    df = pd.DataFrame(data)
    df.to_excel(f"{name}_score_{score}.xlsx", index=False)
    return data

In [5]:
def to_summit_json(data, commit="大道至简_result"):
    """Reduce records to the submission fields and hand them to ``write_jsonl``.

    Each output row keeps only three keys: "ID" (from 'ID'), "question"
    (from '问题') and "answer" (from 'answer').

    Args:
        data: Iterable of dict records carrying 'ID', '问题' and 'answer'.
        commit: Passed through as the second argument of ``write_jsonl`` --
            presumably the output file name (TODO confirm).

    Returns:
        None.
    """
    submission_rows = [
        {"ID": row['ID'], "question": row['问题'], "answer": row['answer']}
        for row in data
    ]
    write_jsonl(submission_rows, commit)

### Loading data

In [6]:
# Dev-set evaluation pipeline, currently disabled (every line is commented out).
# NOTE(review): `evaluation` reads the global `dev_data_embd`; in the visible
# notebook it is only assigned by the lines below, so they must be re-enabled
# before `evaluation` can run.
# NOTE(review): `re_query(data)` refers to `data`, which these lines never
# define -- presumably `dev_data` was intended; confirm before re-enabling.
# dev_data = read_jsonl("dataset/resultdev_with_embedding.jsonl")
# dev_data_embd = {}
# for d in dev_data:
#     dev_data_embd[d['ID']] = d['ans_embedding']
#     d.pop('ans_embedding')
# requery_data = re_query(data)
# comp_data = completion(dev_data[200:300])
# eval_data = evaluation(comp_data)

In [None]:
#test_data = read_json("dataset/test.json")
# NOTE(review): mid-notebook import; within the visible notebook
# `retrieve_clause` is only used by this cell.
from Rag import retrieve_clause
# Only the first 5 records are kept -- presumably a debugging limit; remove the
# slice for a full run (TODO confirm).
test_data = read_jsonl("dataset/test-B-0722.json")[:5]
# Set each record's '条款' to the passages retrieved for its question,
# joined by blank lines.
for data in test_data:
    query = data['问题']
    data['条款'] = '\n\n'.join(retrieve_clause(query))
# Answer generation and submission export are currently disabled:
# test_data_result = completion(test_data)
# to_summit_json(test_data_result)


100%|██████████| 1000/1000 [00:00<00:00, 17597.99it/s]


In [None]:
# Display the in-memory test records (including any '条款' values set on them).
test_data

In [None]:
# Print the '条款' field of the first record as stored in the file; the file is
# re-read here, so in-memory changes made to `test_data` do not show up.
print(read_jsonl("dataset/test-B-0722.json")[0]['条款'])

In [None]:
from ReadLoad import read_jsonl, write_jsonl, read_json, write_json
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core import Document
from tqdm import tqdm

# Load the whole test file (no slicing) for building the documents.
test_dataB = read_jsonl("dataset/test-B-0722.json")

In [None]:
# Wrap every test record in a llama_index `Document`: the '条款' text becomes
# the document body, and the identifying fields are carried along as metadata.
documents = [
    Document(
        text=record['条款'],
        metadata={field: record[field] for field in ('产品名', 'ID', '问题')},
    )
    for record in test_dataB
]
# for data in tqdm(documents):
#     data.text = data.text.replace('\n','. ')

# documents[:5]

### 参考资料

https://dashscope.console.aliyun.com/billing