In [2]:
import pandas as pd
import numpy as np
from InstructorEmbedding import INSTRUCTOR
from sklearn.metrics.pairwise import cosine_similarity
import replicate
import json
import torch
import torch.nn as nn

  from tqdm.autonotebook import trange


In [2]:
import torch

torch.cuda.empty_cache()

In [3]:
# 读取本地模型文件
model = INSTRUCTOR("F:\\工作以及比赛\\大一统框架\\dd\\try2\\t1\\model")

load INSTRUCTOR_Transformer
max_seq_length  512


In [4]:
v = model.encode("dafdasfdasfdsa")

In [7]:
v.shape

(768,)

# 根据问题进行分类

In [4]:
# 加载网络结构参数
with open("../1.准确定向问题/model_struct.json","r") as f :
    content = f.read()
model_struct = json.loads(content)

# 定义网络结构
class network(nn.Module) :
    def __init__(self,input_size,hidden_size,drop_prod,num_layers,activate_func) :
        super().__init__()
        # 激活函数可供选择
        if activate_func == 'relu' :
            acf = nn.ReLU()
        elif activate_func == 'sigmoid' :
            acf = nn.Sigmoid()
        elif activate_func == 'tanh' :
            acf = nn.Tanh()
        
        layer_list = [
            nn.Linear(in_features = input_size,out_features = hidden_size,bias = True),
            acf,
            nn.Dropout(p=drop_prod),
        ]

        for i in range(num_layers-1) :
            layer_list.append(nn.Linear(in_features = hidden_size,out_features = hidden_size,bias = True))
            layer_list.append(acf)
            layer_list.append(nn.Dropout(p=drop_prod))
        
        layer_list.append(nn.Linear(in_features = hidden_size,out_features = 4 ,bias = True))
        self.model = nn.Sequential(*layer_list)

    def forward(self,x) :
        return self.model(x)

# 定义网络结构参数
hidden_size = model_struct["hidden_size"]
drop_prod = model_struct["drop_prod"]
num_layers = model_struct["num_layers"]
activate_func = model_struct["activate_func"]

# 定义模型
model_custom = network(input_size=768,hidden_size=int(hidden_size),drop_prod=drop_prod,num_layers=int(num_layers),activate_func=["relu","sigmoid","tanh"][int(activate_func)])

# 加载模型参数
model_custom.load_state_dict(torch.load("../1.准确定向问题/model_params.pth"))
model_custom.eval()

network(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=244, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.32255092815907316, inplace=False)
    (3): Linear(in_features=244, out_features=4, bias=True)
  )
)

In [5]:
# 给定一个问题
question = "Why are nitrogen source materials like ammonia and urea added to the process?"

# 对问题进行编码
question_vec = model.encode(question)

# 求解问题分类
class_type = model_custom(torch.Tensor(question_vec)).argmax() + 1
class_type

tensor(1)

In [25]:
a1 = [None,1]
a1.pop(0)
a1

[1]

# 定制私有类的大模型

In [6]:
with open(f"./classcontent/t{class_type}.txt","r",encoding="utf8") as f :
    add_prompt = f.read()

In [7]:
template="""
You are an expert in the field of anaerobic fermentation. Based on a thorough understanding of the literature, you also have a basic knowledge of the following
{base_know}
Please answer in the following format.
first, Briefly summarize the answers.
second, Answer the question in more detail and logically, explaining every detail of the question in more detail. The format is
    (1) Sub-explanation
    (2) Sub-explanation
etc...
Sub-explanation should not be limited to two, usually three or more;
finally, Summarize all the above conclusions and further explain the problem.

{context}

{chat_history}
The question you are about to answer is:
{question}
answer:

"""

In [None]:
class_type = 4
# 读取私有类知识库
pra_store = pd.read_json(f"../2.准确定向文献/pdf_store/pdf_store{class_type}.json").reset_index(drop=True).iloc[:,0].tolist()

# 获得私有类知识库的encode
pra_page_text = [i["pageText"] for i in pra_store]
pra_pdf_vec = model.encode(pra_page_text)

# 保存为csv暂存
pd.DataFrame(pra_pdf_vec).to_csv(f"./pra_pdf/pra_pdf_vec{class_type}.csv")

# 读取csv
np.array(pd.read_csv(f"./pra_pdf/pra_pdf_vec{class_type}.csv"))[:,1::]

In [17]:
# 进行相似度分析
result_q2p = cosine_similarity(pra_pdf_vec,question_vec.reshape(1,-1))

# 给出期望读取文献的索引
max_len = 10
read_content_index = result_q2p.argsort(axis=0)[0:max_len]

# 给出期望读取的文献详细信息以及内容
read_content_metadata = np.array(pra_store)[read_content_index.transpose()]
expect_metadata = []
expect_content = ""

for i in range(max_len) :
    all =  read_content_metadata[0,i]
    expect_metadata.append(all["metadata"])
    expect_content += all["pageText"]

# 数据持久化(提供给后端)
# pd.DataFrame(expect_metadata).transpose().to_json("./toback.json")

# 提供给前端
json.dumps(pd.DataFrame(expect_metadata).transpose().to_dict())

'{"0": {"format": "PDF 1.3", "title": "es501098g 1..8", "author": "", "subject": "", "keywords": "", "creator": "Arbortext Advanced Print Publisher 10.0.1465/W Unicode", "producer": "Acrobat Distiller 8.1.0 (Windows)", "creationDate": "D:20140512160419-04\'00\'", "modDate": "D:20140512160419-04\'00\'", "trapped": "", "encryption": null}, "1": {"format": "PDF 1.4", "title": "", "author": "", "subject": "", "keywords": "", "creator": "3B2 Total Publishing System 8.07e/W Unicode ", "producer": "Acrobat Distiller 9.5.0 (Windows)", "creationDate": "D:20140111074249+08\'00\'", "modDate": "D:20140111110727+08\'00\'", "trapped": "", "encryption": null}, "2": {"format": "PDF 1.7", "title": "Comparative survival of viruses during thermophilic and mesophilic anaerobic digestion", "author": "", "subject": "", "keywords": "", "creator": "Elsevier", "producer": "Acrobat Distiller 10.0.0 (Windows)", "creationDate": "D:20171106135621+08\'00\'", "modDate": "D:20171108124933Z00\'00\'", "trapped": "", "e

'{"0": {"format": "PDF 1.3", "title": "es501098g 1..8", "author": "", "subject": "", "keywords": "", "creator": "Arbortext Advanced Print Publisher 10.0.1465/W Unicode", "producer": "Acrobat Distiller 8.1.0 (Windows)", "creationDate": "D:20140512160419-04\'00\'", "modDate": "D:20140512160419-04\'00\'", "trapped": "", "encryption": null}, "1": {"format": "PDF 1.4", "title": "", "author": "", "subject": "", "keywords": "", "creator": "3B2 Total Publishing System 8.07e/W Unicode ", "producer": "Acrobat Distiller 9.5.0 (Windows)", "creationDate": "D:20140111074249+08\'00\'", "modDate": "D:20140111110727+08\'00\'", "trapped": "", "encryption": null}, "2": {"format": "PDF 1.7", "title": "Comparative survival of viruses during thermophilic and mesophilic anaerobic digestion", "author": "", "subject": "", "keywords": "", "creator": "Elsevier", "producer": "Acrobat Distiller 10.0.0 (Windows)", "creationDate": "D:20171106135621+08\'00\'", "modDate": "D:20171108124933Z00\'00\'", "trapped": "", "e

In [10]:
# 定义prompt
prompt = template.format(question = question , context = expect_content , chat_history = [] , base_know = add_prompt)

# 定义推理参数
input_kwarg = {
    "prompt": prompt,
    "top_p":1,
    # "max_length":8000,
    "temperature":0.75,
    "repetition_penalty":1,
    "max_new_tokens":2000,
}

In [11]:
# 定义推理api
o1 = replicate.run(
        # "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
        "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d",
        # "meta/llama-2-7b:527827021d8756c7ab79fde0abbfaac885c37a3ed5fe23c7465093f0878d55ef",
        input=input_kwarg
    )

ProxyError: HTTPSConnectionPool(host='api.replicate.com', port=443): Max retries exceeded with url: /v1/models/meta/llama-2-13b-chat/versions/f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d (Caused by ProxyError('Unable to connect to proxy', SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1122)'))))