In [28]:

import os
import json
from openai import OpenAI

openai = OpenAI()
def generate_mc_question_from_embedding(input_text):
    # 构造prompt文本，包括system message和实际的问题内容

    output_format = {"questions": [{"question": "[the question description]", "options": [{"A": ""}, {"B": ""}, {"C": ""}, {"D": ""}], "answer": "[the answer]", "type": "multiple_choice_question"}], "response_type": "multiple_choice_question"}

    # 将对象转换为JSON字符串，并将双引号转义
    output_format_str = json.dumps(output_format).replace('"', '\\"')

    prompt_text = f'''
    System Message: 
    Feel free to use any tools available to look up.
    The generated question must ask the key point of the each retrieved data.
    Try your best to generate only 1 question!
    
    根据以下内容出题：
    {input_text}
    请确保您的题目紧密相关于文件内容，并能够反映出文件的主要观点或知识点。题目应包含四个选项（A、B、C、D），其中只能有一个选项是正确的單選題，並且應該隨機分佈，不能是以上全對或以上全錯"。\n\nCould you please generate some summary questions related to the retrieved documents to help the readers understand the content easily? The output result is a JSON object string and the format must be like this: {output_format_str}
    
    '''

    response = openai.chat.completions.create(
        model= os.getenv("OPENAI_MODEL_NAME"),  # 或者其他可用的模型
        messages= [{ "role": "user", "content": prompt_text }],
        stream= False
    )

    # 返回生成的文本
    return response

# 示例输入文本
input_text = "恒生指數今日升定跌。"

# 生成问题
questions = generate_mc_question_from_embedding(input_text)
print(questions)

ChatCompletion(id='chatcmpl-8zJTSejjn8EoevJRIDEdZw6u8F2rp', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{"questions": [{"question": "What is the direction of the Hang Seng Index today?", "options": [{"A": "Increase"}, {"B": "Decrease"}, {"C": "Remain unchanged"}, {"D": "Fluctuate"}], "answer": "Decrease", "type": "multiple_choice_question"}], "response_type": "multiple_choice_question"}', role='assistant', function_call=None, tool_calls=None))], created=1709623558, model='gpt-3.5-turbo-16k-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=77, prompt_tokens=293, total_tokens=370))


In [29]:
questions.choices[0].message.content

'{"questions": [{"question": "What is the direction of the Hang Seng Index today?", "options": [{"A": "Increase"}, {"B": "Decrease"}, {"C": "Remain unchanged"}, {"D": "Fluctuate"}], "answer": "Decrease", "type": "multiple_choice_question"}], "response_type": "multiple_choice_question"}'

In [2]:
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.expression import cast
from sqlalchemy.dialects.postgresql import ARRAY, JSONB, UUID, FLOAT
from sqlalchemy import create_engine, Column, ForeignKey, String, JSON, select
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import func
from pgvector.sqlalchemy import Vector


# 定义基类
Base = declarative_base()

class LangchainPgEmbedding(Base):
    __tablename__ = 'langchain_pg_embedding'

    uuid = Column(UUID(as_uuid=True), primary_key=True)
    collection_id = Column(UUID(as_uuid=True), ForeignKey('langchain_pg_collection.uuid'), nullable=False)
    # 假设 embedding 字段是一个浮点数数组。这里使用 ARRAY(Float) 需要根据实际数据库支持调整
    embedding = Column(Vector(1536))
    document = Column(String)
    cmetadata = Column(JSONB)
    custom_id = Column(String)

# 创建数据库引擎
engine = create_engine('postgresql+psycopg2://docai:docai123123@db-main.postgres.database.azure.com/docai_document_embedding_dev', echo=True)

# 创建Session类
Session = sessionmaker(bind=engine)

# 创建Session实例
session = Session()

# 定义你想要过滤的document_id数组
target_document_ids = ["c6fae7b1-ed8b-49eb-806e-875f9cd84d9d", "54b00b41-cd41-4fa2-be36-800ca0d7718c", "fa671e9c-76ef-4b68-a034-086099827735"]  # 替换为你的目标UUIDs

filtered_query = session.query(LangchainPgEmbedding).filter(
    # 使用func.jsonb_array_elements_text将JSONB数组转换为文本数组，然后与目标ID进行比较
    LangchainPgEmbedding.cmetadata['document_id'].astext.in_(target_document_ids)
).order_by(func.random()).limit(10)

# 获取SQL查询字符串
# query_str = str(filtered_query.statement.compile(compile_kwargs={"literal_binds": True}))

# print(query_str)

# 执行查询
random_documents = filtered_query.all()
    


  Base = declarative_base()


2024-03-05 14:31:48,121 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2024-03-05 14:31:48,121 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-03-05 14:31:48,365 INFO sqlalchemy.engine.Engine select current_schema()
2024-03-05 14:31:48,366 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-03-05 14:31:48,554 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2024-03-05 14:31:48,555 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-03-05 14:31:48,802 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-03-05 14:31:48,805 INFO sqlalchemy.engine.Engine SELECT langchain_pg_embedding.uuid AS langchain_pg_embedding_uuid, langchain_pg_embedding.collection_id AS langchain_pg_embedding_collection_id, langchain_pg_embedding.embedding AS langchain_pg_embedding_embedding, langchain_pg_embedding.document AS langchain_pg_embedding_document, langchain_pg_embedding.cmetadata AS langchain_pg_embedding_cmetadata, langchain_pg_embedding.custom_id AS langchain_pg_embedding_custom_id 
FROM

In [3]:

from langchain_community.chat_models import ChatOpenAI
import os
llm = ChatOpenAI(model_name=os.getenv("OPENAI_MODEL_NAME"), temperature=0.2)

doc = random_documents[0]

# # 处理你的随机选中的文档
# for doc in random_documents:
#     print(doc.uuid, doc.document)
#     generate_questions_with_agent(llm, [])

  warn_deprecated(


In [11]:
from langchain.pydantic_v1 import BaseModel, Field
from langchain.tools import BaseTool, StructuredTool, tool

tools = [BaseTool()]


generate_questions_with_agent(llm, tools, doc.document)

TypeError: Can't instantiate abstract class BaseTool with abstract methods _run