In [108]:
import weaviate

from weaviate.classes.init import AdditionalConfig, Timeout
headers = {
    "X-Zhipuai-Api-Key": '4d6c5f8ad43b8fff94480b95a5a6c5d0.ouFQYxQ8pIVKO7J8'
}
client = weaviate.connect_to_local(
    headers=headers,
    additional_config=AdditionalConfig(
        timeout=Timeout(init=30, query=12000, insert=120)  # Values in seconds
    )
)

print(client.is_ready())


True


In [109]:
from weaviate.classes.config import Configure, Property, DataType, VectorDistances
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# Collection name with timestamp
collection_name = "AtomgitPapers"
 
collection_name_context = f"{collection_name}Context"
client.collections.delete(collection_name) 
client.collections.delete(collection_name_context) 


In [110]:


client.collections.create(
    collection_name,
    reranker_config=Configure.Reranker.transformers(),
    generative_config=Configure.Generative.zhipuai(
        # These parameters are optional
        model="glm-4-plus",   
        max_tokens=500, 
        temperature=0.7,
        top_p=0.7
    ),
    vectorizer_config=[
        
        # Set another named vector
        Configure.NamedVectors.text2vec_transformers(   
            name="chunk_text", source_properties=["chunk_text"],  
            vector_index_config=Configure.VectorIndex.hnsw(
                distance_metric=VectorDistances.COSINE
            ) 
        ), 
    ],
 
    properties=[  # Define properties
        Property(name="ref_id", data_type=DataType.TEXT),
        Property(name="paper_id", data_type=DataType.TEXT),
        Property(name="paper_title", data_type=DataType.TEXT),
        Property(name="chunk_id", data_type=DataType.NUMBER),
        Property(name="chunk_text", data_type=DataType.TEXT),
        Property(name="original_filename", data_type=DataType.TEXT)
        
    ],
)


client.collections.create(
    collection_name_context,
    
    vectorizer_config=Configure.Vectorizer.text2vec_contextionary( vectorize_collection_name=False),
 
    properties=[  # Define properties
        Property(name="refId", data_type=DataType.TEXT),
        Property(name="paperId", data_type=DataType.TEXT),
        Property(name="chunkId", data_type=DataType.NUMBER),
        Property(name="chunkText", data_type=DataType.TEXT),
        
    ],
)

<weaviate.collections.collection.sync.Collection at 0x7f03a456a9b0>

In [111]:
from weaviate.classes.query import Filter

from typing import List

def prepare_properties(paper_details: List[dict]):
    """
    将 paper_details 数据转化为向量数据库插入所需的 properties 格式
    """
    properties_list = []
    
    for paper in paper_details:
        properties = {
            "ref_id": paper.get("paper_id"),  # 可能是另一个 ID，具体根据实际情况
            "paper_id": paper.get("paper_id"),
            "paper_title": paper.get("paper_title"),
            "chunk_id": paper.get("chunk_id"),
            "chunk_text": paper.get("chunk_text"),
            "original_filename": paper.get("original_filename", "")  # 默认空字符串，如果没有提供
        }
        properties_list.append(properties)
    
    return properties_list

def insert_into_database(collection_name:str, union_id:str, properties_list: List[dict]):
    """
    插入数据到向量数据库,检查唯一
    """
    
    collection = client.collections.get(collection_name)
    union_ids = [item.get(union_id) for item in properties_list]
    response = collection.query.fetch_objects(
        filters=Filter.by_property(union_id).contains_any(union_ids),
    )
        
    exist_ids = [o.properties[union_id] for o in response.objects]
   
    for properties in properties_list:
        if properties[union_id] not in exist_ids:
            collection.data.insert(properties=properties)
    

In [112]:
import requests

# 定义接口 URL
SEARCH_PAPERS_URL = "http://180.184.65.98:38880/atomgit/search_papers"
QUERY_BY_PAPER_ID_URL = "http://180.184.65.98:38880/atomgit/query_by_paper_id"

def search_papers(query, top_k=5):
    """
    根据查询文本 search_papers 接口进行模糊查询，返回论文 ID 列表
    """
    params = {'query': query, 'top_k': top_k}
    response = requests.get(SEARCH_PAPERS_URL, params=params)
    
    if response.status_code == 200:
        # 返回的 JSON 数据，假设是一个包含论文信息的数组
        return response.json()  # 返回论文的列表
    else:
        print(f"Error: {response.status_code}")
        return None

def query_by_paper_id(paper_id, top_k=5):
    """
    根据 paper_id 调用 query_by_paper_id 接口，获取该论文的详细信息
    """
    params = {'paper_id': paper_id, 'top_k': top_k}
    response = requests.get(QUERY_BY_PAPER_ID_URL, params=params)
    
    if response.status_code == 200:
        # 返回的 JSON 数据，假设是论文的详细信息
        return response.json()  # 返回论文的详细信息
    else:
        print(f"Error: {response.status_code}")
        return None
 

In [113]:
import hashlib
import os
import json
import tempfile
import requests

CACHE_DIR = os.path.join(tempfile.gettempdir(), "query_cache")  # 自定义缓存目录

def ensure_cache_dir():
    """确保缓存目录存在"""
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)

def get_query_hash(query):
    """根据 query 生成唯一的哈希值"""
    return hashlib.md5(query.encode('utf-8')).hexdigest()

def get_cache_file_path(query):
    """获取 query 对应的缓存文件路径"""
    query_hash = get_query_hash(query)
    return os.path.join(CACHE_DIR, f"{query_hash}.json")

def check_cache(query):
    """检查是否存在 query 的缓存文件"""
    file_path = get_cache_file_path(query)
    if os.path.exists(file_path):
        print(f"Cache hit for query: {query}")
        with open(file_path, 'r', encoding='utf-8') as cache_file:
            return json.load(cache_file)  # 返回缓存内容
    return None

 
def save_to_cache(query, data):
    """将数据保存到 query 的缓存文件中"""
    ensure_cache_dir()
    file_path = get_cache_file_path(query)
    with open(file_path, 'w', encoding='utf-8') as cache_file:
        json.dump(data, cache_file, ensure_ascii=False, indent=4)
        print(f"Data cached for query: {query} at {file_path}")

In [114]:


def exe_query(query, top_k):
    cached_data = check_cache(query)
    if cached_data:
        print(f"Using cached data for query: {query}")
        return cached_data  # 如果有缓存则直接返回
    # 第一步：模糊查询论文
    print(f"Searching papers for query: {query}")
    papers = search_papers(query, top_k)
    
    if papers:
        print(f"Found {len(papers)} papers.")
        
        # 获取所有的 paper_id，并去重
        paper_ids = set()  # 使用 set 去重
        for paper in papers:
            paper_id = paper.get("entity", {}).get("paper_id")
            if paper_id:
                paper_ids.add(paper_id)  # 添加到 set 中，自动去重
        
        # 输出去重后的 paper_id 数量
        print(f"Found {len(paper_ids)} unique paper IDs.")
    
         # 查询每个 unique paper_id 的详细信息
        all_paper_details = [] 
        for paper_id in paper_ids:
            print(f"Fetching details for paper ID: {paper_id}")
            paper_details = query_by_paper_id(paper_id, top_k)
            if paper_details:
                all_paper_details.extend(paper_details)  # 假设返回的是一个列表
            else:
                print(f"Failed to fetch details for paper ID: {paper_id}")
        
        # 将获取的 paper_details 转换为向量数据库可插入的格式
        properties_list = prepare_properties(all_paper_details)
        # 保存 properties_list 到临时文件
        save_to_cache(query, properties_list)
        return properties_list
    else:
        print("No papers found.")
     

In [115]:

query = "Text2SQL研究现状如何?"  # 替换为你的查询文本
top_k =100  # 可选参数，默认查询返回前 5 个结果
properties_list = exe_query(query,top_k)


Cache hit for query: Text2SQL研究现状如何?
Using cached data for query: Text2SQL研究现状如何?


In [116]:
# 插入数据到数据库 
insert_into_database(collection_name, 'ref_id', properties_list)


context_properties_list = [{
    "refId": item.get('ref_id'),
    "paperId": item.get('paper_id'),
    "chunkId": item.get('chunk_id'),
    "chunkText": item.get('chunk_text')
} for item in properties_list]

insert_into_database(collection_name_context, 'refId', context_properties_list)


In [117]:

query = "Text2SQL面临哪些挑战?"  # 替换为你的查询文本
top_k =100  # 可选参数，默认查询返回前 5 个结果
properties_list = exe_query(query,top_k)


Cache hit for query: Text2SQL面临哪些挑战?
Using cached data for query: Text2SQL面临哪些挑战?


In [118]:
# 插入数据到数据库 
insert_into_database(collection_name, 'ref_id', properties_list)


context_properties_list = [{
    "refId": item.get('ref_id'),
    "paperId": item.get('paper_id'),
    "chunkId": item.get('chunk_id'),
    "chunkText": item.get('chunk_text')
} for item in properties_list]

insert_into_database(collection_name_context, 'refId', context_properties_list)


In [119]:
 # Semantic path is not yet supported by the V4 client. Please use a raw GraphQL query instead.
response = client.graphql_raw_query(
  """
{  
  Get {
    AtomgitPapersContext(
      nearText:{
        concepts: ["Text2SQL", "research", "challenges"], 
        distance: 0.23, 
        moveAwayFrom: {
          concepts: ["food"],
          force: 0.45
        },
        moveTo: {
          concepts: ["SQL", "natural language processing", "query generation"],
          force: 0.85
        }
      }, 
      limit: 25
    ) {
     refId
     chunkText
     _additional {
        semanticPath {
          path {
            concept
            distanceToNext
            distanceToPrevious
            distanceToQuery
            distanceToResult
          }
        }
      }
    }
  }
}


  """
)


In [120]:
# Import necessary libraries
import matplotlib.pyplot as plt
import networkx as nx 
from matplotlib import rcParams
import os
import copy

# 设置 HTTP 和 HTTPS 代理
os.environ["HTTP_PROXY"] = "http://127.0.0.1:7890"
os.environ["HTTPS_PROXY"] = "http://127.0.0.1:7890"


# 设置中文字体为 SimHei（黑体）
rcParams['font.sans-serif'] = ['DejaVu Serif']  # Example
 
# rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

# Function to create a graph from semantic path data
def create_caption_graph(data):
    G = nx.DiGraph()
    for i, node in enumerate(data):
        concept = node["concept"]
        G.add_node(concept, distanceToQuery=node["distanceToQuery"], distanceToResult=node["distanceToResult"])
        if i < len(data) - 1:
            next_concept = data[i + 1]["concept"]
            G.add_edge(concept, next_concept, weight=node["distanceToNext"])
    return G

# 图形绘制函数，带上下翻译对比
def plot_graph_with_caption(G, title, caption, pos=None, translate=True):
    translated_title, translated_caption = title, caption
    
    if translate:
        # 自动翻译标题和说明
        translated_title = title
        # translated_caption = GoogleTranslator(source="en", target="zh-CN").translate(caption)
        translated_caption = caption

    plt.figure(figsize=(10, 10))
    pos = pos or nx.spring_layout(G, seed=42)  # 节点布局
    nx.draw(
        G, pos, with_labels=True, node_color="lightblue", edge_color="gray",
        node_size=2000, font_size=10
    )
    labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(G, pos, edge_labels={k: f"{v:.3f}" for k, v in labels.items()})

    # 显示翻译和原文对比
    plt.title(f"{translated_title}\n(Original: {title})", fontsize=14, pad=20)
    plt.figtext(
        0.5, 0.01,
        f"Translated Caption: {translated_caption}\n(Original Caption: {caption})",
        wrap=True, horizontalalignment='center', fontsize=12
    )
    plt.show()

     
 
# Translate concept names using an API
def translate_concepts(path):
        
    from deep_translator import GoogleTranslator
    translated_path = []
    for node in path:
        try:
            original_concept = node["concept"] 
            # 使用 Google 翻译
            translated_concept = GoogleTranslator(source="en", target="zh-CN").translate(original_concept) 
            node["concept"] = f"{original_concept} - {translated_concept}"
        except Exception as e: 
            print(f"Error translating {node['concept']}: {e}")
        translated_path.append(node)
    return translated_path

# Updated caption graph function
def create_and_plot_all(data_list):
    for i, context in enumerate(data_list):
        # semantic_path = translate_concepts(context["_additional"]["semanticPath"]["path"])
        semantic_path = context["_additional"]["semanticPath"]["path"]
        caption = context["chunkText"]
        graph = create_caption_graph(semantic_path)
        title = f"Semantic Path Visualization {i+1}"
        plot_graph_with_caption(graph, title, caption)



# Process and visualize all examples
data_list = response.get[collection_name_context]
data_list_copy = copy.deepcopy(data_list)
# create_and_plot_all(data_list_copy)


In [125]:
from pyvis.network import Network
from IPython.core.display import display, HTML
import networkx as nx
import json

 
# Function to find root nodes and their properties
def find_root_nodes(data):
    # Create a directed graph using NetworkX
    G = nx.DiGraph()
    node_properties = []

    for context in data: 
        path = context["_additional"]["semanticPath"]["path"]
        for i, node in enumerate(path):
            concept = node["concept"]
             
            # Add node with its properties
            if concept not in G.nodes:
                G.add_node(concept)
            if i < len(path) - 1:
                next_concept = path[i + 1]["concept"]
                if next_concept not in G.nodes:
                    G.add_node(next_concept) 
                G.add_edge(concept, next_concept)

    # Find nodes with in-degree 0 (root nodes)
    root_nodes = [node for node, in_degree in G.in_degree() if in_degree == 0]
      
    return root_nodes

# Function to find concepts with more than 3 outgoing edges
def find_high_outdegree_concepts(data, edge_threshold=3):
    # Create a directed graph using NetworkX
    G = nx.DiGraph()

    for context in data:
        path = context["_additional"]["semanticPath"]["path"]
        for i, node in enumerate(path):
            concept = node["concept"]
            if concept not in G.nodes:
                G.add_node(concept)
            if i < len(path) - 1:
                next_concept = path[i + 1]["concept"]
                if next_concept not in G.nodes:
                    G.add_node(next_concept)
                G.add_edge(concept, next_concept)

    # Find nodes with outgoing edges greater than the threshold
    high_outdegree_concepts = [node for node, out_degree in G.out_degree() if out_degree > edge_threshold]
    return high_outdegree_concepts


# Function to find nodes with in-degree greater than a threshold
def find_high_indegree_nodes(data, threshold=3):
    # Create a directed graph using NetworkX
    G = nx.DiGraph() 
    for context in data: 
        path = context["_additional"]["semanticPath"]["path"]
        for i, node in enumerate(path):
            concept = node["concept"]
 
            # Add node with its properties
            if concept not in G.nodes:
                G.add_node(concept)
            if i < len(path) - 1:
                next_concept = path[i + 1]["concept"]
                if next_concept not in G.nodes:
                    G.add_node(next_concept)
                G.add_edge(concept, next_concept)
 
    # Find nodes with in-degree greater than the threshold
    high_indegree_nodes = [
        node for node, in_degree in G.in_degree() if in_degree > threshold
    ]
    return high_indegree_nodes

def get_color(concept, root_nodes_with_concept, high_outdegree_concepts, high_indegree_concepts):
    # 高亮根节点
    if concept in root_nodes_with_concept:
        color = "green"  # 绿色表示根节点
    # 高亮出度较大的节点
    elif concept in high_outdegree_concepts:
        color = "red"  # 红色表示出度较大的节点
    elif concept in high_indegree_concepts:
        color = "blue"  # 蓝色表示出度较大的节点
    else:
        color = "lightblue"  # 默认节点颜色 

    return color
        
# 修改 create_interactive_graph 函数以标注特定节点
def create_interactive_graph(data, root_nodes_with_concept, high_outdegree_concepts, high_indegree_concepts, filename="semantic_path_interactive.html"):
    net = Network(height="750px", width="100%", directed=True)
 
    for context in data:
        # 直接使用原始路径数据
        path = context["_additional"]["semanticPath"]["path"]
        nodes_added = set()  # Track added nodes
        for i, node in enumerate(path):
            concept = node["concept"]

           
            if concept not in nodes_added:
                color = get_color(concept, root_nodes_with_concept, high_outdegree_concepts, high_indegree_concepts) 
                net.add_node(concept, label=concept, color=color)
                nodes_added.add(concept)

            if i < len(path) - 1:
                next_concept = path[i + 1]["concept"]
                # Ensure the next concept exists before adding the edge
                if next_concept not in nodes_added:
                    
                    color = get_color(next_concept, root_nodes_with_concept, high_outdegree_concepts, high_indegree_concepts) 
                    net.add_node(next_concept, label=next_concept, color=color)
                    nodes_added.add(next_concept)
                net.add_edge(concept, next_concept, title=f"Distance: {node['distanceToNext']:.3f}")

    # 保存 HTML 文件
    net.write_html(filename)

    # 在 Jupyter 中显示 HTML
    iframe = f'<iframe src="{filename}" width="100%" height="750px" frameborder="0"></iframe>'
    display(HTML(iframe))
    
# Example usage
root_nodes = find_root_nodes(data_list_copy)
print("Root nodes:", json.dumps(root_nodes))

# Example usage
high_outdegree_concepts = find_high_outdegree_concepts(data_list_copy)
print("Concepts with more than 3 outgoing edges:", high_outdegree_concepts)


# Example usage
high_indegree_concepts = find_high_indegree_nodes(data_list_copy)
print("Concepts with more than 3 going edges:", high_indegree_concepts)


create_interactive_graph(
    data_list_copy,
    root_nodes_with_concept=root_nodes,
    high_outdegree_concepts=high_outdegree_concepts,
    high_indegree_concepts=high_indegree_concepts,
    filename="gpt4o_semantic_path_interactive.html"
)
    

Root nodes: ["implement", "inclusion", "experts"]
Concepts with more than 3 outgoing edges: ['framework', 'database', 'enabling']
Concepts with more than 3 going edges: ['framework', 'overview', 'enabling', 'expertise']


  from IPython.core.display import display, HTML


In [127]:

# Function to find concepts with more than 3 outgoing edges
def create_G(data, root_nodes_with_concept, high_outdegree_concepts, high_indegree_concepts):
    # 创建有向图
    G = nx.DiGraph()
 
    for context in data:
        path = context["_additional"]["semanticPath"]["path"]
        refId = context.get("refId", None)  # 从context中获取refId字段
        nodes_added = set()  # 用于在同一个context中避免重复添加同一节点属性
        
        for i, node in enumerate(path):
            concept = node["concept"]
            
            # 如果节点在全局图中不存在，则新建节点并添加属性
            if concept not in G:
                color = get_color(concept, root_nodes_with_concept, high_outdegree_concepts, high_indegree_concepts)
                # 将refId信息存入节点属性中。如果同一节点在多个context中出现，可以存成列表/集合
                G.add_node(concept, label=concept, color=color, refId=[refId] if refId else [])
            else:
                # 如果节点已存在，且有新的refId信息，合并进去（去重）
                if refId and refId not in G.nodes[concept]['refId']:
                    G.nodes[concept]['refId'].append(refId)
            
            nodes_added.add(concept)

            if i < len(path) - 1:
                next_concept = path[i + 1]["concept"]
                if next_concept not in G:
                    color = get_color(next_concept, root_nodes_with_concept, high_outdegree_concepts, high_indegree_concepts)
                    G.add_node(next_concept, label=next_concept, color=color, refId=[refId] if refId else [])
                else:
                    # 如果下一个概念节点已存在，更新refId信息
                    if refId and refId not in G.nodes[next_concept]['refId']:
                        G.nodes[next_concept]['refId'].append(refId)

                # 增加边的属性（这里已有Distance信息）
                G.add_edge(concept, next_concept, title=f"Distance: {node['distanceToNext']:.3f}")

    return G



def find_simple_path(G, root_nodes_with_concept, high_outdegree_concepts):
        
    # 筛选出那些包含红色节点的路径
    result_paths = []
    for root in root_nodes_with_concept:
        for out in high_outdegree_concepts:
            if nx.has_path(G, root, out):
                # 寻找从 "root" 到 "out" 的所有简单路径
                paths = nx.all_shortest_paths(G, source=root, target=out)
                
                for path in paths:
                    # 判断路径中是否存在 color='red' 的节点
                    if any(G.nodes[node].get("color") == "red" for node in path):
                        if len(path) == 3:
                            result_paths.append(path)
            else:
                print(f"No path between {root} and {out}")
                continue
                
    return result_paths

G = create_G(data_list_copy,  root_nodes_with_concept=root_nodes,
    high_outdegree_concepts=high_outdegree_concepts,
    high_indegree_concepts=high_indegree_concepts)



result_paths = find_simple_path(G,  root_nodes_with_concept=root_nodes,
    high_outdegree_concepts=high_outdegree_concepts)

print("Root nodes:", json.dumps(root_nodes))
 
print("Concepts with more than 3 outgoing edges:", high_outdegree_concepts)
 
print("Concepts with more than 3 going edges:", high_indegree_concepts)

# result_paths 中即为符合条件的路径列表

refIds = []
# 输出路径以及路径中节点的refId信息
for p in result_paths:
    print("Path: " + " -> ".join(p))
    # 输出路径中每个节点的refId字段信息
    for node in p:
        print(f"Node: {node}, refIds: {G.nodes[node].get('refId')}")
        refIds.extend(G.nodes[node].get('refId'))

  


No path between inclusion and database
No path between experts and database
Root nodes: ["implement", "inclusion", "experts"]
Concepts with more than 3 outgoing edges: ['framework', 'database', 'enabling']
Concepts with more than 3 going edges: ['framework', 'overview', 'enabling', 'expertise']
Path: implement -> database -> framework
Node: implement, refIds: ['6576dccf939a5f40821c2429', '646d8642d68f896efa0a3040']
Node: database, refIds: ['6461b9c9d68f896efad43133', '60cda6c991e011329faa252c', '6576dccf939a5f40821c2429', '64702deed68f896efa5202bb', '6584feac939a5f4082397b62', '646c3addd68f896efa5d1766']
Node: framework, refIds: ['60cda6c991e011329faa252c', '6461b9c9d68f896efad43133', '6576dccf939a5f40821c2429', '646c3addd68f896efa5d1766', '6584feac939a5f4082397b62', '654b5b88939a5f40823c017f', '65406320939a5f40826491aa']


In [135]:

collection = client.collections.get(collection_name)

In [144]:
from weaviate.classes.query import Rerank, MetadataQuery
response = collection.query.fetch_objects(
    filters=Filter.by_property("ref_id").contains_any(refIds),
    rerank=Rerank(
        prop="chunk_text",
        query=query
    ),
    return_metadata=MetadataQuery(score=True, explain_score=True, distance=True),
)

chunk_texts = []
for o in response.objects:
    print(o.metadata)
    if o.properties['chunk_text'] not in chunk_texts:

        chunk_texts.append(o.properties['chunk_text'])
        

TypeError: _FetchObjectsQueryAsync.fetch_objects() got an unexpected keyword argument 'rerank'

In [134]:
from sentence_transformers import CrossEncoder

model = CrossEncoder(
    "/mnt/ceph/develop/jiawei/model_checkpoint/jina-reranker-v2-base-multilingual",
    automodel_args={"torch_dtype": "auto"},
    trust_remote_code=True,
)


  def forward(
  def backward(ctx, dout, *args):


In [141]:


# Example query and documents
query = "Text2SQL研究现状如何？"
 
# construct sentence pairs
sentence_pairs = [[query, doc] for doc in chunk_texts]

scores = model.predict(sentence_pairs, convert_to_tensor=True).tolist()
 
scores

[0.2392578125,
 0.23828125,
 0.154296875,
 0.1025390625,
 0.1923828125,
 0.055908203125,
 0.09521484375,
 0.1083984375,
 0.07373046875,
 0.10986328125,
 0.0771484375,
 0.0927734375,
 0.09423828125,
 0.09033203125,
 0.1416015625,
 0.431640625,
 0.365234375,
 0.373046875,
 0.185546875,
 0.322265625]

In [143]:

rankings = model.rank(query, chunk_texts, return_documents=True, convert_to_tensor=True)
print(f"Query: {query}")
for ranking in rankings:
    print(f"ID: {ranking['corpus_id']}, Score: {ranking['score']:.4f}, Text: {ranking['text']}")
    break


Query: Text2SQL研究现状如何？
ID: 15, Score: 0.4316, Text: # Text2Analysis: A Benchmark of Table Question Answering with Advanced Data Analysis and Unclear Queries
Xinyi $\mathbf{H}\mathbf{e}^{1*}$ , Mengyu Zhou 2† , Xinrun $\mathbf{X}\mathbf{u}^{3*}$ , Xiaojun $\mathbf{M}\mathbf{a}^{2}$ , Rui $\mathbf{Ding}^{2}$ , Lun $\mathbf{D}\mathbf{u}^{2}$ ,Yan $\mathbf{Gao}^{2}$ , Ran $\mathbf{Jia}^{2}$ , Xu Chen 2 , Shi $\mathbf{H}\mathbf{a}\mathbf{n}^{2}$ , Zejian Yuan 1 , Dongmei Zhang 2  

1 Xi’an Jiaotong University, 2 Microsoft, 3 Institute of Software Chinese Academy of Science hxyhxy $@$ stu.xjtu.edu.cn, xuxinrun $20\,\@$ mails.ucas.ac.cn, yuan.ze.jian $@$ xjtu.edu.cn, {mezho, xiaojunma, juding, lun.du, gaoya, raji, xu.chen, shihan,dongmeiz }@microsoft.com

# Abstract
Tabular data analysis is crucial in various fields, and large language models show promise in this area. However, current research mostly focuses on rudimentary tasks like Text2SQL and TableQA, neglecting advanced analysis like fo

In [132]:
from weaviate.classes.query import MetadataQuery, Rerank

prompt = "回答问题：Text2SQL研究现状如何？\r\n参考文本:\r\n{chunk_text} " 
   
response = collection.generate.fetch_objects( 
    filters=Filter.by_property("ref_id").contains_any(refIds[0:2]),
    limit=1,
    single_prompt=prompt
)

print(len(response.objects))
for o in response.objects:
    print(o.uuid)
    print(o.properties)  
    print(o.metadata.score, o.metadata.distance, o.metadata.explain_score)
    print(o.generated)

1
fe35af17-87f4-4e6c-ae3b-38ff040bbb4b
{'paper_title': 'Exploring Chain-of-Thought Style Prompting for Text-to-SQL', 'chunk_id': 0.0, 'ref_id': '646d8642d68f896efa0a3040', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db', 'paper_id': '646d8642d68f896efa0a3040', 'chunk_text': '# Exploring Chain-of-Thought Style Prompting for Text-to-SQL\nChang-You Tai, Ziru Chen, Tianshu Zhang, Xiang Deng, Huan Sun The Ohio State University {tai.97, chen.8336, zhang.11535, deng.595, sun.397}@osu.edu\n\n# Abstract\nConventional supervised approaches for textto-SQL parsing often require large amounts of annotated data, which is costly to obtain in practice. Recently, in-context learning with large language models (LLMs) has caught increasing attention due to its superior few-shot performance in a wide range of tasks. However, most attempts to use in-context learning for text-to-SQL parsing still lag behind supervised methods. We hypothesize that the underperformance is because tex

In [20]:
from weaviate.classes.query import MetadataQuery, Rerank

prompt = "回答问题：Text2SQL研究现状如何？\r\n参考文本:\r\n{chunk_text} "
  
response = collection.generate.hybrid(
    query="Text2SQL研究现状如何",   
    limit=2,
    alpha=0.64, 
    rerank=Rerank(
        prop="chunk_text",
        query="Text2SQL,texttosql"
    ),
    return_metadata=MetadataQuery(distance=True, explain_score=True,score=True),
    single_prompt=prompt
)

print(len(response.objects))
for o in response.objects:
    print(o.uuid)
    print(o.properties)  
    print(o.metadata.score, o.metadata.distance, o.metadata.explain_score)
    print(o.generated)

KeyboardInterrupt: 

In [None]:
from weaviate.classes.query import MetadataQuery, Rerank

prompt = "回答问题：Text2SQL面临哪些挑战？\r\n参考文本:\r\n{chunk_text} "
  
response = collection.generate.hybrid(
      query="Text2SQL面临哪些挑战",   
    limit=1,
    alpha=0.64, 
    rerank=Rerank(
        prop="chunk_text",
        query="Text2SQL,texttosql"
    ),
    return_metadata=MetadataQuery(distance=True, explain_score=True,score=True),
    single_prompt=prompt
)

print(len(response.objects))
for o in response.objects:
    print(o.uuid)
    print(o.properties)  
    print(o.metadata.score, o.metadata.distance, o.metadata.explain_score)
    print(o.generated)

_RawGQLReturn(aggregate={}, explore={}, get={}, errors=[{'locations': [{'column': 4, 'line': 4}], 'message': 'Cannot query field "atomgitPapers" on type "GetObjectsObj". Did you mean "Atomgit_papers_"?', 'path': None}])