In [1]:
# Read in the CRS
import json
import numpy as np
import pandas as pd

# Relative path of the "Musicians_joinable" CRS file. Note that the `for path in ...`
# loop in a later cell rebinds this name, so it does not keep this value afterwards.
path = r"./CRS/Musicians_joinable"

def process_crs(path:str):
    """Load a JSON list of matchings and encode each one as a 0/1 vector.

    The file at ``path`` must contain a JSON array of matchings, where every
    matching is itself an array of correspondences. Each correspondence must be
    convertible to a hashable tuple (e.g. a ``[source_attr, target_attr]`` pair).

    Args:
        path: Location of the JSON file.

    Returns:
        A 5-tuple ``(views, matchings, prob, correspondence_set, correspondence_count)``:
        * ``views`` -- int array of shape ``(len(matchings), len(correspondence_set))``;
          entry ``[i, j]`` is 1 when matching ``i`` contains correspondence ``j``.
        * ``matchings`` -- the parsed JSON content, unchanged.
        * ``prob`` -- uniform probability ``1 / len(matchings)`` for every matching
          (an empty array when there are no matchings).
        * ``correspondence_set`` -- distinct correspondences in first-seen order.
        * ``correspondence_count`` -- dict mapping ``tuple(correspondence)`` to 0.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        matchings = json.load(f)

    # Map each distinct correspondence to its column index. A dict keeps the
    # first-seen order and makes the membership test O(1) instead of a list scan.
    column_of = {}
    correspondence_set = []
    for match in matchings:
        for c in match:
            key = tuple(c)
            if key not in column_of:
                column_of[key] = len(correspondence_set)
                correspondence_set.append(c)

    n_matchings = len(matchings)
    views = np.zeros((n_matchings, len(correspondence_set)), dtype=int)
    for row, match in enumerate(matchings):
        for c in match:
            views[row, column_of[tuple(c)]] = 1

    # Uniform prior; guard the empty case, which previously raised ZeroDivisionError.
    if n_matchings:
        prob = np.full(n_matchings, 1.0 / n_matchings)
    else:
        prob = np.zeros(0, dtype=float)

    correspondence_count = {key: 0 for key in column_of}

    return views, matchings, prob, correspondence_set, correspondence_count

def read_correspondence_pd(source_path, target_path):
    """Read the source and target CSV files into DataFrames.

    Args:
        source_path: Path of the source CSV file.
        target_path: Path of the target CSV file.

    Returns:
        Tuple ``(source_df, target_df)``, both read with ``pd.read_csv`` defaults.
    """
    # Source is read first, then target, exactly as in a two-statement version.
    return tuple(pd.read_csv(csv_path) for csv_path in (source_path, target_path))

### Initialize the ChatGPT tokenizer, used to estimate the cost function of a correspondence

In [2]:
import tiktoken
# Tokenizer that tiktoken associates with the "gpt-3.5-turbo" model.
# NOTE(review): `encoding` is not referenced by any other visible cell; the
# requests further down use model "gpt-4-0613" -- confirm the tokenizer choice.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [3]:
# Print the number of distinct correspondences found in each Musicians CRS file.
# NOTE(review): every iteration overwrites View / matchings / prob / c_set /
# correspondence_count, so after the loop they describe the LAST file
# ("Musicians_viewunion"), while the CSV tables loaded below come from
# "Musicians_joinable". Confirm that this combination is intended.
crs_paths = [
    r"./CRS/Musicians_joinable",
    r"./CRS/Musicians_unionable",
    r"./CRS/Musicians_semjoinable",
    r"./CRS/Musicians_viewunion",
]
for path in crs_paths:
    View, matchings, prob, c_set, correspondence_count = process_crs(path)
    print(len(c_set))

# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
source_pth = r"/root/autodl-tmp/prompt-matcher-reduce-uncertainty/Valentine-datasets/Wikidata/Musicians/Musicians_joinable/musicians_joinable_source.csv"
target_pth = r"/root/autodl-tmp/prompt-matcher-reduce-uncertainty/Valentine-datasets/Wikidata/Musicians/Musicians_joinable/musicians_joinable_target.csv"
source_df, target_df = read_correspondence_pd(target_path=target_pth, source_path=source_pth)

21
39
25
21


### Generate facts after processing

In [4]:
from fact import FactSet

# One FactSet per selection strategy, all built from the same arguments.
# NOTE(review): the four instances receive the very same View / prob / len_list
# objects; verify that FactSet does not mutate them in place. The length 123 and
# ground_true=2 are unexplained constants -- TODO confirm.
len_list = np.zeros(123, dtype=int)
fact_kwargs = dict(facts=View, prior_p=prob, ground_true=2, len_list=len_list)
ex_fact, random_fact, brute_fact, heuristic_fact = (
    FactSet(**fact_kwargs) for _ in range(4)
)

## Selectors used to select correspondences

In [5]:
from query import QuerySelector, BaseQuerySelector, GreedyQuerySelector,RandomQuerySelector, HeuristicQuerySelector
# Original note (translated): "corresponds to fact1; 3 is 0.8, 0." -- meaning unclear, TODO confirm.
# Selector used by the approximate-algorithm loop below.
query_selector = GreedyQuerySelector()
# Earlier call form, kept for reference (it unpacks three return values):
# selection_idxes, sub_facts, h = query_selector.select(ex_fact, 2, accuracy, cost_func=2)
# random_selector and h_selector are not used by any visible cell.
random_selector = RandomQuerySelector()
# Selector used by the brute-force loop below.
base_selector = BaseQuerySelector()
h_selector = HeuristicQuerySelector()

In [6]:
import openai
# Send requests to this endpoint instead of the library's default API base.
openai.api_base = "https://openkey.cloud/v1"
# NOTE(review): the credential is hardcoded; "tokens" looks like a placeholder.
# Load the real key from an environment variable or a secrets store, and make
# sure a real key never gets committed with the notebook.
openai.api_key = "tokens"
def query_chatgpt(message_param, model="gpt-4-0613", temperature=0.8):
    """Send one user message to the chat-completion endpoint and return the reply text.

    Args:
        message_param: Content of the single user message.
        model: Model identifier passed to the API. Defaults to the value that
            was previously hard-coded.
        temperature: Sampling temperature passed to the API. Defaults to the
            value that was previously hard-coded.

    Returns:
        The ``content`` string of the first choice in the response.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": message_param}],
        temperature=temperature,
        # Streaming output is disabled: the full response is returned at once.
        stream=False,
    )

    return response["choices"][0]["message"]["content"]

In [7]:
def prompt_make(attribute_name1, attribute_name2, values1, values2):
    """Build the yes/no schema-matching prompt for one pair of attributes.

    Args:
        attribute_name1: Name of the first attribute.
        attribute_name2: Name of the second attribute.
        values1: Value instances shown for the first attribute.
        values2: Value instances shown for the second attribute.

    Returns:
        The prompt text with the four arguments substituted via ``str.format``.
    """
    # The wording (including trailing spaces and blank lines) is sent to the
    # model as-is and must not be altered.
    template = (
        "Determine the two attributes can be took as the same attribute in schema match. Remember some tips.\n"
        "Tips:\n"
        "(1) These two schemas are used to store Real-world information\n"
        "(2) Some letters are extracted from the full names and merged into an abbreviation word.\n"
        "(3) Schema information sometimes is also added as the prefix of abbreviation.\n"
        "(4) values exchange verification: match would be likely correct, if the second value instances are also suitable for the first attribute name.\n"
        "Input:\n"
        "First Attribute Name: {attribute_name} \n"
        "its Value instances: {values} \n"
        "\n"
        "Second Attribute Name: {attribute_name2} \n"
        "Its Value instances: {values2}. \n"
        "\n"
        "Please answer with [yes or no]"
    )
    fields = {
        "attribute_name": attribute_name1,
        "attribute_name2": attribute_name2,
        "values": values1,
        "values2": values2,
    }
    return template.format(**fields)

In [8]:
def post_p_caculate(prior_p, p_a_v, p_a):
    """Return ``prior_p * p_a_v / p_a``.

    Args:
        prior_p: Prior probability (scalar or array).
        p_a_v: Factor multiplied with the prior.
        p_a: Divisor applied to the product.
    """
    weighted_prior = prior_p * p_a_v
    posterior = weighted_prior / p_a
    return posterior

#### Our approximation algorithm

In [None]:

import numpy as np
import time
from query import get_values
cost_sum = 0  # not updated anywhere in this cell
turns = 3     # number of query rounds to run


c_len = ex_fact.num_fact()
# The same accuracy value (0.92) is used for every fact.
acc = np.array([[0.92 for i in range(c_len)]])
approx_h_list = []
# Indices that have not been selected in an earlier round.
c_index_list = [i for i in range(ex_fact.num_fact())]


while turns>0:
    print("start round")
    start = time.time()
    # Fix: the original spelled the variable `correpondence_count`, which is not
    # defined anywhere; the dict returned by process_crs is `correspondence_count`.
    selection_idxes, h = query_selector.select(ex_fact, 80, acc , cost_func=2,
                                                          target_pd=target_df, source_pd=source_df,
                                                          correspondence_count=correspondence_count, correspondence_set=c_set, c_index_list=c_index_list)
    end = time.time()
    print(selection_idxes)
    gap_time = end - start
    print(f"1 round cost {gap_time}")
    # Drop the indices chosen in this round from the remaining index list.
    c_index_list = [k for k in c_index_list if k not in selection_idxes]
    ans = []
    for c_id in selection_idxes:
        c_name = c_set[c_id]
        print(c_name)
        # Count how many times this correspondence has been queried.
        correspondence_count[tuple(c_name)]+=1
        information = 'assay'  # not read anywhere in this cell
        v1, v2 = get_values(source_pd=source_df, target_pd=target_df, correspondence_count=correspondence_count, c_name=c_name)
        prompt = prompt_make(c_name[0], c_name[1], v1, v2)
        print("prompt:",prompt)
        answer = query_chatgpt(prompt).lower()
        print(answer)
        # Any reply containing "yes" (case-insensitive) counts as a positive answer.
        if 'yes' in answer:
            ans.append(1)
        else:
            ans.append(0)
    turns -=1
    # Bayesian update of the prior with this round's answers.
    p_a,p_a_v = ex_fact.compute_ans_p(ans, selection_idxes, acc)
    p_post = ex_fact.get_prior_p()*p_a_v/p_a
    ex_fact.set_prior_p(p_post)
    approx_h_list.append(ex_fact.compute_entropy())



In [None]:
# Print the index and the correspondence for every truthy entry of the last
# row in ex_fact.facts.
last_fact = list(ex_fact.facts[-1])
for idx in range(len(last_fact)):
    if not last_fact[idx]:
        continue
    print(idx)
    print(c_set[idx])

## Random algorithm

## Brute Algorithm

In [None]:
import numpy as np
import time

# NOTE(review): `budget`, `api_use`, `gpt_check`, `ans_list` and `brute_h_list`
# are not defined in any visible cell, and `turns` is already 0 once the
# approximate-algorithm cell has finished -- reset/define them before running.
# (The original had a bare `turns` expression here, which did nothing.)

# Fix: start the timer in this cell. The original never set `start` here, so the
# reported time was measured from the last `start` assigned by an earlier cell.
start = time.time()
while turns>0:
    selection_idxes, sub_facts, h = base_selector.select(brute_fact, budget, acc)
    if api_use:
         ans = [1 if gpt_check(ix_r, c_set)=="yes" else 0 for ix_r in selection_idxes]
    else:
        ans = [1 if ans_list[ix_r]=="yes" else 0 for ix_r in selection_idxes]
    # Bayesian update of the prior with this round's answers.
    p_a,p_a_v = brute_fact.compute_ans_p(ans, selection_idxes, acc)
    p_post = brute_fact.get_prior_p()*p_a_v / p_a
    brute_fact.set_prior_p(p_post)
    brute_h_list.append(brute_fact.compute_entropy())
    turns -= 1
end = time.time()

brute_timecost = end - start
print("brute:{} s".format(brute_timecost))

approx_entropies

In [None]:
# Show the entropy values appended after each round of the approximate-algorithm loop.
approx_h_list

random_entropies

In [None]:
# Show all_h_list. NOTE(review): this name is not assigned in any visible cell.
all_h_list

In [None]:
import numpy as np
# Element-wise mean of all_h_list along axis 0, converted back to plain Python values.
# NOTE(review): all_h_list is not assigned in any visible cell.
n = np.array(all_h_list)
random_h_l = n.mean(axis=0).tolist()
random_h_l