In [4]:
from tqdm import tqdm
import torch.nn.functional as F
import pandas as pd
import random
import json
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from setting import set_seed
set_seed(42)


In [5]:
# k: number of hop words kept per entity column in every exported dataset
# (passed as the `k` argument to each filter_* call below).
k=5

In [3]:
# Names of the (initially empty) columns that each experiment fills in.
new_columns = ['sbj_hop_test', 'obj_true_hop_test', 'obj_new_hop_test']

# Load the preprocessed frame, drop rows with any missing value, renumber the
# rows from 0 and append the empty output columns.
df_base = (
    pd.read_excel("../../data/one_hop/BERT_FREQ1_preprocessed_df2.xlsx")
    .dropna()
    .reset_index(drop=True)
    .assign(**dict.fromkeys(new_columns, ''))
)

In [6]:
# Parse the JSONL file: one JSON document per line, kept in file order.
with open('../../counterfact_memit.jsonl', 'r', encoding='utf-8') as file:
    data = list(map(json.loads, file))

In [7]:
def extract(df, data):
    """Build one JSON-serialisable record per row of ``df``.

    Rows are read positionally, so ``df`` may carry any index (for example a
    filtered or sliced frame); the index labels themselves are never used.

    Args:
        df: DataFrame with the columns ``index``, ``subject``, ``obj_true``,
            ``obj_new``, ``sbj_hop_test``, ``obj_true_hop_test`` and
            ``obj_new_hop_test``. The three ``*_hop_test`` columns hold
            comma-separated strings.
        data: Sequence indexed by the values of ``df['index']``. Each entry
            provides ``requested_rewrite`` (with ``prompt`` and
            ``relation_id``), ``paraphrase_prompts`` and
            ``neighborhood_prompts``.

    Returns:
        A list of dicts, one per row and in row order. The ``*_hop_test``
        values are split on ``','``; an empty string therefore yields ``['']``.
    """
    columns = [
        'index', 'subject', 'obj_true', 'obj_new',
        'sbj_hop_test', 'obj_true_hop_test', 'obj_new_hop_test',
    ]
    # Pull each needed column out once and walk them in lock-step; this avoids
    # label-based lookups, which only work when the index is exactly 0..n-1.
    rows = zip(*(df[name].tolist() for name in columns))

    result = []
    for case_idx, subject, obj_true, obj_new, sbj_hops, obj_true_hops, obj_new_hops in rows:
        dt = data[case_idx]
        rewrite = dt['requested_rewrite']

        result.append({
            "case_id": str(case_idx),
            "prompt": rewrite['prompt'],
            "subject" : subject,
            "fact_knowledge" : obj_true,
            "edited_knowledge" : obj_new,
            "relation_id": rewrite['relation_id'],
            # Only the first paraphrase / neighbourhood prompt is exported.
            "rephrased_prompt" : dt['paraphrase_prompts'][0],
            "locality_prompt" : dt['neighborhood_prompts'][0],
            "locality_ground_truth" : obj_true,

            "sbj_hop_test" : sbj_hops.split(','),
            "obj_true_hop_test" : obj_true_hops.split(','),
            "obj_new_hop_test" : obj_new_hops.split(','),
        })
    return result

# 1. Random 5개

In [6]:
# Work on a copy so the *_hop_test columns written for this experiment do not
# modify df_base.
df = df_base.copy()

In [7]:
def filter_a(word_list, k=10):
    """Draw ``k`` words at random from a comma-separated string.

    Args:
        word_list: Words joined by ``','``.
        k: Number of words to draw (without replacement).

    Returns:
        The sampled words joined by ``','``. If the string holds fewer than
        ``k`` words it is returned unchanged.
    """
    candidates = word_list.split(',')
    if len(candidates) >= k:
        return ','.join(random.sample(candidates, k))
    # Not enough words to sample from: hand the input back as-is.
    return word_list

# (output column, source column) pairs, processed in this order for each row.
hop_columns = (
    ('sbj_hop_test', 'sbj_one_hop'),
    ('obj_true_hop_test', 'obj_one_hop'),
    ('obj_new_hop_test', 'obj_new_one_hop'),
)
# Rows are handled one at a time, so the random draws happen row by row.
for i in range(len(df)):
    for target, source in hop_columns:
        df.loc[i, target] = filter_a(df.loc[i, source], k)

In [8]:
df_a = extract(df, data)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("df_exp1_a_1000.json", "w", encoding="utf-8") as json_file:
    # extract() emits one record per row in row order, so the first 1000
    # records of the full result are the records of the first 1000 rows.
    json.dump(df_a[:1000], json_file, indent=4, ensure_ascii=False)

with open("df_exp1_a_full.json", "w", encoding="utf-8") as json_file:
    json.dump(df_a, json_file, indent=4, ensure_ascii=False)

# 2. BERT 유사도 상위 5개

In [9]:
# Work on a copy so the *_hop_test columns written for this experiment do not
# modify df_base.
df = df_base.copy()

In [10]:
def filter_b(word_list, k=10):
    """Keep the first ``k`` words of a comma-separated string.

    Args:
        word_list: Words joined by ``','``.
        k: Maximum number of leading words to keep.

    Returns:
        The leading words joined by ``','``; a string with fewer than ``k``
        words comes back unchanged.
    """
    # Slicing already copes with lists shorter than k, and re-joining an
    # unsliced split reproduces the input exactly.
    head = word_list.split(',')[:k]
    return ','.join(head)

# (output column, source column) pairs, processed in this order for each row.
hop_columns = (
    ('sbj_hop_test', 'sbj_hop'),
    ('obj_true_hop_test', 'obj_true_hop'),
    ('obj_new_hop_test', 'obj_new_hop'),
)
for i in tqdm(range(len(df))):
    for target, source in hop_columns:
        df.loc[i, target] = filter_b(df.loc[i, source], k)

100%|██████████| 21782/21782 [00:04<00:00, 5039.99it/s]


In [11]:
df_b = extract(df, data)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("df_exp1_b_1000.json", "w", encoding="utf-8") as json_file:
    # extract() emits one record per row in row order, so the first 1000
    # records of the full result are the records of the first 1000 rows.
    json.dump(df_b[:1000], json_file, indent=4, ensure_ascii=False)

with open("df_exp1_b_full.json", "w", encoding="utf-8") as json_file:
    json.dump(df_b, json_file, indent=4, ensure_ascii=False)
    

# 3. 빈도수 하위 5개 단어

In [12]:
# Work on a copy so the *_hop_test columns written for this experiment do not
# modify df_base.
df = df_base.copy()

In [13]:
def filter_c(word_list, k = 10):
    """Keep the last ``k`` words of a comma-separated string, last word first.

    Args:
        word_list: Words joined by ``','``.
        k: Maximum number of trailing words to keep.

    Returns:
        Up to ``k`` words taken from the end of the list, in reverse order,
        joined by ``','``.
    """
    # Per the original author's note, the list was stored sorted in descending
    # order, so reading it from the end yields the lowest-ranked words first.
    tail_first = list(reversed(word_list.split(',')))
    return ','.join(tail_first[:k])

# (output column, source column) pairs, processed in this order for each row.
hop_columns = (
    ('sbj_hop_test', 'sbj_one_hop'),
    ('obj_true_hop_test', 'obj_one_hop'),
    ('obj_new_hop_test', 'obj_new_one_hop'),
)
for i in range(len(df)):
    for target, source in hop_columns:
        df.loc[i, target] = filter_c(df.loc[i, source], k)

In [14]:
df_c = extract(df, data)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("df_exp1_c_1000.json", "w", encoding="utf-8") as json_file:
    # extract() emits one record per row in row order, so the first 1000
    # records of the full result are the records of the first 1000 rows.
    json.dump(df_c[:1000], json_file, indent=4, ensure_ascii=False)

with open("df_exp1_c_full.json", "w", encoding="utf-8") as json_file:
    json.dump(df_c, json_file, indent=4, ensure_ascii=False)
    

# 4. 유사도 10개 -> 빈도수 5개 단어

In [15]:
# Work on a copy so the *_hop_test columns written for this experiment do not
# modify df_base.
df = df_base.copy()

In [16]:
def filter_d(word_freq, word_bert, k = 10):
    """Take the first ``2*k`` words of ``word_bert`` and keep the ``k`` that sit
    closest to the end of ``word_freq``.

    Args:
        word_freq: Comma-separated words; a word's rank is its position
            counted from the end of this list.
        word_bert: Comma-separated words; only the first ``2*k`` are candidates.
        k: Number of words to return.

    Returns:
        Up to ``k`` candidates joined by ``','``, ordered by their position in
        the reversed ``word_freq`` list.

    Raises:
        ValueError: If a candidate word does not occur in ``word_freq``.
    """
    freq_from_end = word_freq.split(',')[::-1]
    candidates = word_bert.split(',')[:2*k]

    # list.index gives each candidate's position in the reversed list; the
    # sort is stable, so equal words keep their order from word_bert.
    ranked = sorted(candidates, key=freq_from_end.index)
    return ','.join(ranked[:k])

# (output column, word_freq source, word_bert source), processed in this order
# for each row.
hop_columns = (
    ('sbj_hop_test', 'sbj_one_hop', 'sbj_hop'),
    ('obj_true_hop_test', 'obj_one_hop', 'obj_true_hop'),
    ('obj_new_hop_test', 'obj_new_one_hop', 'obj_new_hop'),
)
for i in tqdm(range(len(df))):
    for target, freq_source, bert_source in hop_columns:
        df.loc[i, target] = filter_d(df.loc[i, freq_source], df.loc[i, bert_source], k)

100%|██████████| 21782/21782 [00:05<00:00, 3950.55it/s]


In [17]:
df_d = extract(df, data)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("df_exp1_d_1000.json", "w", encoding="utf-8") as json_file:
    # extract() emits one record per row in row order, so the first 1000
    # records of the full result are the records of the first 1000 rows.
    json.dump(df_d[:1000], json_file, indent=4, ensure_ascii=False)
with open("df_exp1_d_full.json", "w", encoding="utf-8") as json_file:
    json.dump(df_d, json_file, indent=4, ensure_ascii=False)

# 5. 빈도수 10개 -> 유사도 5개 단어

In [22]:
# Work on a copy so the *_hop_test columns written for this experiment do not
# modify df_base.
df = df_base.copy()

In [23]:
def filter_e(word_freq, word_bert, k = 10):
    """Take the last ``2*k`` words of ``word_freq`` and keep the ``k`` that
    appear earliest in ``word_bert``.

    Args:
        word_freq: Comma-separated words; the candidates are its last ``2*k``
            entries, read from the end.
        word_bert: Comma-separated words; a candidate's rank is its position in
            this list.
        k: Number of words to return.

    Returns:
        Up to ``k`` candidates joined by ``','``, ordered by their position in
        ``word_bert``.

    Raises:
        ValueError: If a candidate word does not occur in ``word_bert``.
    """
    candidates = word_freq.split(',')[::-1][:2*k]
    bert_order = word_bert.split(',')

    # list.index gives each candidate's position in word_bert; the sort is
    # stable, so equal words keep their order from the reversed word_freq.
    ranked = sorted(candidates, key=bert_order.index)
    return ','.join(ranked[:k])

# (output column, word_freq source, word_bert source), processed in this order
# for each row.
hop_columns = (
    ('sbj_hop_test', 'sbj_one_hop', 'sbj_hop'),
    ('obj_true_hop_test', 'obj_one_hop', 'obj_true_hop'),
    ('obj_new_hop_test', 'obj_new_one_hop', 'obj_new_hop'),
)
for i in tqdm(range(len(df))):
    for target, freq_source, bert_source in hop_columns:
        df.loc[i, target] = filter_e(df.loc[i, freq_source], df.loc[i, bert_source], k)

100%|██████████| 21782/21782 [00:06<00:00, 3366.10it/s]


In [24]:
df_e = extract(df, data)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("df_exp1_e_1000.json", "w", encoding="utf-8") as json_file:
    # extract() emits one record per row in row order, so the first 1000
    # records of the full result are the records of the first 1000 rows.
    json.dump(df_e[:1000], json_file, indent=4, ensure_ascii=False)
with open("df_exp1_e_full.json", "w", encoding="utf-8") as json_file:
    json.dump(df_e, json_file, indent=4, ensure_ascii=False)
    

In [26]:
df_base

Unnamed: 0,index,subject,subject_id,sbj_one_hop,sbj_hop_num,obj_true,obj_true_id,obj_one_hop,obj_true_hop_num,obj_new,obj_new_id,obj_new_one_hop,obj_new_hop_num,view,sbj_hop,obj_true_hop,obj_new_hop,sbj_hop_test,obj_true_hop_test,obj_new_hop_test
0,0,Danielle Darrieux,Q234149,"human,France,female,actor,voice,film actor,fil...",21,French,Q150,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",99,English,Q1860,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",175,222931,"Georges Mitsinkides,voice,Bordeaux,film,fallin...","Chad,Lebanon,Canada,Luxembourg,franska,Jura,Mo...","natural language,language,Greek,modern languag...",,,
1,1,Edwin of Northumbria,Q348955,"Brockhaus and Efron Encyclopedic Dictionary,En...",12,Christianity,Q5043,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",25,Islam,Q432,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",76,177019,"Whitby Abbey,Catholic saint,Hatfield Chase,Cwe...","Christendom,hiristiyanlik,hristiyanlik,Christ,...","Muslim,Judaism,Muhammad,major religion,humanit...",,,
2,2,Toko Yasuda,Q7813654,"human,Japan,female,musician,keyboard instrumen...",13,guitar,Q6607,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",32,piano,Q5994,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",11,74330,"Japan,musician,human,Enon,rock music,female,To...","chitarra,gitarre,Gitarre,machine head,tgu,Gita...","pianist,keyboard instrument,Klavier,A,kpf,klav...",,,
3,3,Autonomous University of Madrid,Q788091,"Spanish,Medvik,Agence universitaire de la Fran...",9,Spain,Q29,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",231,Sweden,Q34,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",212,87615,"Coalition for Advancing Research Assessment,Eu...","Community of Madrid,Madrid,Region of Murcia,es...","Denmark,Finland,Canada,Japan,Russia,Malaysia,A...",,,
4,4,Lyon,Q456,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",47,Beirut,Q3820,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",51,Manila,Q1461,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",93,2376138,"France,Milan,Francheville,Guangzhou,college to...","Marseille,Damascus,Istanbul,Montreal,Quebec Ci...","Jakarta,Beijing,Osaka,metropolis,Santiago,Sacr...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21777,21914,Georges Bernier,Q3406819,"human,France,male,writer,Paris,natural causes,...",22,French,Q150,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",99,Russian,Q7737,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",67,13177,"Odile Vaudelle,Paris,Grodada,Charlie Mensuel,F...","Jura,Luxembourg,Lebanon,France,Monaco,Switzerl...","Russia,language,natural language,rus,East Slav...",,,
21778,21915,Jean-Pierre Dionnet,Q968155,"human,France,male,Paris,journalist,television ...",7,French,Q150,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",99,Spanish,Q1321,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",67,11530,"France,journalist,comics writer,Paris,televisi...","Hainaut,Andorra,Walloon Brabant,Niger,Flemish ...","Spain,human language,Peru,natural language,Gib...",,,
21779,21916,Bong Jung-keun,Q50609,"human,male,South Korea,Seoul,Korean,baseball,b...",11,pitcher,Q1048902,"baseball,baseball player,baseball position,sof...",7,outfielder,Q1142885,"baseball,baseball position,left fielder,right ...",5,12063,"Sungkyunkwan University,South Korea,Seoul,Kore...","metalka,bowler,lanceuse,softball,baseball,base...","baseball position,baseball,left fielder,right ...",,,
21780,21917,Umayyad Caliphate,Q8575586,"Europe,Armenian Soviet Encyclopedia,New Intern...",27,Damascus,Q3766,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",54,Athens,Q1524,"Brockhaus and Efron Encyclopedic Dictionary,Sm...",120,4127429,"Byzantine Empire,Medieval Greek,Kingdom of Tol...","Syria,populated place in Syria,damascenus,Emir...","Tbilisi,Bucharest,Istanbul,Havana,Moscow,Sofia...",,,


# 6. Log prob 상위 5개

In [8]:
import pandas as pd

# Names of the (initially empty) columns filled in by the loop below.
new_columns = ['sbj_hop_test', 'obj_true_hop_test', 'obj_new_hop_test']

# Load the log-prob frame and append the empty output columns.
df = pd.read_excel("../../data/one_hop/log_prob_preprocessed_df2.xlsx").assign(
    **dict.fromkeys(new_columns, '')
)

In [10]:
def filter_f(words, k = 10):
    """Keep the first ``k`` words of a comma-separated string.

    Args:
        words: Words joined by ``','``.
        k: Maximum number of leading words to keep.

    Returns:
        The leading words joined by ``','``.
    """
    leading = words.split(',')[:k]
    return ','.join(leading)

# (output column, source column) pairs, processed in this order for each row.
hop_columns = (
    ('sbj_hop_test', 'sbj_one_hop'),
    ('obj_true_hop_test', 'obj_one_hop'),
    ('obj_new_hop_test', 'obj_new_one_hop'),
)
for i in tqdm(range(len(df))):
    for target, source in hop_columns:
        df.loc[i, target] = filter_f(df.loc[i, source], k)

100%|██████████| 21782/21782 [00:05<00:00, 3724.15it/s]


In [11]:
df.head()

Unnamed: 0,index,subject,subject_id,sbj_one_hop,sbj_hop_num,obj_true,obj_true_id,obj_one_hop,obj_true_hop_num,obj_new,obj_new_id,obj_new_one_hop,obj_new_hop_num,view,sbj_hop_test,obj_true_hop_test,obj_new_hop_test
0,0,Danielle Darrieux,Q234149,"singer,female,Bordeaux,actor,theatre,voice,hum...",21,French,Q150,"conditional,Guinea,Jersey,imperative,Luxembour...",99,English,Q1860,"feminine,Jersey,Gibraltar,masculine,Malaysia,e...",175,222931,"singer,female,Bordeaux,actor,theatre","conditional,Guinea,Jersey,imperative,Luxembourg","feminine,Jersey,Gibraltar,masculine,Malaysia"
1,1,Edwin of Northumbria,Q348955,"saint,male,sovereign,human,English,Catholic sa...",12,Christianity,Q5043,"Bible,Judaism,Jesus,Christian,Jerusalem,Christ...",25,Islam,Q432,"salah,Judaism,allegiance,Arabic,Muhammad,surre...",76,177019,"saint,male,sovereign,human,English","Bible,Judaism,Jesus,Christian,Jerusalem","salah,Judaism,allegiance,Arabic,Muhammad"
2,2,Toko Yasuda,Q7813654,"Japan,musician,female,human,rock music,keyboar...",13,guitar,Q6607,"neck,nut,long,bridge,body,fret,sound hole,mapl...",32,piano,Q5994,"A,keyboard instrument,kpf,pianist,klaver,Klavi...",11,74330,"Japan,musician,female,human,rock music","neck,nut,long,bridge,body","A,keyboard instrument,kpf,pianist,klaver"
3,3,Autonomous University of Madrid,Q788091,"Spanish,Biology Department,UAM,Medvik,MAUAM,YE...",9,Spain,Q29,"Portugal,Portuguese,Peru,Serbia,country,Malta,...",231,Sweden,Q34,"Guinea,svenska,Belarus,Iran,Swedish,Hungary,Ar...",212,87615,"Spanish,Biology Department,UAM,Medvik,MAUAM","Portugal,Portuguese,Peru,Serbia,country","Guinea,svenska,Belarus,Iran,Swedish"
4,4,Lyon,Q456,"Birmingham,Frankfurt,Leipzig,Milan,France,Euro...",47,Beirut,Q3820,"Dubai,Lebanon,Cairo,Istanbul,Marseille,Damascu...",51,Manila,Q1461,"Asia,Guam,Philippines,Taipei,Winnipeg,Santiago...",93,2376138,"Birmingham,Frankfurt,Leipzig,Milan,France","Dubai,Lebanon,Cairo,Istanbul,Marseille","Asia,Guam,Philippines,Taipei,Winnipeg"


In [12]:
df_f = extract(df, data)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("df_exp1_f_1000.json", "w", encoding="utf-8") as json_file:
    # extract() emits one record per row in row order, so the first 1000
    # records of the full result are the records of the first 1000 rows.
    json.dump(df_f[:1000], json_file, indent=4, ensure_ascii=False)
with open("df_exp1_f_full.json", "w", encoding="utf-8") as json_file:
    json.dump(df_f, json_file, indent=4, ensure_ascii=False)
    

In [13]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("df_exp1_f_5000.json", "w", encoding="utf-8") as json_file:
    json.dump(extract(df.iloc[:5000], data), json_file, indent=4, ensure_ascii=False)