In [1]:
import json
import numpy as np
import pandas as pd
import copy

## Load calinet probing data

In [2]:
with open('../data/calinet_probing_data_original/probing_data_trex_500each.json', 'r') as f:
    data_calinet = json.load(f)
    starter_df = pd.DataFrame(list(data_calinet['data']))

In [3]:
# starter df
starter_df.head()

Unnamed: 0,fact_id,relation,triplet,sentences
0,1,P47,"{'sub_label': 'Norfolk', 'obj_label': 'Suffolk'}","[[Norfolk shares border with <extra_id_0>., <e..."
1,2,P47,"{'sub_label': 'Jordan', 'obj_label': 'Israel'}","[[<extra_id_0> shares border with Israel., <ex..."
2,3,P47,"{'sub_label': 'Kenya', 'obj_label': 'Ethiopia'}","[[Kenya shares border with <extra_id_0>., <ext..."
3,4,P47,"{'sub_label': 'Egypt', 'obj_label': 'Israel'}","[[<extra_id_0> shares border with Israel., <ex..."
4,5,P47,"{'sub_label': 'Tanzania', 'obj_label': 'Uganda'}","[[Tanzania shares border with <extra_id_0>., <..."


In [4]:
# all of these have to do with fact id 1
# the sentences are formed in this format...
# the start of a factual sentence, involving the subject
# and then two possibilities: one true and one false?
# storing these, then, we should do something like
# sentence stem | correct | incorrect
# and we can strip out the <extra_id_x> parts
# to keep it model agnostic
starter_df['sentences'][0][0]

['Norfolk shares border with <extra_id_0>.',
 '<extra_id_0> Suffolk <extra_id_1>',
 '<extra_id_0> Upper Macungie Township <extra_id_1>']

In [5]:
# create containers to hold our clean data
sentence_stems = []
correct = []
incorrect = []
fact_ids = []
relations = []
subjects = []
objects = []

In [6]:
for index, row in starter_df.iterrows():
    sentence_list = row['sentences']
    for entry in sentence_list:
        
        # minor cleanup 
        cleaned_stem = entry[0].replace("<extra_id_0>", "[BLANK]").strip()
        cleaned_correct = entry[1].replace("<extra_id_0>", "").replace("<extra_id_1>", "").strip()
        cleaned_incorrect = entry[2].replace("<extra_id_0>", "").replace("<extra_id_1>", "").strip()
        
        # grab sub<->obj
        subjects_and_objects = pd.json_normalize(row['triplet'])
        subjects.append(subjects_and_objects.sub_label.values[0])
        objects.append(subjects_and_objects.obj_label.values[0])
        
        # commit 
        sentence_stems.append(cleaned_stem)
        correct.append(cleaned_correct)
        incorrect.append(cleaned_incorrect)
        fact_ids.append(row['fact_id'])
        relations.append(row['relation'])

In [7]:
# sanity check
assert(len(sentence_stems) ==
       len(correct) ==
       len(incorrect) ==
       len(fact_ids) ==
       len(relations) ==
      len(subjects) ==
      len(objects))

In [8]:
# merge into big df
trex_df = pd.DataFrame({'fact_id': fact_ids,
                        'relation': relations, 'subject': subjects,
                        'object': objects, 'stem': sentence_stems, 'true': correct,
                        'false': incorrect})

In [9]:
# full df
trex_df.head()

Unnamed: 0,fact_id,relation,subject,object,stem,true,false
0,1,P47,Norfolk,Suffolk,Norfolk shares border with [BLANK].,Suffolk,Upper Macungie Township
1,1,P47,Norfolk,Suffolk,Norfolk borders with [BLANK].,Suffolk,Vadena
2,1,P47,Norfolk,Suffolk,[BLANK] shares the border with Suffolk.,Norfolk,Northern Cape province
3,1,P47,Norfolk,Suffolk,[BLANK] shares its border with Suffolk.,Norfolk,Sunamganj District
4,1,P47,Norfolk,Suffolk,Norfolk shares a common border with [BLANK].,Suffolk,Anabar


In [10]:
trex_df.tail()

Unnamed: 0,fact_id,relation,subject,object,stem,true,false
142995,13000,P264,X&Y,Parlophone,[BLANK] label : Parlophone.,X&Y,Junior Hanson
142996,13000,P264,X&Y,Parlophone,"[BLANK], released by Parlophone.",X&Y,Doo-Wops & Hooligans
142997,13000,P264,X&Y,Parlophone,Parlophone recording artist [BLANK].,X&Y,Atlas Genius
142998,13000,P264,X&Y,Parlophone,Parlophone artists such as [BLANK].,X&Y,untitled 2008 album
142999,13000,P264,X&Y,Parlophone,[BLANK] artists including X&Y.,Parlophone,ATCO


In [11]:
trex_df.shape

(143000, 7)

In [12]:
print(f'Number of rows in trex_df is {trex_df.shape[0]}')

Number of rows in trex_df is 143000


In [13]:
# write out initial df to json if desired
# trex_df.to_json('../data/calinet_probing_data_original/calinet_trex_full_data.json', orient='records', lines=True)

In [14]:
# how many stems end in [BLANK]? -> 50451, or about 1/3.
c = 0
for stem in trex_df['stem']:
    if stem.endswith("[BLANK]."):
        c+=1
print(c)

50451


In [15]:
def check_for_causal_compatibility(stem):
    return stem.endswith("[BLANK].")

In [16]:
def trim_stem(stem):
    if stem.endswith("[BLANK]."):
        return stem[0: len(stem)-9]

In [17]:
trex_causal_df = trex_df[trex_df.apply(lambda x: check_for_causal_compatibility(x.stem), axis=1)]

In [18]:
trex_causal_df = trex_causal_df.copy()

In [19]:
trimmed_stems = trex_causal_df.apply(lambda x: trim_stem(x.stem), axis=1)

In [20]:
trex_causal_df['stem'] = list(trimmed_stems)

In [21]:
# only about 20% of the calinet data is 'unique' knowledge, since they used paraphrases to calibrate
len(trex_causal_df['fact_id'].unique())

11960

In [22]:
# before sampling, attach arbitrary counter ID, to then track who gets removed
trex_causal_df['calibra_id'] = range(50451)

In [23]:
trex_causal_subset = trex_causal_df.groupby('fact_id').apply(lambda x: x.sample(1, random_state=42)).reset_index(drop=True)

In [24]:
assert(trex_causal_subset.shape[0] == len(trex_causal_df['fact_id'].unique()))


In [25]:
trex_causal_subset.head()


Unnamed: 0,fact_id,relation,subject,object,stem,true,false,calibra_id
0,1,P47,Norfolk,Suffolk,Norfolk borders with,Suffolk,Vadena,1
1,2,P47,Jordan,Israel,Jordan shares a common border with,Israel,Simbach,6
2,3,P47,Kenya,Ethiopia,Kenya shares border with,Ethiopia,Yixing,7
3,4,P47,Egypt,Israel,Egypt shares its border with,Israel,"Montréal, Quebec",14
4,5,P47,Tanzania,Uganda,Tanzania borders with,Uganda,La Pampa,19


In [26]:
trex_causal_subset.tail()


Unnamed: 0,fact_id,relation,subject,object,stem,true,false,calibra_id
11955,12996,P264,Cody Wise,Interscope Records,The music label that is representing Cody Wise is,Interscope Records,Heads Up International,50411
11956,12997,P264,Amy Ray,Daemon Records,Daemon Records artists such as,Amy Ray,Hello Rockview,50426
11957,12998,P264,Martin Sorrondeguy,Lengua Armada Discos,"Martin Sorrondeguy, which is represented by",Lengua Armada Discos,Modern Day Escape,50428
11958,12999,P264,Madlib,Stones Throw,Stones Throw artists such as,Madlib,Bix Beiderbecke,50441
11959,13000,P264,X&Y,Parlophone,X&Y's label is,Parlophone,Re-Constriction Records,50444


In [27]:
removed_ids = {}
removed_counterfacts = {}
for c_id in trex_causal_df['calibra_id']:
    if c_id not in trex_causal_subset['calibra_id'].values:
        fact_id = trex_causal_df[trex_causal_df['calibra_id'] == c_id]['fact_id'].values[0]
        counterfact = trex_causal_df[trex_causal_df['calibra_id'] == c_id]['false'].values[0]
        removed_ids[str(c_id)] = int(fact_id)
        if str(fact_id) in removed_counterfacts:
            removed_counterfacts[str(fact_id)].append(counterfact)
        else:
            removed_counterfacts[str(fact_id)] = [counterfact]

# did we remove as many rows as eq to the difference between the full calinet dataset row number and the unique count?
assert(len(removed_ids) == trex_causal_df.shape[0] - len(trex_causal_df['fact_id'].unique()))

In [28]:
# these are essentially the extra false things we can test against
# that are still worth keeping
c = 0
for k, v in removed_counterfacts.items():
    print(k, v)
    c+=1
    if c == 15:
        break

1 ['Upper Macungie Township', 'Anabar', 'Riau', 'Bologna']
2 ['Mpumalanga']
3 ['James City County, Virginia', 'Portneuf', 'Rockingham County, Virginia', 'Giridih', 'Canazei']
4 ['Sestriere', 'Nitra District', 'Acerra', 'Le Havre']
5 ['Ukrainians', 'First Czechoslovak Republic', 'Ziburu']
6 ['Oliver, British Columbia', 'Kapurthala', 'ASEAN']
7 ['Vinnytsia Oblast', 'Laveno-Mombello', 'Orbassano', 'Arnhem', 'Santa Cristina Gela']
8 ['North America', 'Mogilev Region', 'New Zealand/Aotearoa', 'Phasi Charoen']
9 ['Castile La Mancha', 'Chikballapur district', 'Brewster County']
10 ['Chaumont-Gistoux', 'Magadan Oblast']
11 ['Bulakan', 'East Flanders', 'Arenys de Munt']
12 ['First Czechoslovak Republic', 'South West Africa', 'Churchill, Manitoba']
13 ['Oak Park', 'Rabun County', 'Rio de Janeiro (RJ)', 'Lower Hutt']
14 ['Liberty Village', 'Civitacampomarano', 'Sorano']
15 ['Western region', 'Sheridan Hollow']


In [29]:
# drop extraneous calibra_id column 
trex_causal_subset.drop(['calibra_id'], axis=1, inplace=True)


In [30]:
# there are some fact_id's that only have 1 row
# since we did pull stuff out based on our left to right requirement
trex_causal_subset.shape

(11960, 7)

In [31]:
len(removed_counterfacts)

10563

In [32]:
full_falses = {}
for k, v in removed_counterfacts.items():
    subset_false = trex_causal_subset[trex_causal_subset['fact_id'] == int(k)].false.values[0]
    full_falses[k] = v
    full_falses[k].append(subset_false)

print(len(full_falses))

10563


In [33]:
for k, v in full_falses.items():
    print(k,v)
    break

1 ['Upper Macungie Township', 'Anabar', 'Riau', 'Bologna', 'Vadena']


In [34]:
def replace_false_column(fact_id, false_val, full_false_dict=full_falses):
    if str(fact_id) in full_false_dict:
        return full_false_dict[str(fact_id)]
    else:
        return [false_val]

In [35]:
replaced_falses = list(trex_causal_subset.apply(lambda x: replace_false_column(x.fact_id, x.false), axis=1))


In [36]:
len(replaced_falses)


11960

In [37]:
replaced_falses[:6]


[['Upper Macungie Township', 'Anabar', 'Riau', 'Bologna', 'Vadena'],
 ['Mpumalanga', 'Simbach'],
 ['James City County, Virginia',
  'Portneuf',
  'Rockingham County, Virginia',
  'Giridih',
  'Canazei',
  'Yixing'],
 ['Sestriere', 'Nitra District', 'Acerra', 'Le Havre', 'Montréal, Quebec'],
 ['Ukrainians', 'First Czechoslovak Republic', 'Ziburu', 'La Pampa'],
 ['Oliver, British Columbia', 'Kapurthala', 'ASEAN', 'Kodanad']]

In [38]:
trex_causal_subset['false'] = replaced_falses


In [39]:
trex_causal_subset.head()

Unnamed: 0,fact_id,relation,subject,object,stem,true,false
0,1,P47,Norfolk,Suffolk,Norfolk borders with,Suffolk,"[Upper Macungie Township, Anabar, Riau, Bologn..."
1,2,P47,Jordan,Israel,Jordan shares a common border with,Israel,"[Mpumalanga, Simbach]"
2,3,P47,Kenya,Ethiopia,Kenya shares border with,Ethiopia,"[James City County, Virginia, Portneuf, Rockin..."
3,4,P47,Egypt,Israel,Egypt shares its border with,Israel,"[Sestriere, Nitra District, Acerra, Le Havre, ..."
4,5,P47,Tanzania,Uganda,Tanzania borders with,Uganda,"[Ukrainians, First Czechoslovak Republic, Zibu..."


In [40]:
trex_causal_subset.tail()

Unnamed: 0,fact_id,relation,subject,object,stem,true,false
11955,12996,P264,Cody Wise,Interscope Records,The music label that is representing Cody Wise is,Interscope Records,"[Holy Records, Heads Up International, Disney,..."
11956,12997,P264,Amy Ray,Daemon Records,Daemon Records artists such as,Amy Ray,"[So So Def Recordings, Universal Music Japan, ..."
11957,12998,P264,Martin Sorrondeguy,Lengua Armada Discos,"Martin Sorrondeguy, which is represented by",Lengua Armada Discos,"[Frontiers Records, Barnaby Records, Aggro Ber..."
11958,12999,P264,Madlib,Stones Throw,Stones Throw artists such as,Madlib,"[Green Linnet Records, Mute records, Vee Jay R..."
11959,13000,P264,X&Y,Parlophone,X&Y's label is,Parlophone,"[Angular Recording Corporation, SPV GmbH, Abac..."


In [41]:
data_calinet_input_information = {}
trex_list = trex_causal_subset.to_dict('records')
for i, entry in enumerate(trex_list):
    data_calinet_input_information[i] = trex_list[i]

In [42]:
num_pairs = 0
for x, y in data_calinet_input_information.items():
    data_calinet_input_information[x] = y 
    data_calinet_input_information[x]['false'] = list(set(y['false']))
    
    num_pairs += len(data_calinet_input_information[x]['false'])

In [43]:
print(f'Number of rows in calinet_input_info is {num_pairs}')

Number of rows in calinet_input_info is 50386


In [44]:
# write out initial df to json if desired


# write out cleaned/formatted df
#with open(
#    f"../data/ingestion_tmp_data/calinet_input_information.json", "w"
#) as outfile:
#    json.dump(output_dict, outfile)

In [45]:
# out of curiosity, which relation templates persist in the cleaned, 'causal friendly' set...
trex_causal_df['relation'].value_counts()

P495     4172
P138     3729
P264     3533
P1376    3509
P101     3279
P740     3246
P36      3215
P449     2794
P47      2265
P20      2046
P19      1760
P159     1729
P27      1717
P530     1506
P106     1497
P407     1492
P364     1457
P176     1277
P39      1268
P37      1000
P937      995
P178      995
P136      967
P463      758
P413      245
Name: relation, dtype: int64

## Load in ROME counterfact data

In [46]:
with open('../data/rome_counterfact_original/counterfact.json', 'r') as f:
    data_rome = json.load(f)

In [47]:

print(f'Number of rows in calinet_input_info is {len(data_rome)}')


Number of rows in calinet_input_info is 21919


In [48]:
data_rome_input_information = {}

for i in range(len(data_rome)):
    stem = data_rome[i]['requested_rewrite']['prompt'].replace('{}', data_rome[i]['requested_rewrite']['subject'])
    
    data_rome_input_information[str(i)] = {
        "stem": stem,
        "true": data_rome[i]['requested_rewrite']['target_true']['str'],
        "false": [data_rome[i]['requested_rewrite']['target_new']['str']],
        "case_id":  data_rome[i]['case_id']
    }

In [49]:
#data_rome_input_information

In [50]:
#with open(
#    f"../data/ingestion_tmp_data/rome_counterfact_input_information.json", "w"
#) as outfile:
#    json.dump(data_rome_input_information, outfile)

## Combine the two datasets

In [51]:
data_rome = copy.deepcopy(data_rome_input_information)
data_calinet = copy.deepcopy(data_calinet_input_information)

In [52]:
#data_calinet
#data_rome

mixed_itr = 0
mixed_df = {}

for x, y in data_calinet.items():
    y['dataset_original'] = 'calinet_input_information'
    mixed_df[str(mixed_itr)] = y

    mixed_itr+=1

for x, y in data_rome.items():
    y['dataset_original'] = 'rome_counterfact_input_information'
    mixed_df[str(mixed_itr)] = y
    mixed_itr+=1


In [53]:
itrs = 0
for x, y in mixed_df.items():
    itrs += 1

print(f'Number of rows in mixed_df is {itrs}')


Number of rows in mixed_df is 33879


In [54]:
# optionally write the mixed json to file

#with open(
    #f"../data/ingestion_tmp_data/fact_checking_full_input_information.json", "w"
#) as outfile:
    #json.dump(mixed_df, outfile)

## Convert json to pandas, then upload to huggingface

In [55]:
# test load of mixed json

#with open(
#    f"../data/ingestion_tmp_data/fact_checking_full_input_information.json", "r"
#) as outfile:
#    mixed_df = json.load(outfile)

In [56]:
pairs_list = []
for x, y in mixed_df.items():
    for itr in range(len(y['false'])):
        pairs = [y['stem'] + ' ' + y['true'] + ' ' + y['false'][itr]]
        pairs_list.append(pairs)

print(f'The number of [stem + fact + counterfact] trios in mixed_df is {len(pairs_list)}')

x_list = []
pairs_list = []
for x, y in mixed_df.items():
    pairs = [y['stem'] + ' ' + y['true']]
    pairs_list.append(pairs)

print(f'The number of [stem + fact] pairs in mixed_df is {len(pairs_list)}')

The number of [stem + fact + counterfact] trios in mixed_df is 72305
The number of [stem + fact] pairs in mixed_df is 33879


In [57]:
# update mixed_df to have all info for rome then write that out. 
mixed_df = pd.DataFrame.from_dict(mixed_df).T

In [58]:
# get rome info to look at:
with open('../data/rome_counterfact_original/counterfact.json', 'r') as f:
    data_rome_original = json.load(f)
    rome_df = pd.DataFrame.from_dict(data_rome_original)


In [59]:
# 3/20 data frame cleanup
rome_df.head()

rome_subjects = {}
rome_objects = {}
rome_relations = {}

for i, rewrite in enumerate(rome_df['requested_rewrite']):
    rome_subjects[i] = rewrite['subject']
    rome_objects[i] = rewrite['target_true']['str']
    rome_relations[i] = rewrite['relation_id']

assert(len(rome_subjects) == len(rome_objects) == len(rome_relations) == rome_df.shape[0])

In [60]:
subjects = []
objects = []
ids = []
relations = []

for row in mixed_df.iterrows():
    if row[1]['dataset_original'] == 'calinet_input_information':
        subjects.append(row[1]['subject'])
        objects.append(row[1]['object'])
        relations.append(row[1]['relation'])
        ids.append('calinet_' + str(row[1]['fact_id']))
    if row[1]['dataset_original'] == 'rome_counterfact_input_information':
        # get case id
        case_id = row[1]['case_id']
        
        # get subject
        subjects.append(rome_subjects[case_id])
        # get object
        objects.append(rome_objects[case_id])
        # get relation
        relations.append(rome_relations[case_id])
        ids.append('rome_' + str(case_id))

assert(len(subjects) == len(objects) == len(ids) == len(relations))

In [61]:
mixed_df['subject'] = subjects

In [62]:
mixed_df['object'] = objects

In [63]:
mixed_df['relation'] = relations

In [64]:
mixed_df['dataset_id'] = ids

In [65]:
mixed_df.drop(['fact_id', 'case_id', 'dataset_original'], axis=1, inplace=True)

In [66]:
assert(not mixed_df.isnull().values.any())

In [67]:
# re-arrange cols
mixed_df = mixed_df[['dataset_id', 'stem', 'true', 'false', 'relation', 'subject', 'object' ]]

In [68]:
# write to file as .csv
mixed_df.to_csv('../data/ingestion_tmp_data/fact_checking_full_input_information_3_20_23.csv', index=False)

In [69]:
mixed_df

Unnamed: 0,dataset_id,stem,true,false,relation,subject,object
0,calinet_1,Norfolk borders with,Suffolk,"[Vadena, Upper Macungie Township, Anabar, Riau...",P47,Norfolk,Suffolk
1,calinet_2,Jordan shares a common border with,Israel,"[Simbach, Mpumalanga]",P47,Jordan,Israel
2,calinet_3,Kenya shares border with,Ethiopia,"[Portneuf, Giridih, Canazei, Rockingham County...",P47,Kenya,Ethiopia
3,calinet_4,Egypt shares its border with,Israel,"[Le Havre, Sestriere, Nitra District, Montréal...",P47,Egypt,Israel
4,calinet_5,Tanzania borders with,Uganda,"[First Czechoslovak Republic, Ukrainians, Zibu...",P47,Tanzania,Uganda
...,...,...,...,...,...,...,...
33874,rome_21914,"Georges Bernier, speaker of",French,[Russian],P103,Georges Bernier,French
33875,rome_21915,The language used by Jean-Pierre Dionnet is,French,[Spanish],P1412,Jean-Pierre Dionnet,French
33876,rome_21916,Which position does Bong Jung-keun play? They ...,pitcher,[outfielder],P413,Bong Jung-keun,pitcher
33877,rome_21917,"Umayyad Caliphate's capital,",Damascus,[Athens],P36,Umayyad Caliphate,Damascus


In [70]:
# delete erroneous entries in the dataset
# these were not exhaustively searched for, some
# errors could still exist in the data

rows_to_delete = [

# llama-33b:
'rome_19765',
'calinet_9087',
'rome_9674',
'rome_13669',
'rome_17792',
'calinet_469',
'calinet_12945', 
'rome_17452', 
'rome_597',
'calinet_7656', 
'rome_16474', 
'rome_6020', 
'rome_9479', 
'calinet_5834', 
'rome_9414', 
'rome_6487', 
'rome_10852', 
'rome_14709', 
'rome_4358', 
'rome_10342', 
'calinet_12839', 
'rome_19963', 
'rome_5757', 
'rome_3604', 
'rome_8710', 
'calinet_2551', 
'rome_20688', 
'rome_15441', 
'calinet_12842', 
'calinet_9348', 
'calinet_2516', 
'calinet_12777', 
'rome_13682', 
'calinet_29',

# flan-t5-xl:
'calinet_3198',
'rome_10178',
'rome_19495',
'rome_9674',
'rome_13028',
'calinet_5452',
'rome_19963',
'calinet_2568',
'calinet_5475',
'calinet_9555',
'rome_19788',
'rome_12483',
'rome_14334',
'calinet_10778',
'rome_612',
'rome_8416',
'calinet_5133',
'calinet_5185',
'rome_1525',

# roberta-large:

# random finds
'calinet_9032',


]

# delete these rows
for i in list(mixed_df.index):
    if mixed_df.loc[i].dataset_id in list(set(rows_to_delete)):
        # print(mixed_df.loc[i].dataset_id)
        mixed_df.drop(i, axis=0, inplace=True)


In [71]:
# delete stems that end with "a" or "an"
itr = 0
for i in list(mixed_df.index):

    if (mixed_df.loc[i].stem[-2:] == ' a') or (mixed_df.loc[i].stem[-3:] == ' an'):
        itr+=1
        #print(mixed_df.loc[i].stem, ":", mixed_df.loc[i].true, mixed_df.loc[i].false)
        mixed_df.drop(i, axis=0, inplace=True)


In [72]:
mixed_df.reset_index(drop=True, inplace=True)

In [73]:
# modify erroneous errors when sandwitched with correct data
# dictionary: dataset_id and new counterfact list, with the error removed

rows_to_alter = {

# llama-33b:
'calinet_7809': {'false': "['Gaulish', 'Georgian']"},
'calinet_1917': {'false': "['theology', 'free software', 'accounting']"},
'calinet_7790': {'false': ['Hebrew', 'Swahili']},
'rome_11311': {'false': ['Russian'], 'true': 'French', 'object': 'French'},

# flan-t5-xl:

# roberta-large:

}

for key, dictionary in rows_to_alter.items():
    for column, edit in dictionary.items():
        row_ind = mixed_df[mixed_df.dataset_id==key].false.index[0]
        mixed_df.loc[row_ind, column] = edit



In [74]:
# fix small syntax and grammatical errors:

# context: "shares border with" / "shares the border with"-> "shares a border with"

for i in range(len(mixed_df)):
    if 'shares border with' in mixed_df.loc[i].stem:
        mixed_df.loc[i, "stem"] = mixed_df.loc[i].stem.replace('shares border with', 'shares a border with')

    elif 'shares the border with' in mixed_df.loc[i].stem:
        mixed_df.loc[i, "stem"] = mixed_df.loc[i].stem.replace('shares the border with', 'shares a border with')

    elif 'borders with' in mixed_df.loc[i].stem:
        mixed_df.loc[i, "stem"] = mixed_df.loc[i].stem.replace('borders with', 'shares a border with')
    
    elif 'premiered' in mixed_df.loc[i].stem:
        mixed_df.loc[i, "stem"] = mixed_df.loc[i].stem.replace('premiered', 'originally aired')        

In [75]:
print(f'Number of rows in mixed_df is {len(mixed_df)}')



Number of rows in mixed_df is 33456


In [76]:
# find any duplicates resulting from above fixes
# start with [stem + fact] pairs

pairs_list = []
pairs_list_duplicated = []
itrs_duplicated = []
for i in range(len(mixed_df)):
    pairs_print = mixed_df.loc[i].stem + " {true: " + mixed_df.loc[i].true +"}"
    pairs = (mixed_df.loc[i].stem, mixed_df.loc[i].true)
    if pairs in pairs_list:
        print('duplicate:', pairs)
        pairs_list_duplicated.append(pairs)
        itrs_duplicated.append(i)
    pairs_list.append(pairs)

print(f'\nThe number of [stem + fact] duplicates is {len(pairs_list) - len(list(set(pairs_list)))}')

# repair any duplicates resulting from above fixes
pairs_list_collect = []
for i in range(len(mixed_df)):
    pairs_print = mixed_df.loc[i].stem + " {true: " + mixed_df.loc[i].true +"}"
    pairs = (mixed_df.loc[i].stem, mixed_df.loc[i].true)
    if pairs in pairs_list_duplicated:
        pairs_list_collect.append((mixed_df.loc[i].stem, mixed_df.loc[i].true, mixed_df.loc[i].false))

new_counterfacts = {}
for element in pairs_list_collect:   
    try:
        new_counterfacts[element[0] + " " + element[1]].extend(element[2])
    except KeyError:
        new_counterfacts[element[0] + " " + element[1]] = element[2]

new_counterfacts_2 = {}
for x, y in new_counterfacts.items():
    # print(x,y)
    new_counterfacts_2[x] = list(set(y))

#new_counterfacts_2

for i in range(len(mixed_df)):
    key_item = mixed_df.loc[i].stem + " " + mixed_df.loc[i].true
    if key_item in list(new_counterfacts_2.keys()):

        mixed_df.loc[i, 'false'] = new_counterfacts_2[key_item]

mixed_df.drop_duplicates(subset=['stem', 'true'], inplace=True)
mixed_df.reset_index(drop=True, inplace=True)

duplicate: ('Austria shares a border with', 'Switzerland')
duplicate: ('Japan shares a border with', 'Taiwan')
duplicate: ('Bangladesh shares a border with', 'India')
duplicate: ('Afghanistan shares a border with', 'Pakistan')
duplicate: ('Sichuan shares a border with', 'Guizhou')
duplicate: ('Europe shares a border with', 'Asia')
duplicate: ('Seoul is the capital of', 'South Korean')
duplicate: ('The capital city of Scotland is', 'Edinburgh')
duplicate: ('Harare is the capital city of', 'Zimbabwe')
duplicate: ('London is the capital of', 'England')
duplicate: ('Northern Ireland, which has the capital', 'Belfast')
duplicate: ('Poland, which has the capital', 'Warsaw')
duplicate: ("Israel's capital city is", 'Jerusalem')
duplicate: ('The capital city of Russia is', 'Moscow')
duplicate: ('The capital city of Colorado is', 'Denver')
duplicate: ('Edinburgh is the capital of', 'Scottish')
duplicate: ('The capital city of Belarus is', 'Minsk')
duplicate: ('The capital city of Jiangsu Provinc

In [77]:
# find any duplicates remaining
pairs_list = []
for i in range(len(mixed_df)):
    pairs = (mixed_df.loc[i].stem, mixed_df.loc[i].true)
    pairs_list.append(pairs)

print(f'\nThe number of [stem + fact] duplicates is {len(pairs_list) - len(list(set(pairs_list)))}')


The number of [stem + fact] duplicates is 0


In [78]:
# make sure all counterfacts are sets
pairs_list = []
for i in range(len(mixed_df)):
    mixed_df.loc[i, 'false'] = list(set(mixed_df.loc[i, 'false']))


In [79]:
# find any trio duplicates remaining
pairs_list = []
for i in range(len(mixed_df)):
    for item in mixed_df.loc[i].false:
        pairs = (mixed_df.loc[i].stem, mixed_df.loc[i].true, item)
        pairs_list.append(pairs)

print(f'\nThe number of duplicate [stem + fact + false] trios is {len(pairs_list) - len(list(set(pairs_list)))}')


The number of duplicate [stem + fact + false] trios is 0


In [80]:
# print numbers of pairs and trios in the dataset again

pairs_list = []
for i in range(len(mixed_df)):
    for item in mixed_df.loc[i].false:
        pairs = (mixed_df.loc[i].stem, mixed_df.loc[i].true, item)
        pairs_list.append(pairs)
print(f'The number of [stem + fact + counterfact] trios in mixed_df is {len(pairs_list)}')

pairs_list = []
for i in range(len(mixed_df)):
    pairs = (mixed_df.loc[i].stem, mixed_df.loc[i].true)
    pairs_list.append(pairs)

print(f'The number of [stem + fact] pairs in mixed_df is {len(pairs_list)}')

The number of [stem + fact + counterfact] trios in mixed_df is 71700
The number of [stem + fact] pairs in mixed_df is 33295


In [81]:
print(f'Number of rows in mixed_df is {len(mixed_df)}')


Number of rows in mixed_df is 33295


In [82]:
# shuffle the df's rows (without replacement)
mixed_df = mixed_df.sample(frac=1, replace=False, random_state=42, ignore_index=True)
mixed_df

Unnamed: 0,dataset_id,stem,true,false,relation,subject,object
0,rome_18789,Suzuki Aerio is developed by,Suzuki,[Dodge],P176,Suzuki Aerio,Suzuki
1,calinet_6880,American Wrestling Association's headquarters ...,"Minneapolis, Minnesota","[Wheeling, Illinois, Mongomo, México, OneChica...",P159,American Wrestling Association,"Minneapolis, Minnesota"
2,rome_15269,The Two Babylons was created in,Scotland,[France],P495,The Two Babylons,Scotland
3,calinet_8045,"The Smashing Pumpkins, who plays",rock,"[detective film, cookbook]",P136,The Smashing Pumpkins,rock
4,calinet_8671,Aero Commander 500 family is produced by,Aero Commander,"[Mars Confectionery, Apple computer, ATARI]",P176,Aero Commander 500 family,Aero Commander
...,...,...,...,...,...,...,...
33290,rome_5138,"In Nicaragua, the language spoken is",Spanish,[German],P37,Nicaragua,Spanish
33291,calinet_7237,"Elio Fiorucci, that was started in",Milan,"[Madrid, England, Waltham, Massachusetts, Deni...",P740,Elio Fiorucci,Milan
33292,calinet_12481,"Diocletian, who has the position of",Roman Emperor,"[Strategos, North Carolina Attorney General, p...",P39,Diocletian,Roman Emperor
33293,calinet_871,Claude Flight was born in,London,"[Lebrija, Lebanon, Oregon, Lauterburg]",P19,Claude Flight,London


In [83]:
# write to file as .csv
mixed_df.to_csv('../data/ingestion_tmp_data/fact_checking_full_input_information_3_21_23.csv', index=False)

## Load final csv to HuggingFace

In [84]:
from datasets import load_dataset
data_files = {
    "English": "../data/ingestion_tmp_data/fact_checking_full_input_information_3_21_23.csv", 
}
dataset = load_dataset("csv", data_files=data_files)
dataset

Using custom data configuration default-83934004a9df2b56


Downloading and preparing dataset csv/default to /Users/danielfurman/.cache/huggingface/datasets/csv/default-83934004a9df2b56/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /Users/danielfurman/.cache/huggingface/datasets/csv/default-83934004a9df2b56/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    English: Dataset({
        features: ['dataset_id', 'stem', 'true', 'false', 'relation', 'subject', 'object'],
        num_rows: 33295
    })
})

In [85]:
dataset

DatasetDict({
    English: Dataset({
        features: ['dataset_id', 'stem', 'true', 'false', 'relation', 'subject', 'object'],
        num_rows: 33295
    })
})

In [86]:
from huggingface_hub import login
from dotenv import load_dotenv

# This reads the environment variables inside .env
load_dotenv() 
# Logs into HF hub
login(os.getenv('HF_TOKEN')) 

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /Users/danielfurman/.huggingface/token
Login successful


In [87]:
# push to hub
dataset.push_to_hub("CalibraGPT/Fact_Checking")

Pushing split English to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

In [88]:
# test loading from hub
load_dataset("CalibraGPT/Fact_Checking")

Downloading readme:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

Using custom data configuration CalibraGPT--Fact_Checking-e858a583b1d7a600


Downloading and preparing dataset None/None to /Users/danielfurman/.cache/huggingface/datasets/CalibraGPT___parquet/CalibraGPT--Fact_Checking-e858a583b1d7a600/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /Users/danielfurman/.cache/huggingface/datasets/CalibraGPT___parquet/CalibraGPT--Fact_Checking-e858a583b1d7a600/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    English: Dataset({
        features: ['dataset_id', 'stem', 'true', 'false', 'relation', 'subject', 'object'],
        num_rows: 33295
    })
})