In [23]:
from pathlib import Path
records_folder = Path("../data/output/hallucination_factchecking/records")
consolidated_folder = Path("../data/output/hallucination_factchecking/consolidated")

In [24]:
input_files = list(records_folder.glob("*.json"))
len(input_files)

5785

In [50]:
# parse the json files
import json
all_results = []
error_results = []
for in_file in input_files:
    with open(in_file, "r") as f:
        content = json.load(f)        
        results = content["result"]
        id = content["id"]
        question = results["question"]
        answer = results["answer"]
        answerable_by_context = results["answerable_by_context"]
        claims_check = results["claims_check"]
        for claim_idx, claim in enumerate(claims_check):
            claim_text = claim["claim"]
            check_type = claim["check_type"]
            checkworthy = claim["checkworthy"]
            reasoning = claim["reasoning"]
            score = claim["score"]
            links = claim.get("links", [])
            if any([keyword in " ".join(reasoning).lower() for keyword in ["unable to retrieve", "failed to get", "parse", "json", "rate limit"]]):
                error_results.append({
                "id": id,
                "claim_idx": claim_idx,
                "claim_text": claim_text,
                "question": question,
                "answer": answer,
                "check_type": check_type,
                "checkworthy": checkworthy,
                "reasoning": reasoning,
                "score": score,
                "links": links 
            })
                
            else:               
                all_results.append({
                    "id": id,
                    "claim_idx": claim_idx,
                    "claim_text": claim_text,
                    "question": question,
                    "answer": answer,
                    "check_type": check_type,
                    "checkworthy": checkworthy,
                    "reasoning": reasoning,
                    "score": score,
                    "links": links 
                })
        


KeyError: 'factcheck_result'

In [26]:
len(error_results)

107

In [27]:
import pandas as pd
df = pd.DataFrame(all_results)
df_errors = pd.DataFrame(error_results)

In [28]:
df_input = pd.read_csv("/home/watsonchua/sls-lea-evaluation/data/input/hd_fc_by_record.csv")
id_to_filename = {}
for index, row in df_input.iterrows():
    id_to_filename[index] = row["filename"]


In [29]:
# df.to_csv(consolidated_folder / "hallucination_factchecking_results.csv", index=False)

def add_topic_profile_to_df(df_in):
    df_in["filename"] = df_in["id"].map(id_to_filename)
    df_in["topic"] = df_in["filename"].apply(lambda x: x.split("_", maxsplit=1)[0])
    df_in["profile"] = df_in["filename"].apply(lambda x: x.split("_", maxsplit=1)[1][:-5])
    df_in.sort_values(by=["id", "claim_idx"], inplace=True)
    return df_in

df_processed = add_topic_profile_to_df(df)
df_errors_processed = add_topic_profile_to_df(df_errors)


df_processed.to_csv(consolidated_folder / "hallucination_factchecking_results.csv", index=False)
df_errors_processed.to_csv(consolidated_folder / "hallucination_factchecking_errors.csv", index=False)



In [30]:
print(
    len(df_processed[df_processed["check_type"] == "hallucination"]["id"].unique()),
    len(df_processed[df_processed["check_type"] == "factuality"]["id"].unique()),
    len(df_processed[df_processed["check_type"] == "NA"]["id"].unique())
)

1346 2820 4479


In [31]:
# Get all unique IDs
all_ids = df_processed["id"].unique()

# For each ID, check if all its entries have check_type as NA
ids_with_all_na = []
for id_val in all_ids:
    id_entries = df_processed[df_processed["id"] == id_val]
    if (id_entries["check_type"] == "NA").all():
        ids_with_all_na.append(id_val)

print(f"Number of IDs where all check_types are NA: {len(ids_with_all_na)}")
df_processed[df_processed["id"].isin(ids_with_all_na)].groupby("profile").size()


Number of IDs where all check_types are NA: 1618


profile
bad_english     2147
good            1103
out_of_topic    2267
unfactual       1865
dtype: int64

In [73]:
df_processed.groupby(["profile", "check_type"]).value_counts(["score"])

profile       check_type     score         
bad_english   NA             NA                3509
              factuality     PASS              1503
                             FAIL                19
              hallucination  PASS              1397
                             FAIL               266
good          NA             NA                2427
              factuality     PASS              1838
                             FAIL                37
                             PARTIALLY_PASS       1
              hallucination  PASS              1666
                             FAIL               275
out_of_topic  NA             NA                4805
              factuality     PASS              2728
                             FAIL                11
              hallucination  PASS               384
                             FAIL                57
unfactual     NA             NA                3989
              factuality     PASS              2131
                    

In [32]:
len(df_processed["id"].unique())

5784

In [33]:
# break down by claims
df_processed.groupby("check_type")["id"].count()

check_type
NA               14730
factuality        8294
hallucination     5954
Name: id, dtype: int64

In [34]:
# break down by questions
df_processed.drop_duplicates("id").groupby("check_type")["id"].count()

check_type
NA               3400
factuality       1376
hallucination    1008
Name: id, dtype: int64

In [35]:
df_processed[df_processed["check_type"]=="factuality"]["score"].value_counts()

score
PASS              8200
FAIL                91
UNVERIFIED           2
PARTIALLY_PASS       1
Name: count, dtype: int64

In [36]:
df_processed[df_processed["check_type"]=="hallucination"]["score"].value_counts()

score
PASS    5155
FAIL     799
Name: count, dtype: int64

In [37]:
df_processed["score"].value_counts()

score
NA                14730
PASS              13355
FAIL                890
UNVERIFIED            2
PARTIALLY_PASS        1
Name: count, dtype: int64

In [39]:
pass_rate = df_processed[df_processed["score"] == "PASS"].shape[0] / (df_processed[df_processed["score"] == "PASS"].shape[0] + df_processed[df_processed["score"] == "FAIL"].shape[0])
pass_rate


0.9375219375219376

In [41]:
hallucination_pass_rate = df_processed[df_processed["check_type"] == "hallucination"]["score"].value_counts()["PASS"] / (df_processed[df_processed["check_type"] == "hallucination"]["score"].value_counts()["PASS"] + df_processed[df_processed["check_type"] == "hallucination"]["score"].value_counts()["FAIL"])
print(hallucination_pass_rate)

factuality_pass_rate = df_processed[df_processed["check_type"] == "factuality"]["score"].value_counts()["PASS"] / (df_processed[df_processed["check_type"] == "factuality"]["score"].value_counts()["PASS"] + df_processed[df_processed["check_type"] == "factuality"]["score"].value_counts()["FAIL"])
print(factuality_pass_rate)



0.8658045011756802
0.9890242431552285


In [42]:
checkworthy_pass_rate = df_processed[df_processed["checkworthy"] == "PASS"].shape[0] / (df_processed[df_processed["checkworthy"] == "PASS"].shape[0] + df_processed[df_processed["checkworthy"] == "FAIL"].shape[0])
print(checkworthy_pass_rate)

0.4916833459866105


In [43]:
# Hallucination failure examples
hallucination_fail_df = df_processed[df_processed["check_type"] == "hallucination"][df_processed["score"] == "FAIL"]
hallucination_fail_df.drop(columns=["links"]).to_csv(consolidated_folder / "hallucination_fail_df.csv", index=False)

  hallucination_fail_df = df_processed[df_processed["check_type"] == "hallucination"][df_processed["score"] == "FAIL"]


In [44]:
# Factuality failure examples
factuality_fail_df = df_processed[df_processed["check_type"] == "factuality"][df_processed["score"] == "FAIL"]
factuality_fail_df.drop(columns=["links"]).to_csv(consolidated_folder / "factuality_fail_df.csv", index=False)

  factuality_fail_df = df_processed[df_processed["check_type"] == "factuality"][df_processed["score"] == "FAIL"]


## Factcheck hallucination failures

In [45]:
import pandas as pd
from pathlib import Path

hallucination_fail_fact_check_records_folder = Path("/home/watsonchua/sls-lea-evaluation/data/output/hallucination_factchecking/hallucination_fail_fact_check_records")
hallucination_fail_fact_check_records_files = list(hallucination_fail_fact_check_records_folder.glob("*.json"))

In [46]:
len(hallucination_fail_fact_check_records_files)

799

In [63]:
import json
all_hc_fc_results = []
all_hc_fc_errors = []
for file in hallucination_fail_fact_check_records_files:
    with open(file, "r") as f:
        try:
            content = json.load(f) 
        except json.JSONDecodeError as je:
            print(str(je))
            continue
        claim_id = content
        results = content["factcheck_result"]
        id = content["question_id"]
        # question = content["question"]
        # answer = content["answer"]
        # answerable_by_context = results["answerable_by_context"]
        claims_check = results["claims_check"]
        claim_idx = content["claim_id"]            
        claim_text = content["claim"]
        # claim_text = claim["claim"]
        claim = claims_check[0]
        # claim_idx = claim["claim_id"]
        check_type = claim["check_type"]
        # checkworthy = claim["checkworthy"]
        reasoning = claim["reasoning"]
        score = claim["score"]
        links = claim.get("links", [])
        if any([keyword in " ".join(reasoning).lower() for keyword in ["unable to retrieve", "failed to get", "parse", "json", "rate limit"]]):
            all_hc_fc_errors.append({
            "id": id,
            "claim_idx": claim_idx,
            "claim_text": claim_text,
            # "question": question,
            # "answer": answer,
            "check_type": check_type,
            # "checkworthy": None,
            "reasoning": reasoning,
            "score": score,
            "links": links 
        })
            
        else:               
            all_hc_fc_results.append({
                "id": id,
                "claim_idx": claim_idx,
                "claim_text": claim_text,
                # "question": question,
                # "answer": answer,
                "check_type": check_type,
                # "checkworthy": None,
                "reasoning": reasoning,
                "score": score,
                "links": links 
            })


df_hallucination_failure_checks = pd.DataFrame(all_hc_fc_results)
df_hallucination_failure_checks_processed = add_topic_profile_to_df(df_hallucination_failure_checks)

Expecting value: line 1 column 1 (char 0)


In [64]:
df_hallucination_failure_checks_processed["score"].value_counts()

score
PASS    734
FAIL     14
Name: count, dtype: int64

In [72]:
df_hallucination_failure_checks_processed.groupby(["profile"]).value_counts(["score"])

profile       score
bad_english   PASS     250
              FAIL       2
good          PASS     256
              FAIL       3
out_of_topic  PASS      54
              FAIL       1
unfactual     PASS     174
              FAIL       8
Name: count, dtype: int64

In [76]:
df_hallucination_failure_checks_processed[df_hallucination_failure_checks_processed["score"] == "FAIL"]["reasoning"].to_dict()

{15: ['Rubber bands are used in various ways in robotics, particularly in the VEX IQ Robotics platform.',
  'Rubber bands can assist with gripping mechanisms, motor efficiency, and energy storage in robots.',
  'Rubber bands can be used to drive movement and support mechanical tasks by stretching and storing potential energy.'],
 77: ['Cows need to be pregnant and give birth to initiate milk production. This process is triggered by hormonal changes associated with pregnancy and calving.',
  'Once a cow has given birth, she can continue to produce milk for an extended period through regular milking and proper nutrition, even if she is not pregnant again.',
  'The regular removal of milk from the udder is essential for sustaining continued production. If a cow is not milked, milk production will eventually stop.'],
 200: ['The search results and fetched content did not provide specific information directly linking successful metrics to the expansion of Marine Protected Areas (MPAs) or th

## Factcheck factcheck failures due to json parsing errors

In [27]:
import pandas as pd
from pathlib import Path

factuality_fail_factcheck_records_folder = Path("/home/watsonchua/sls-lea-evaluation/data/output/hallucination_factchecking/factcheck_json_fail_fact_check_records")
factuality_fail_factcheck_records_files = list(factuality_fail_factcheck_records_folder.glob("*.json"))
len(factuality_fail_factcheck_records_files)

100

In [29]:
import json

all_records = []
for file in factuality_fail_factcheck_records_files:
    try:
        data = json.load(open(file))
    except json.JSONDecodeError as e:
        print(e)
        print(file)
        continue
    all_records.append(data)

df_factuality_failure_checks = pd.DataFrame(all_records) 

In [41]:
factuality_fail_factcheck_passes = 0
factuality_fail_factcheck_fails = 0
for index, row in df_factuality_failure_checks.iterrows():
    factcheck_result = row["factcheck_result"]["claims_check"][0]["score"]
    if factcheck_result == "PASS":
        factuality_fail_factcheck_passes += 1
    elif factcheck_result == "FAIL":
        factuality_fail_factcheck_fails += 1

print(factuality_fail_factcheck_passes)
print(factuality_fail_factcheck_fails)
print(factuality_fail_factcheck_passes / (factuality_fail_factcheck_passes + factuality_fail_factcheck_fails))

96
4
0.96


In [4]:
import pandas as pd
df = pd.read_csv(consolidated_folder / "hallucination_factchecking_results.csv")

In [5]:
df.iloc[0]

id                                                          3451
claim_idx                                                      0
claim_text         Biking may seem similar to riding a tricycle.
question       I don't think biking is that great! I mean, is...
answer         I appreciate your perspective! While biking ma...
check_type                                                   NaN
checkworthy                                                 FAIL
reasoning                           ['Claim is not checkworthy']
score                                                        NaN
links                                                         []
Name: 0, dtype: object

In [25]:
df.to_csv(consolidated_folder / "hallucination_factchecking_results_with_filename.csv", index=False)


In [24]:
df.head()

Unnamed: 0,id,claim_idx,claim_text,question,answer,check_type,checkworthy,reasoning,score,links,filename,topic,profile
8495,0,0,Food security in Singapore means ensuring that...,I want you to help me understand what food sec...,Food security in Singapore means ensuring that...,factuality,PASS,['Food security in Singapore is defined as hav...,PASS,['https://en.wikipedia.org/wiki/Food_security'...,Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic
8496,0,1,"Singapore relies heavily on food imports, maki...",I want you to help me understand what food sec...,Food security in Singapore means ensuring that...,factuality,PASS,['Singapore imports over 90% of its food due t...,PASS,['https://amycookseats.com/how-much-food-does-...,Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic
8497,0,2,Ensuring food security helps maintain public h...,I want you to help me understand what food sec...,Food security in Singapore means ensuring that...,factuality,PASS,['Food security ensures that individuals have ...,PASS,['https://www.maidencap.com/insights/the-impor...,Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic
8498,0,3,The government actively works on strategies to...,I want you to help me understand what food sec...,Food security in Singapore means ensuring that...,factuality,PASS,"[""The Singapore government has implemented the...",PASS,"['https://www.mse.gov.sg/policies/food', 'http...",Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic
15403,1,0,Singapore ensures a stable food supply through...,"Oh, cool! So since Singapore relies on imports...",Singapore ensures a stable food supply through...,factuality,PASS,['Singapore employs several key strategies to ...,PASS,['https://singaporeainews.org/blogs/news/singa...,Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic


In [30]:
df_factuality_failure_checks

Unnamed: 0,question_id,claim_id,claim,factcheck_result
0,3003,4,Trees help maintain water quality.,"{'claims_check': [{'check_type': 'factuality',..."
1,4806,3,Opting for energy-efficient devices can contri...,"{'claims_check': [{'check_type': 'factuality',..."
2,2590,2,The sun's size is constant.,"{'claims_check': [{'check_type': 'factuality',..."
3,1322,2,Insects can significantly enhance gardening ef...,"{'claims_check': [{'check_type': 'factuality',..."
4,3004,6,Disrupting the balance of the ecosystem can le...,"{'claims_check': [{'check_type': 'factuality',..."
...,...,...,...,...
95,4775,3,Memes can also perpetuate misinformation.,"{'claims_check': [{'check_type': 'factuality',..."
96,1554,4,"Technology, like boats and drones, can aid in ...","{'claims_check': [{'check_type': 'factuality',..."
97,3058,3,"Habitat destruction, such as deforestation, ca...","{'claims_check': [{'check_type': 'factuality',..."
98,3009,7,Trees prevent flooding.,"{'claims_check': [{'check_type': 'factuality',..."


In [31]:
df.head()



Unnamed: 0,id,claim_idx,claim_text,question,answer,check_type,checkworthy,reasoning,score,links,filename,topic,profile
8495,0,0,Food security in Singapore means ensuring that...,I want you to help me understand what food sec...,Food security in Singapore means ensuring that...,factuality,PASS,['Food security in Singapore is defined as hav...,PASS,['https://en.wikipedia.org/wiki/Food_security'...,Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic
8496,0,1,"Singapore relies heavily on food imports, maki...",I want you to help me understand what food sec...,Food security in Singapore means ensuring that...,factuality,PASS,['Singapore imports over 90% of its food due t...,PASS,['https://amycookseats.com/how-much-food-does-...,Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic
8497,0,2,Ensuring food security helps maintain public h...,I want you to help me understand what food sec...,Food security in Singapore means ensuring that...,factuality,PASS,['Food security ensures that individuals have ...,PASS,['https://www.maidencap.com/insights/the-impor...,Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic
8498,0,3,The government actively works on strategies to...,I want you to help me understand what food sec...,Food security in Singapore means ensuring that...,factuality,PASS,"[""The Singapore government has implemented the...",PASS,"['https://www.mse.gov.sg/policies/food', 'http...",Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic
15403,1,0,Singapore ensures a stable food supply through...,"Oh, cool! So since Singapore relies on imports...",Singapore ensures a stable food supply through...,factuality,PASS,['Singapore employs several key strategies to ...,PASS,['https://singaporeainews.org/blogs/news/singa...,Food Security in Singapore_out_of_topic.json,Food Security in Singapore,out_of_topic


In [None]:
for index, row in df_factuality_failure_checks.iterrows():
    # print(row["factcheck_result"]["claims_check"][0]["reasoning"])
    # print(row["factcheck_result"]["claims_check"][0]["score"])
    # print(row["factcheck_result"]["claims_check"][0]["check_type"])
    # print(row["factcheck_result"]["claims_check"][0]["links"])

    print(df.loc[(df["id"] == row["question_id"]) & (df["claim_idx"] == row["claim_id"]), "reasoning"])
    print(row["factcheck_result"]["claims_check"][0]["reasoning"])
    
    df.loc[(df["id"] == row["question_id"]) & (df["claim_idx"] == row["claim_id"]), "reasoning"] = json.dumps(row["factcheck_result"]["claims_check"][0]["reasoning"])
    df.loc[(df["id"] == row["question_id"]) & (df["claim_idx"] == row["claim_id"]), "score"] = json.dumps(row["factcheck_result"]["claims_check"][0]["score"])
    df.loc[(df["id"] == row["question_id"]) & (df["claim_idx"] == row["claim_id"]), "links"] = json.dumps(row["factcheck_result"]["claims_check"][0]["links"])
    print(df.loc[(df["id"] == row["question_id"]) & (df["claim_idx"] == row["claim_id"])])


In [83]:
for index, row in df_factuality_failure_checks.iterrows():    
    df.loc[(df["id"] == row["question_id"]) & (df["claim_idx"] == row["claim_id"]), "reasoning"] = json.dumps(row["factcheck_result"]["claims_check"][0]["reasoning"])
    df.loc[(df["id"] == row["question_id"]) & (df["claim_idx"] == row["claim_id"]), "score"] = row["factcheck_result"]["claims_check"][0]["score"]
    df.loc[(df["id"] == row["question_id"]) & (df["claim_idx"] == row["claim_id"]), "links"] = json.dumps(row["factcheck_result"]["claims_check"][0]["links"])


In [84]:
df.to_csv(consolidated_folder / "hallucination_factchecking_results_with_filename_fixed_json_parsing_errors.csv", index=False)

## Analysis using the consolidated results

In [None]:
grouped = df.groupby(["check_type", "profile"])

In [80]:
df["score"].value_counts()

score
PASS              13454
FAIL                898
UNVERIFIED            2
PARTIALLY_PASS        1
Name: count, dtype: int64

In [88]:
for index, group in grouped:
    print(index[0], index[1])
    print(len(group[group["score"]=="PASS"]), len(group[group["score"]=="FAIL"]))
    print(len(group[group["score"]=="PASS"])/(len(group[group["score"]=="PASS"])+len(group[group["score"]=="FAIL"])))

factuality bad_english
1518 19
0.9876382563435263
factuality good
1852 41
0.9783412572636028
factuality out_of_topic
2754 12
0.9956616052060737
factuality unfactual
2175 27
0.9877384196185286
hallucination bad_english
1397 266
0.8400481058328322
hallucination good
1666 275
0.8583204533745492
hallucination out_of_topic
384 57
0.8707482993197279
hallucination unfactual
1708 201
0.8947092718700891


In [90]:
df_hallucination_failure_checks["filename"] = df_hallucination_failure_checks["question_id"].map(id_to_filename)
df_hallucination_failure_checks["topic"] = df_hallucination_failure_checks["filename"].apply(lambda x: x.split("_", maxsplit=1)[0])
df_hallucination_failure_checks["profile"] = df_hallucination_failure_checks["filename"].apply(lambda x: x.split("_", maxsplit=1)[1][:-5])

In [97]:
df_hallucination_failure_checks["factcheck_result"].iloc[0]['claims_check'][0]['score']

'PASS'

In [98]:
df_hallucination_failure_checks["score"] = df_hallucination_failure_checks.apply(lambda row: row["factcheck_result"]["claims_check"][0]["score"], axis=1)

In [101]:
df_hallucination_failure_checks.groupby("profile")["score"].value_counts()


profile       score
bad_english   PASS     250
              FAIL      15
good          PASS     256
              FAIL      19
out_of_topic  PASS      54
              FAIL       3
unfactual     PASS     174
              FAIL      27
Name: count, dtype: int64

In [103]:
len(df)

29085

In [111]:
df.columns

Index(['id', 'claim_idx', 'claim_text', 'question', 'answer', 'check_type',
       'checkworthy', 'reasoning', 'score', 'links', 'filename', 'topic',
       'profile'],
      dtype='object')

In [110]:
len(df[df["answerable_by_context"] == "PASS"]["id"].unique())

KeyError: 'answerable_by_context'