In [2]:
import json, os, pandas as pd, numpy as np, csv
import requests
import io
import tarfile
import zipfile
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


### Load data for Hallucination Detection training

In [3]:
# Directory where the CSV files are stored: the 'data' folder that sits next to
# the current working directory (i.e. <parent of cwd>/data).
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')

# List all CSV files directly inside the directory.
# os.listdir returns names in arbitrary order, so sort them: this keeps the row
# order of the combined DataFrame (and of anything saved from it) reproducible.
csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))

# Fail with an explicit message rather than pandas' generic
# "No objects to concatenate" when the directory holds no CSV files.
if not csv_files:
    raise ValueError(f"No CSV files found in {data_dir}")

# Load every CSV file into its own DataFrame
df_list = [pd.read_csv(os.path.join(data_dir, file)) for file in csv_files]

# Concatenate all DataFrames; ignore_index=True gives the result a fresh 0..n-1 index
training_df = pd.concat(df_list, ignore_index=True)

# Display the first few rows of the combined DataFrame
training_df.head()

Unnamed: 0,id,grounding,generated_text,label,cut,dataset_origin
0,qags_cnndm0,( cnn ) about 20 hours after the boston marath...,He is a venezuelan dystrophy. He was the last ...,0,test,qags_cnndm
1,qags_cnndm1,( cnn ) after more than nine years of travelin...,New horizons will shed light on a third zone o...,1,val,qags_cnndm
2,qags_cnndm2,( cnn ) call it a little piece of heaven for a...,Sharry was eight months pregnant. Smith fell a...,0,val,qags_cnndm
3,qags_cnndm3,( cnn ) comedian chris rock made light of raci...,A new york grand jury's decision not to indict...,0,val,qags_cnndm
4,qags_cnndm4,( cnn ) debates on climate change can break do...,Climate change can be used to public health is...,0,val,qags_cnndm


In [None]:
# Tally how many rows carry each value of the 'cut' column
val_test_spit = training_df.loc[:, 'cut'].value_counts()

# Print the tallies followed by a reminder of how each split is meant to be used
print(
    val_test_spit,
    "val can be used for training the model and test can be used for evaluating the performance",
    sep="\n",
)

cut
val     84152
test    38224
Name: count, dtype: int64
val can be used for training the model and test can be used for evaluation the performance


In [5]:
# Number of rows contributed by each source dataset, most frequent first
training_df.loc[:, 'dataset_origin'].value_counts()

dataset_origin
Vitamin C     63054
HaluEval      20000
Fever         19998
PAWS           8000
XSumFaith      2353
SummEval       1698
FactCC         1434
FRANK          1393
Polytope       1268
Cao22           696
CLIFF           600
TofuEval        534
Wang20          474
samsum          250
qags_xsum       239
qags_cnndm      235
Goyal21         150
Name: count, dtype: int64

In [9]:
# Write the val and test splits to data/combined (created if it does not exist yet)
os.makedirs(os.path.join(data_dir, 'combined'), exist_ok=True)

# Select the rows of each split by the value of the 'cut' column
val_df = training_df.loc[training_df['cut'] == 'val']
test_df = training_df.loc[training_df['cut'] == 'test']

# Save each split to its own CSV file, without the DataFrame index
for csv_name, split_frame in (('val.csv', val_df), ('test.csv', test_df)):
    split_frame.to_csv(os.path.join(data_dir, 'combined', csv_name), index=False)


In [10]:
# Verify the saved files by reading them back from data/combined
val_df_verify = pd.read_csv(os.path.join(data_dir, 'combined', 'val.csv'))
test_df_verify = pd.read_csv(os.path.join(data_dir, 'combined', 'test.csv'))

# Each file was written from rows filtered on 'cut', so it must contain only
# rows of its own split; fail loudly if the files on disk disagree.
assert (val_df_verify['cut'] == 'val').all(), "val.csv contains rows from another split"
assert (test_df_verify['cut'] == 'test').all(), "test.csv contains rows from another split"

# Display the first few rows of the verified DataFrames
# (only val and test exist; there is no separate train split to show)
print("Validation DataFrame:")
print(val_df_verify.head())
print("Test DataFrame:")
print(test_df_verify.head())
# Display the counts of each split to verify
print("Counts of each split:")
print("Validation:", len(val_df_verify))
print("Test:", len(test_df_verify))

Train DataFrame:
Validation DataFrame:
            id                                          grounding  \
0  qags_cnndm1  ( cnn ) after more than nine years of travelin...   
1  qags_cnndm2  ( cnn ) call it a little piece of heaven for a...   
2  qags_cnndm3  ( cnn ) comedian chris rock made light of raci...   
3  qags_cnndm4  ( cnn ) debates on climate change can break do...   
4  qags_cnndm5  ( cnn ) did former new england patriot aaron h...   

                                      generated_text  label  cut  \
0  New horizons will shed light on a third zone o...      1  val   
1  Sharry was eight months pregnant. Smith fell a...      0  val   
2  A new york grand jury's decision not to indict...      0  val   
3  Climate change can be used to public health is...      0  val   
4  Aaron hernandez has pleaded not guilty to murd...      0  val   

  dataset_origin  
0     qags_cnndm  
1     qags_cnndm  
2     qags_cnndm  
3     qags_cnndm  
4     qags_cnndm  
Test DataFrame:
       