# Analyzing the effect of data formats in LLM Question Answering

## Filter the Knowledge Graph quality data computed by QualityAnalyzer
Every week, QualityAnalyzer analyzes all KGs retrievable from the LOD Cloud, for a total of 1,850 datasets.
For this experiment, the following KGs were selected at random from the pool, ensuring heterogeneity of the calculated quality metric values:
NoiPA, DBLP Knowledge Graph, Bibliography of the Italian Parliament and electoral studies, Allie Abbreviation And Long Form Database in Life Science, WordNet 2.0 (W3C), BBC Programmes, LiLa Lemma Bank, Coronavirus dataset, CIDOC-CRM, Environment Agency Bathing Water Quality. 

### Generate CSV and KG files

In [None]:
import pandas as pd
import os
import glob
from quality_data_filter import QualityDataFilter
import melt_csv
import quality_verbalization

# Filter every weekly quality CSV down to the selected KGs and the retained
# metric columns, then melt all filtered files into the two combined CSVs.
#
# You can skip this step if the folder /data/quality_data_filtered already contains the filtered data CSV files
quality_data_to_filter_folder = "./data/full_quality_data/"
quality_data_folder = "./data/quality_data_filtered/"
output_csv = "./data/quality_data_filtered/kg_quality_melted.csv"

# Identifiers of the KGs selected for the experiment.
selected_kg_ids = [
    "NoiPA",
    "dblp-kg",
    "bpr",
    "allie-abbreviation-and-long-form-database-in-life-science",
    "w3c-wordnet",
    "bbc-programmes",
    "LemmaBank",
    "micro-coronavirus",
    "CIDOC-CRM",
    "environment-agency-bathing-water-quality",
]

# Columns removed from every quality file. The names are kept verbatim
# (including leading spaces and the "lenght" spelling) because they are passed
# as-is to QualityDataFilter; exact duplicates of the original list were dropped.
columns_to_discard = [
    "MinTPNoOff",
    "MaxTPNoOff",
    "sdTPNoOff",
    "Limited",
    "MeanTPNoOff",
    "CS2-value",
    "IN3-value",
    "RC1-value",
    "RC2-value",
    "IN4-value",
    "Minimum throughput",
    "25th percentile throughput",
    "75th percentile throughput",
    "Maximum throughput",
    "Standard deviation of throughput",
    "U5-value",
    "PE2-value",
    "PE3-value",
    " Standard deviation length URIs (subject)",
    "Min length URIs (subject)",
    "25th percentile length URIs (subject)",
    "75th percentile length URIs (subject)",
    "Max length URIs (subject)",
    " Standard deviation length URIs (predicate)",
    "Min length URIs (predicate)",
    "25th percentile length URIs (predicate)",
    "75th percentile length URIs (predicate)",
    "Max length URIs (predicate)",
    "Standard deviation length URIs (object)",
    "Min length URIs (object)",
    "25th percentile length URIs (object)",
    "75th percentile length URIs (object)",
    "Max length URIs (object)",
    "Minimum latency",
    "25th percentile latency",
    "75th percentile latency",
    "Maximum latency",
    " Standard deviation of throughput",
    "Historical updates",
    "Offline dumps",
    "Number of triples linked",
    "Number of triples updated",
    "Availability VoID file",
    "U1-value",
    "Percentage of data updated",
    "Standard deviation lenght URIs (subject)",
    "Median length URIs (subject)",
    "Standard deviation lenght URIs (predicate)",
    "Median length URIs (predicate)",
    "Standard deviation lenght URIs (object)",
    "Median length URIs (object)",
    "Average latency",
    "Standard deviation of latency",
    "Average throughput",
    "External links",
    "Number of triples",
    "Percentage of triples with labels",
    "Uses RDF structures",
    "Inactive links",
    "Url file VoID",
]

# Make sure the destination folder exists before any filtered file is written.
os.makedirs(quality_data_folder, exist_ok=True)

for file_path in sorted(glob.glob(os.path.join(quality_data_to_filter_folder, "*.csv"))):
    # Each filtered file keeps the name of the source file it was derived from.
    # Alternative to discarding: selected_columns=["KG id", "KG name", "Score"]
    filter_obj = QualityDataFilter(
        input_file=file_path,
        output_file=os.path.join(quality_data_folder, os.path.basename(file_path)),
        kg_ids=selected_kg_ids,
        discard_columns=columns_to_discard,
    )
    filter_obj.filter_data()

# Create a single CSV file with all the quality data. This runs once, after
# every input file has been filtered, instead of once per input file.
# NOTE(review): the melted outputs are written into the folder that is melted;
# confirm melt_csv ignores previously generated kg_quality_melted*.csv files.
melt_csv.melt_csv(folder=quality_data_folder, output_csv=output_csv)
melt_csv.melt_csv_for_rml(folder=quality_data_folder, output_csv="./data/quality_data_filtered/kg_quality_melted_for_rml.csv")

### Generate markdown representation

In [None]:
# Verbalize every row of the melted quality report with the configured LLM,
# printing each generated report.
quality_report_path = "./data/quality_data_filtered/kg_quality_melted.csv"
# NOTE(review): this path starts with "../data" while the others use "./data";
# confirm the metrics documentation really lives one directory up.
quality_documentation_path = "../data/metrics_doc.json"
quality_report_example = "./data/verbalized_report_example.md"
quality_report_df = pd.read_csv(quality_report_path)
num_rows = quality_report_df.shape[0]
llm_model = "gpt-5"

for row in range(num_rows):

    print(f"Processing row {row}...")
    # Pass the current row index: the original passed num_rows, which made
    # every iteration request the same (one-past-the-end) row.
    verbalized_report = quality_verbalization.verbalize_quality_report(
        quality_report_path,
        quality_report_example,
        quality_documentation_path,
        llm_model,
        row_to_read=row,
    )
    print("Verbalized Quality Report:\n", verbalized_report)