# Dataset prep for RAG on Dgraph blog post

We decided to use the dataset from the article [RAG with a Graph database](https://cookbook.openai.com/examples/rag_with_graph_db).


RDF is a powerful notation for knowledge graphs. It describes information as triples of the form Subject - Predicate - Object (S-P-O).

The original dataset is in JSON format and is 2.7 MB. We have generated an RDF file containing exactly the same information. The RDF file is only 361 KB!


In [13]:
import json
import pandas as pd

In [19]:


# RDF subjects have no spaces
def clean_name(name):
    """Return ``name`` with every non-alphanumeric character removed.

    Characters for which ``str.isalnum()`` is false (spaces, punctuation,
    symbols, ...) are dropped; the remaining characters keep their order.
    """
    return "".join(filter(str.isalnum, name))
def clean_text(text):
    """Make ``text`` safe to embed in a double-quoted RDF string literal.

    Backslashes and double quotes are backslash-escaped; newline, carriage
    return and tab characters are removed.

    Args:
        text: The raw string value.

    Returns:
        The escaped string, without surrounding quotes.
    """
    # Escape backslashes first: doing it after the quotes would double the
    # backslash that was just added in front of each quote, and skipping it
    # lets a trailing "\" swallow the closing quote of the literal.
    escaped = text.replace('\\', '\\\\').replace('"', '\\"')
    return escaped.replace('\n', '').replace('\r', '').replace('\t', '')

def toRDF(df):
    """Convert the product knowledge-graph DataFrame into RDF N-Quad lines.

    Args:
        df: DataFrame providing the columns ``product``, ``product_id``,
            ``TITLE``, ``relationship``, ``entity_type`` and
            ``entity_value``. The caller's frame is not modified.

    Returns:
        A list of N-Quad strings, in this order: four lines per distinct
        (product, product_id, TITLE) combination, two lines per distinct
        (entity_type, entity_value) pair, then one relation line per input
        row linking the product to its entity.
    """
    # Work on a copy so the relabelling below does not leak into the
    # caller's DataFrame.
    df = df.copy()
    # Correct the material entity type, which is not in the original dataset:
    # every "hasMaterial" row gets entity type "material".
    df.loc[df["relationship"] == "hasMaterial", "entity_type"] = "material"

    nquad_list = []

    # Generate product data
    products = df[['product', 'product_id', 'TITLE']].drop_duplicates()
    for row in products.itertuples(index=False):
        subject = f'<_:P_{row.product_id}>'
        nquad_list.append(f'{subject} <Product.Title> "{clean_text(row.TITLE)}" .')
        nquad_list.append(f'{subject} <dgraph.type> "Product" .')
        nquad_list.append(f'{subject} <Product.Name> "{clean_text(row.product)}" .')
        nquad_list.append(f'{subject} <Product.ID> "{row.product_id}" .')

    # Generate all other entities
    types = df[['entity_type', 'entity_value']].drop_duplicates()
    for row in types.itertuples(index=False):
        node = f'<_:{row.entity_type}_{clean_name(row.entity_value)}>'
        nquad_list.append(f'{node} <dgraph.type> "{row.entity_type}" .')
        nquad_list.append(
            f'{node} <{row.entity_type}.Value> "{clean_text(row.entity_value)}" .'
        )

    # Generate relations: one edge per input row, product -> entity
    relations = df[['product_id', 'entity_type', 'entity_value']]
    for row in relations.itertuples(index=False):
        nquad_list.append(
            f'<_:P_{row.product_id}> <Product.{row.entity_type}> '
            f'<_:{row.entity_type}_{clean_name(row.entity_value)}> .'
        )
    return nquad_list

# Load the JSON dataset from a file and write it back out as RDF N-Quads.
file_path = "data/amazon_product_kg.json"
output_path = "data/products.rdf"
df = pd.read_json(file_path)
# Write UTF-8 explicitly: without `encoding`, open() falls back to a
# platform-dependent default, while N-Quads documents are defined as UTF-8.
with open(output_path, 'w', encoding="utf-8") as filehandle:
    filehandle.write("\n".join(toRDF(df)))

    
