In [13]:
from Neo4jDriver import Neo4jDriver
import pandas as pd

# REQUIRES file to be placed in import directory
with Neo4jDriver() as neo4j:

    set_gpt_detections =  """
        CALL apoc.load.json('detected_keys.json') YIELD value as row
        WITH row.source as id, row.json_response as gptOutput
        MATCH (s:Schema {url: id})
        SET s.openAI_GPT_4_response=gptOutput
        """

    neo4j.execute_write_query(set_gpt_detections)

    # Remove bad json response from LLM returns
    remove_bad_json = """MATCH (s:Schema) 
        WHERE s.openAI_GPT_4_response IS NOT NULL
        WITH s
        SKIP 35
        LIMIT 1
        SET s.openAI_GPT_4_response = NULL
        """
    
    neo4j.execute_write_query(remove_bad_json)

Connected to Neo4j database!
Connection to Neo4j closed.


In [14]:
# Update the graph to set LLM detection results

with Neo4jDriver() as neo4j:

    setTruePositivePKs = """
        MATCH (s:Schema) 
        WHERE s.openAI_GPT_4_response IS NOT NULL
        WITH s
        WITH s, apoc.convert.fromJsonList(s.openAI_GPT_4_response) as response
        UNWIND response as table
        WITH s, table.tableName as tableName, table.primaryKey as primaryKey 
        UNWIND primaryKey as pkCols
        // // Identify TRUE_POSITIVE PK matches and update matchStatus properties accordingly
        OPTIONAL MATCH pth=(s)-->(t:Table {name: tableName})-->(c:Column)<--(pk:PrimaryKey {table: tableName})
        WITH s, t, pk, tableName, [col in collect(distinct c) | col.name] as matchedPK, primaryKey
        WITH s, t, pk, tableName, primaryKey, apoc.coll.disjunction(matchedPK, primaryKey) as disjunction
        WHERE size(disjunction)=0
        SET pk.detectedBy = apoc.coll.toSet(coalesce(pk.detectedBy, []) + ['LLM'])
        """

    neo4j.execute_write_query(setTruePositivePKs)

    setFalsePositivePKs = """
        MATCH (s:Schema) 
        WHERE s.openAI_GPT_4_response IS NOT NULL
        WITH s, apoc.convert.fromJsonList(s.openAI_GPT_4_response) as response
        UNWIND response as table
        WITH s, table.tableName as tableName, table.primaryKey as primaryKey 
        UNWIND primaryKey as pkCols
        OPTIONAL MATCH pth=(s)-->(t:Table {name: tableName})-->(c:Column)<--(pk:PrimaryKey {table: tableName})
        WITH s, t, pk, tableName, [col in collect(distinct c) | col.name] as matchedPK, primaryKey
        WITH s, t, pk, tableName, primaryKey, matchedPK, apoc.coll.disjunction(matchedPK, primaryKey) as disjunction
        WHERE size(disjunction)<>0
        MERGE (fpk:PrimaryKey {table:tableName, schemaId: s.url, groundTruth:False })
        ON CREATE
            SET fpk.detectedBy = apoc.coll.toSet(coalesce(fpk.detectedBy, []) + ['LLM'])
        WITH s, t, fpk, primaryKey
        UNWIND primaryKey as fpk_name
        MATCH (s)-->(t)-[:HAS_COLUMN]->(fpkc {name: fpk_name})
        MERGE (fpk)-[fkpcr:PK_COLUMN {groundTruth: False}]->(fpkc)
        ON CREATE
            SET fkpcr.detectedBy = apoc.coll.toSet(coalesce(fkpcr.detectedBy, []) + ['LLM'])
        """
    neo4j.execute_write_query(setFalsePositivePKs)

    setTruePositiveFKs = """
        MATCH (s:Schema) 
        WHERE s.openAI_GPT_4_response IS NOT NULL
        WITH s, apoc.convert.fromJsonList(s.openAI_GPT_4_response) as response
        UNWIND response as table
        WITH s, table.tableName as tableName, table.primaryKey as primaryKey, table.foreignKeys as foreignKeys 
        UNWIND primaryKey as pkCols
        UNWIND (CASE foreignKeys WHEN [] then [null] else foreignKeys end) as foreignKey
        WITH  s,
            tableName,
            primaryKey,
            foreignKey as origForeignKeyFormat,
            CASE
                WHEN foreignKey IS NULL THEN NULL
                ELSE {referenceTable: foreignKey.referenceTable, foreignKeyRefColumnPairs: apoc.coll.zip(foreignKey.foreignKeys, foreignKey.referenceColumns)}
            END as foreignKey
        WITH s, origForeignKeyFormat, tableName, primaryKey, foreignKey
        WHERE NOT foreignKey IS NULL
        UNWIND  foreignKey.foreignKeyRefColumnPairs as foreignKeyRefColumnPair
        WITH s, tableName, origForeignKeyFormat, foreignKey.referenceTable as referenceTable, foreignKeyRefColumnPair[0] as foreignKeyColumn, foreignKeyRefColumnPair[1] as foreignKeyReferenceColumn
        // Do the TRUE POSITIVES
        OPTIONAL MATCH pth=(s)-->(t:Table {name: tableName})-->(fkc:Column {name: foreignKeyColumn})<-[fkcr:FK_COLUMN]-(fk:ForeignKey {table: tableName})-[fkrcr:FK_REFERENCE_COLUMN]-(fkrc:Column {name:foreignKeyReferenceColumn})
        WITH s, t, origForeignKeyFormat, foreignKeyColumn, foreignKeyReferenceColumn, fk, fkcr, fkrcr, tableName, [col in collect(fkc) | col.name] as matchedFKC, [col in collect(fkrc) | col.name] as matchedFKRC, origForeignKeyFormat.foreignKeys as foreignKey, origForeignKeyFormat.referenceColumns as referenceColumns
        WITH s, t, origForeignKeyFormat,foreignKeyColumn, foreignKeyReferenceColumn, fk, fkcr, fkrcr, tableName, apoc.coll.disjunction(matchedFKC, foreignKey) as fkcDisjunction, apoc.coll.disjunction(matchedFKRC, referenceColumns) as fkrcDisjunction
        WHERE size(fkcDisjunction)=0 AND size(fkrcDisjunction)=0
        SET fk.detectedBy = apoc.coll.toSet(coalesce(fk.detectedBy, []) + ['LLM']),
            fkcr.detectedBy = apoc.coll.toSet(coalesce(fkcr.detectedBy, []) + ['LLM']),
            fkrcr.detectedBy = apoc.coll.toSet(coalesce(fkrcr.detectedBy, []) + ['LLM'])
        """
    
    neo4j.execute_write_query(setTruePositiveFKs)

    setFalsePositiveFKs = """
        MATCH (s:Schema) 
        WHERE s.openAI_GPT_4_response IS NOT NULL
        WITH s, apoc.convert.fromJsonList(s.openAI_GPT_4_response) as response
        UNWIND response as table
        WITH s, table.tableName as tableName, table.primaryKey as primaryKey, table.foreignKeys as foreignKeys 
        UNWIND primaryKey as pkCols
        UNWIND (CASE foreignKeys WHEN [] then [null] else foreignKeys end) as foreignKey
        WITH  s,
            tableName,
            primaryKey,
            foreignKey as origForeignKeyFormat,
            CASE
                WHEN foreignKey IS NULL THEN NULL
                ELSE {referenceTable: foreignKey.referenceTable, foreignKeyRefColumnPairs: apoc.coll.zip(foreignKey.foreignKeys, foreignKey.referenceColumns)}
            END as foreignKey
        WITH s, origForeignKeyFormat, tableName, primaryKey, foreignKey
        WHERE NOT foreignKey IS NULL
        UNWIND  foreignKey.foreignKeyRefColumnPairs as foreignKeyRefColumnPair
        WITH s, tableName, origForeignKeyFormat, foreignKey.referenceTable as referenceTable, foreignKeyRefColumnPair[0] as foreignKeyColumn, foreignKeyRefColumnPair[1] as foreignKeyReferenceColumn
        OPTIONAL MATCH pth=(s)-->(t:Table {name: tableName})-->(fkc:Column {name: foreignKeyColumn})<-[fkcr:FK_COLUMN]-(fk:ForeignKey {table: tableName})-[fkrcr:FK_REFERENCE_COLUMN]-(fkrc:Column {name:foreignKeyReferenceColumn})
        WITH s, t, origForeignKeyFormat, foreignKeyColumn, foreignKeyReferenceColumn, fk, fkcr, fkrcr, tableName, [col in collect(fkc) | col.name] as matchedFKC, [col in collect(fkrc) | col.name] as matchedFKRC, origForeignKeyFormat.foreignKeys as foreignKey, origForeignKeyFormat.referenceColumns as referenceColumns
        WITH s, t, origForeignKeyFormat,foreignKeyColumn, foreignKeyReferenceColumn, fk, fkcr, fkrcr, tableName, apoc.coll.disjunction(matchedFKC, foreignKey) as fkcDisjunction, apoc.coll.disjunction(matchedFKRC, referenceColumns) as fkrcDisjunction
        WHERE NOT(size(fkcDisjunction)=0 AND size(fkrcDisjunction)=0)
        MERGE (ffk:ForeignKey {table:tableName, schemaId: s.url, groundTruth:False})
        ON CREATE
            SET ffk.detectedBy = apoc.coll.toSet(coalesce(ffk.detectedBy, []) + ['LLM'])
        WITH *, apoc.coll.zip(origForeignKeyFormat.foreignKeys, origForeignKeyFormat.referenceColumns) as foreignKeyRefColPairs
        UNWIND foreignKeyRefColPairs as foreignKeyRefColPair
        MATCH (s)--(:Table {name: tableName})-[:HAS_COLUMN]->(ffkc:Column {name: foreignKeyRefColPair[0]})
        MATCH (s)--(:Table {name: origForeignKeyFormat.referenceTable})-[:HAS_COLUMN]->(ffkrc:Column {name: foreignKeyRefColPair[1]})
        MERGE (ffkc)<-[ffkcr:FK_COLUMN {groundTruth:False}]-(ffk)-[ffkrcr:FK_REFERENCE_COLUMN {groundTruth:False}]->(ffkrc)
        ON CREATE
            SET ffkcr.detectedBy = apoc.coll.toSet(coalesce(ffkcr.detectedBy, []) + ['LLM']),
                ffkrcr.detectedBy = apoc.coll.toSet(coalesce(ffkrcr.detectedBy, []) + ['LLM'])
    """

    neo4j.execute_write_query(setFalsePositiveFKs)

    # setFalseNegativePKsandFKs = """
    #     MATCH (s:Schema) 
    #     WHERE s.openAI_GPT_4_response IS NOT NULL
    #     OPTIONAL MATCH pth=(s)-[:CONTAINS_TABLE]->(:Table)-[:HAS_COLUMN]->(:Column)<-[:PK_COLUMN|FK_COLUMN]-(key:PrimaryKey|ForeignKey)
    #     WITH distinct key
    #     WHERE key.detectedBy IS NULL
    #     SET key.matchStatus = "FALSE_NEGATIVE"
    # """

    # neo4j.execute_write_query(setFalseNegativePKsandFKs)
    

Connected to Neo4j database!
Connection to Neo4j closed.


CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input '[': expected "+" or "-" (line 28, column 81 (offset: 2327))
"            fkrcr.detectedBy = apoc.coll.toSet(coalesce(fkrcr.detectedBy, []) + ['LLM')])"
                                                                                 ^}