In [13]:
import subprocess
import os
from langchain_community.graphs import Neo4jGraph
import re
import json
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer



In [14]:
def initialize_neo4j_graph(url, username, password):

    graph = Neo4jGraph(
        url=url,
        username=username, 
        password=password,
        refresh_schema=True
    )
    return graph

In [15]:
def build_prompt(schema, question):
    full_prompt = f"<|im_start|>system\nGive me a Cypher Query and not a python code which can be executed on Neo4j, only use the following graph schema:\n{schema}\n\nTo answer the following question:<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
    return full_prompt

In [16]:
def run_llama_model(llama_cli_path, model_path, prompt):
    """Runs the LLaMA model with the given prompt using a subprocess."""
    command = f"{llama_cli_path} -m {model_path} -p '{prompt}' --n-predict 180 --temp 0.1 --top_p 0.91 --logit-bias '27363-100,16659-100'"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    print
    return result

In [17]:
def clean_query(query):
    # Check if the last two characters are '")'
    if query.endswith('")'):
        # Remove the last two characters
        return query[:-2]
    return query

In [18]:
def extract_cypher_query(text):
    patterns = [
        r"'[\s\n]*(MATCH[\s\S]*?)'",
        r"'''[\s\n]*(MATCH[\s\S]*?)'''",
        r"`[\s\n]*(MATCH[\s\S]*?)`",
        r"```[\s\n]*(MATCH[\s\S]*?)```",
        r"```cypher[\s\n]*(MATCH[\s\S]*?)```"  # New pattern for ```cypher ... ``` format
    ]
    
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            return clean_query(matches[0].strip())  # Clean here before returning
    
    # If no matches found, try to find any Cypher-like query
    cypher_pattern = r"MATCH[\s\S]*?RETURN.*"
    match = re.search(cypher_pattern, text, re.IGNORECASE)
    if match:
        return clean_query(match.group(0).strip())  # Clean here before returning
    
    return None

In [19]:
def clean_query(query):
    # Check if the last two characters are '")'
    if query.endswith('")'):
        # Remove the last two characters
        return query[:-2]
    return query

In [20]:
def get_cypher_from_nl_query(question):
    """Main function to generate a Cypher query from a natural language question."""
    # Initialize Neo4j graph connection
    url = "neo4j+s://46a80122.databases.neo4j.io:7687"
    username = "<user>"
    password = "<pass>"
    graph = initialize_neo4j_graph(url, username, password)
    
    # Get schema and build prompt
    schema = graph.schema
    full_prompt = build_prompt(schema, question)
    
    # Run LLaMA model
    model_path = "/Users/dhruvchandel/Thesis_Git/NLToSPARQL/models/NLToCypher.Q4.gguf"
    llama_cli_path = "/Users/dhruvchandel/LLaMaCPP/llama.cpp/llama-cli"
    result = run_llama_model(llama_cli_path, model_path, full_prompt)
    
    # Extract and return Cypher query
    cypher_query = extract_cypher_query(result.stdout)
    if cypher_query is None:
        print("Failed to extract Cypher query. Full result:")
        print(result.stdout)
    else:
        cypher_query = cypher_query.replace('\n', ' ')
        print("Generated Cypher Query:", cypher_query)
    return cypher_query

In [21]:
# Example usage
#question = "How many articles uses a dataset?"
#cypher_query = get_cypher_from_nl_query(question)
#print("Generated Cypher Query:", cypher_query)

In [22]:
input_json_file_path = '/Users/dhruvchandel/Thesis_Git/NLToSPARQL/scripts/datas/parametric_trainer_with_repeats.json'

In [23]:
def process_questions(input_path, output_path):
    # Read the input JSON
    with open(input_path, 'r') as f:
        data = json.load(f)
    count = 1
    # Process each question and store results
    results = []
    for item in data:
        question = item['Question']
        print(f"QUESTION: {question}")
        cypher_query = get_cypher_from_nl_query(question)
        
        results.append({
            'Question': question,
            'Cypher': cypher_query
        })
        count+=1
        print(f"Query Count : {count}")

    # Save the results to a new JSON file
    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"Processing complete. Results saved to {output_path}")

In [24]:
input_file_path = '/Users/dhruvchandel/Thesis_Git/NLToSPARQL/scripts/datas/evaluation_data.json'
output_file_path = '/Users/dhruvchandel/Thesis_Git/NLToSPARQL/scripts/datas/finetunedoutput.json'  

process_questions(input_file_path, output_file_path)

QUESTION: Find the total number of Article in the graph!
Generated Cypher Query: MATCH (a:Article) RETURN count(a) AS articleCount
Query Count : 2
QUESTION: Find the total number of Code in the graph!
Generated Cypher Query: MATCH (c:Code) RETURN count(c) AS total_code
Query Count : 3
QUESTION: Find the total number of Dataset in the graph!
Generated Cypher Query: MATCH (d:Dataset) RETURN count(d) AS total_datasets
Query Count : 4
QUESTION: Find the total number of SoftwareApplication in the graph!
Generated Cypher Query: MATCH (s:SoftwareApplication) RETURN count(s) AS totalSoftwareApplications
Query Count : 5
QUESTION: Fetch the Article nodes and extract their id property!
Generated Cypher Query: MATCH (a:Article) RETURN a.id
Query Count : 6
QUESTION: Fetch the Article nodes and extract their id property!
Generated Cypher Query: MATCH (a:Article) RETURN a.id
Query Count : 7
QUESTION: Fetch the Code nodes and extract their id property!
Generated Cypher Query: MATCH (c:Code) RETURN c.i

KeyboardInterrupt: 

In [None]:
import json
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Load the JSON data from files
with open('/Users/dhruvchandel/Thesis_Git/NLToSPARQL/scripts/datas/evaluation_data.json', 'r') as f:
    reference_data = json.load(f)

with open('/Users/dhruvchandel/Thesis_Git/NLToSPARQL/scripts/datas/finetunedoutput.json', 'r') as f:
    generated_data = json.load(f)

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to calculate BLEU and ROUGE scores
def calculate_scores(ref_data, gen_data):
    bleu_scores = []
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for ref, gen in zip(ref_data, gen_data):
        # Check if Cypher query is None
        if ref['Cypher'] is None or gen.get('Cypher') is None:
            continue
        
        # BLEU score
        reference = ref['Cypher'].split()
        hypothesis = gen['Cypher'].split()
        bleu = sentence_bleu([reference], hypothesis)
        bleu_scores.append(bleu)
        
        # ROUGE scores
        rouge_scores = scorer.score(ref['Cypher'], gen['Cypher'])
        rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
        rouge2_scores.append(rouge_scores['rouge2'].fmeasure)
        rougeL_scores.append(rouge_scores['rougeL'].fmeasure)
    
    # Calculate average scores, checking if lists are not empty to avoid division by zero
    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores) if rouge1_scores else 0
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores) if rouge2_scores else 0
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores) if rougeL_scores else 0
    
    return avg_bleu, avg_rouge1, avg_rouge2, avg_rougeL

avg_bleu, avg_rouge1, avg_rouge2, avg_rougeL = calculate_scores(reference_data, generated_data)
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE-1 Score: {avg_rouge1}")
print(f"Average ROUGE-2 Score: {avg_rouge2}")
print(f"Average ROUGE-L Score: {avg_rougeL}")
