In [6]:
%pip install transformers torch pandas tqdm accelerate fuzzywuzzy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from TextToCypherDataLoader import Text2CypherDataset
from torch.utils.data import DataLoader
from datasets import load_dataset

# Pick a tokenizers parallelism mode explicitly so the fork-related
# `huggingface/tokenizers` warning is not emitted; `setdefault` keeps any value
# that was already exported in the environment.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Enable TensorFloat32 for faster matrix operations
torch.backends.cuda.matmul.allow_tf32 = True

# Evaluation subset: tune these instead of editing the loading line below.
SEED = 42              # shuffle seed, keeps the sampled subset reproducible
NUM_EVAL_SAMPLES = 1   # number of training examples to evaluate on

# Paths
DATASET_PATH = "/work/pi_wenlongzhao_umass_edu/9/dpatel/"
MODEL_PATH = "/datasets/ai/"
DEEPSEEK_DISTILL_LLAMA_70B_PATH = "deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Llama-70B/snapshots/0d6d11a6ea1187363aa7b78543f824fc02e06b14"
DEEPSEEK_DISTILL_QWEN_7B_PATH = "deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-7B/snapshots/6602cadec947dbb53e64f3d8d6425320b2197247"
DEEPSEEK_DISTILL_QWEN_32B_PATH = "deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-32B/snapshots/3865e12a1eb7cbd641ab3f9dfc28c588c6b0c1e9"
# NOTE(review): this is a path fragment like the *_PATH constants above, but it
# is never joined with MODEL_PATH in the cells visible here.
LLAMA_3_70B_INSTRUCT = "llama3/hub/Meta-Llama-3-70B-Instruct/original"

# Absolute checkpoint locations (MODEL_PATH + snapshot fragment).
DEEPSEEK_DISTILL_LLAMA_70B = os.path.join(MODEL_PATH, DEEPSEEK_DISTILL_LLAMA_70B_PATH)
DEEPSEEK_DISTILL_QWEN_7B = os.path.join(MODEL_PATH, DEEPSEEK_DISTILL_QWEN_7B_PATH)
DEEPSEEK_DISTILL_QWEN_32B = os.path.join(MODEL_PATH, DEEPSEEK_DISTILL_QWEN_32B_PATH)

# Shuffle deterministically, then keep the first NUM_EVAL_SAMPLES rows
# (clamped so a value larger than the split does not raise).
train_split = load_dataset("neo4j/text2cypher-2024v1")["train"]
dataset = train_split.shuffle(seed=SEED).select(range(min(NUM_EVAL_SAMPLES, len(train_split))))

In [8]:
# Models to evaluate, as (checkpoint path, display name) pairs.
# The display name labels the progress bar and the "Model Name" column of the
# results CSV. Uncomment an entry to include that model in the run.
MODEL_LIST = [
    # (DEEPSEEK_DISTILL_QWEN_7B, "DeepSeek Distill QWEN 7B"),
    # (DEEPSEEK_DISTILL_LLAMA_70B, "DeepSeek R1 Distill Llama 70B"),
    (DEEPSEEK_DISTILL_QWEN_32B, "DeepSeek Distill QWEN 32B"),
]

In [9]:
# Number of examples per DataLoader batch passed to generation.
BATCH_SIZE = 8

In [10]:
import re
from fuzzywuzzy import fuzz

def generate_cypher(batch_questions, batch_schemas, model, tokenizer):
    """Generate one greedy completion per (question, schema) pair.

    Args:
        batch_questions: Iterable of natural-language questions.
        batch_schemas: Iterable of schema strings, aligned with ``batch_questions``.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        list[str]: The decoded *newly generated* text for each prompt, in input
        order. The prompt itself is not included, so downstream extraction and
        similarity scoring only ever see the model's answer.
    """
    prompts = [
        (
            "Convert this question into a **valid** Cypher query.\n\n"
            f"Question: {question}\n"
            f"Schema: {schema}\n"
            "Answer:\n"
        )
        for question, schema in zip(batch_questions, batch_schemas)
    ]
    if not prompts:
        return []

    # Batched generation with a decoder-only model needs a pad token and LEFT
    # padding; with right padding the model continues from pad tokens.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        # NOTE(review): truncation at 512 tokens can cut the end of a long
        # schema together with the trailing "Answer:" cue - confirm the limit.
        inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True, max_length=512)
    finally:
        # Restore the caller's setting so this function has no lasting side effect.
        tokenizer.padding_side = previous_padding_side

    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # With left padding every row's prompt occupies the same number of columns.
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # `generate` returns prompt + continuation; keep only the continuation.
    new_tokens = output_tokens[:, prompt_length:]
    return tokenizer.batch_decode(new_tokens.cpu(), skip_special_tokens=True)

def extract_cypher(output_text):
    """Pull the Cypher query out of a model response.

    Looks for fenced blocks tagged ``cypher`` and returns the contents of the
    last one, assuming the final query in a response is the intended answer.
    The fence match is tolerant of tag casing (e.g. "Cypher"), trailing spaces
    after the tag, CRLF line endings, and a closing fence that directly follows
    the query without a newline.

    Args:
        output_text: Raw decoded model output.

    Returns:
        str: The stripped contents of the last fenced Cypher block, or the
        stripped full text when no such block is present.
    """
    fenced_blocks = re.findall(
        r"```cypher[ \t]*\r?\n(.*?)```",
        output_text,
        re.DOTALL | re.IGNORECASE,
    )
    if fenced_blocks:
        return fenced_blocks[-1].strip()
    return output_text.strip()

def normalize_cypher(query):
    """Return ``query`` lowercased, with every whitespace run collapsed to one space.

    Leading and trailing whitespace is dropped as a by-product of splitting.
    """
    lowered = query.lower()
    tokens = lowered.split()
    return " ".join(tokens)

def is_exact_match(predicted_cypher, true_cypher):
    """Return True when both queries are identical after normalization.

    Normalization (see ``normalize_cypher``) makes the comparison insensitive
    to letter case and to differences in whitespace.
    """
    predicted_normalized = normalize_cypher(predicted_cypher)
    expected_normalized = normalize_cypher(true_cypher)
    return predicted_normalized == expected_normalized

def similarity_score(predicted_cypher, true_cypher):
    """Score how close the predicted query is to the reference query.

    Both queries are normalized first, then compared with fuzzywuzzy's
    ``fuzz.ratio`` (an integer from 0 to 100 per the library's documentation).
    This is a softer signal than exact match.
    """
    predicted_normalized = normalize_cypher(predicted_cypher)
    expected_normalized = normalize_cypher(true_cypher)
    return fuzz.ratio(predicted_normalized, expected_normalized)



In [11]:
def generate_baseline(train_loader, model, tokenizer, name):
    """Run zero-shot Cypher generation over a loader and record summary metrics.

    For every batch the model's responses are generated, the Cypher query is
    extracted from each response, and it is compared with the reference query
    by exact match and by fuzzy similarity. A one-row summary is appended to
    ``zero_shot_baseline_results.csv`` under ``DATASET_PATH`` (the file is
    created if it does not exist) and also printed.

    Args:
        train_loader: Iterable of batches; each batch is indexed with the keys
            "question", "schema" and "cypher".
        model: Causal language model forwarded to ``generate_cypher``.
        tokenizer: Tokenizer forwarded to ``generate_cypher``.
        name: Display name of the model, used in the progress bar and the CSV.

    Returns:
        dict: The summary row that was appended to the CSV.

    Raises:
        ValueError: If ``train_loader`` yields no samples, since the averages
            would be undefined.
    """
    total_samples = 0
    exact_matches = 0
    similarity_scores = []

    for batch in tqdm(train_loader, desc=f"Generating Cypher Queries - {name}"):
        batch_questions = batch["question"]
        batch_schemas = batch["schema"]
        batch_true_cyphers = batch["cypher"]

        predicted_cyphers = generate_cypher(batch_questions, batch_schemas, model, tokenizer)

        for true_cypher, raw_predicted_cypher in zip(batch_true_cyphers, predicted_cyphers):
            predicted_cypher = extract_cypher(raw_predicted_cypher)
            if is_exact_match(predicted_cypher, true_cypher):
                exact_matches += 1
            similarity_scores.append(similarity_score(predicted_cypher, true_cypher))
            total_samples += 1

    # Fail with a clear message instead of a ZeroDivisionError below.
    if total_samples == 0:
        raise ValueError(f"No samples were evaluated for model {name}; the data loader is empty.")

    exact_match_accuracy = (exact_matches / total_samples) * 100
    avg_similarity = sum(similarity_scores) / total_samples

    new_result = {
        "Model Name": name,
        "Exact Match Accuracy": f"{exact_match_accuracy:.2f}%",
        "Average Similarity Score": f"{avg_similarity:.2f}",
        "Total Samples": total_samples
    }

    # Append to the shared results file so runs for several models accumulate.
    OUTPUT_PATH = os.path.join(DATASET_PATH, "zero_shot_baseline_results.csv")
    if os.path.exists(OUTPUT_PATH):
        df = pd.read_csv(OUTPUT_PATH)
        df = pd.concat([df, pd.DataFrame([new_result])], ignore_index=True)
    else:
        df = pd.DataFrame([new_result])

    df.to_csv(OUTPUT_PATH, index=False)

    print(f"✅ Results saved to {OUTPUT_PATH} for model {name}")
    print(f"✅ Exact Match Accuracy: {exact_matches}/{total_samples} ({exact_match_accuracy:.2f}%)")
    print(f"✅ Average Similarity Score: {avg_similarity:.2f}")

    return new_result

In [None]:
import gc

# Evaluate each configured model in turn. The model is released at the end of
# every iteration (and on failure) so the next checkpoint is not loaded while
# the previous one still holds accelerator memory.
for model_path, model_name in MODEL_LIST:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

    try:
        if torch.cuda.is_available():
            model = torch.compile(model)

        train_dataset = Text2CypherDataset(dataset, tokenizer)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

        generate_baseline(train_loader, model, tokenizer, model_name)
    finally:
        # Drop the notebook-level reference, collect cycles, then return cached
        # CUDA blocks to the driver.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [None]:
import pandas as pd

# OUTPUT_PATH inside generate_baseline is a local variable, so it is not
# visible at notebook level; rebuild the same results path here.
OUTPUT_PATH = os.path.join(DATASET_PATH, "zero_shot_baseline_results.csv")
df = pd.read_csv(OUTPUT_PATH)
print(df.head())