In [3]:
%pip install transformers torch pandas tqdm accelerate>=0.26.0

[0mNote: you may need to restart the kernel to use updated packages.


In [15]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from TextToCypherDataLoader import Text2CypherDataset
from torch.utils.data import DataLoader
from datasets import load_dataset

# Path to your dataset and models
DATASET_PATH = "/work/pi_wenlongzhao_umass_edu/9/dpatel/"
MODEL_PATH = "/datasets/ai/"
DEEPSEEK_DISTILL_LLAMA_70B_PATH = "deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Llama-70B/snapshots/0d6d11a6ea1187363aa7b78543f824fc02e06b14"
DEEPSEEK_DISTILL_QWEN_7B_PATH = "deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-7B/snapshots/6602cadec947dbb53e64f3d8d6425320b2197247"

DEEPSEEK_DISTILL_LLAMA_70B = os.path.join(MODEL_PATH, DEEPSEEK_DISTILL_LLAMA_70B_PATH)
DEEPSEEK_DISTILL_QWEN_7B = os.path.join(MODEL_PATH, DEEPSEEK_DISTILL_QWEN_7B_PATH)

dataset = load_dataset("neo4j/text2cypher-2024v1")["train"].shuffle(seed=42).select(range(10))


In [2]:
tokenizer = AutoTokenizer.from_pretrained(DEEPSEEK_DISTILL_QWEN_7B)
model = AutoModelForCausalLM.from_pretrained(DEEPSEEK_DISTILL_QWEN_7B, torch_dtype=torch.float16, device_map="auto")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
BATCH_SIZE = 8
train_dataset = Text2CypherDataset(dataset, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [17]:
def generate_cypher(question, schema):
    input_text = f"Convert this natural language question to a Cypher query.\nQuestion: {question}\nSchema: {schema}\nCypher Query:"
    
    # inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to("cuda")
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    
    with torch.no_grad():
        output_tokens = model.generate(**inputs, max_length=512, max_new_tokens=256, do_sample=True, temperature=0.7)

    generated_cypher = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    
    return generated_cypher.strip()

In [18]:
results = []
correct_predictions = 0
total_samples = 0

for batch in tqdm(train_loader, desc="Generating Cypher Queries"):
    for i in range(len(batch["input_ids"])):
        question = dataset[i]["question"]
        schema = dataset[i]["schema"]
        true_cypher = dataset[i]["cypher"]

        predicted_cypher = generate_cypher(question, schema)

        is_correct = predicted_cypher.strip().lower() == true_cypher.strip().lower()
        if is_correct:
            correct_predictions += 1

        total_samples += 1

results.append({
    "Model Name": DEEPSEEK_DISTILL_LLAMA_70B.split("/")[-1],
    "Correctly Predicted": correct_predictions,
    "Total Samples": total_samples,
    "Accuracy": f"{(correct_predictions/total_samples)*100:.2f}%"
})

# Convert results to DataFrame
df = pd.DataFrame(results)

# Save results as CSV
OUTPUT_PATH = os.path.join(DATASET_PATH, "zero_shot_baseline_results.csv")
df.to_csv(OUTPUT_PATH, index=False)

print(f"Results saved to {OUTPUT_PATH}")
print(f"Accuracy: {correct_predictions}/{total_samples} ({(correct_predictions/total_samples)*100:.2f}%)")

Generating Cypher Queries:   0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/

Results saved to /work/pi_wenlongzhao_umass_edu/9/dpatel/zero_shot_baseline_results.csv
Accuracy: 0/10 (0.00%)





In [None]:
import pandas as pd
df = pd.read_csv(OUTPUT_PATH)
print(df.head())