In [4]:
%pip install transformers torch pandas tqdm accelerate>=0.26.0

zsh:1: 0.26.0 not found
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from TextToCypherDataLoader import Text2CypherDataset
from torch.utils.data import DataLoader
from datasets import load_dataset

torch.backends.cuda.matmul.allow_tf32 = True

DATASET_PATH = "/work/pi_wenlongzhao_umass_edu/9/dpatel/"
MODEL_PATH = "/datasets/ai/"
DEEPSEEK_DISTILL_LLAMA_70B_PATH = "deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Llama-70B/snapshots/0d6d11a6ea1187363aa7b78543f824fc02e06b14"
DEEPSEEK_DISTILL_QWEN_7B_PATH = "deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-7B/snapshots/6602cadec947dbb53e64f3d8d6425320b2197247"

DEEPSEEK_DISTILL_LLAMA_70B = os.path.join(MODEL_PATH, DEEPSEEK_DISTILL_LLAMA_70B_PATH)
DEEPSEEK_DISTILL_QWEN_7B = os.path.join(MODEL_PATH, DEEPSEEK_DISTILL_QWEN_7B_PATH)

dataset = load_dataset("neo4j/text2cypher-2024v1")["train"].shuffle(seed=42).select(range(50))

MODEL_LIST = [
    (DEEPSEEK_DISTILL_QWEN_7B, "Deepseek Distill QWEN 7B")
]


In [6]:
tokenizer = AutoTokenizer.from_pretrained(DEEPSEEK_DISTILL_QWEN_7B)
model = AutoModelForCausalLM.from_pretrained(DEEPSEEK_DISTILL_QWEN_7B, torch_dtype=torch.float16, device_map="auto")

OSError: Incorrect path_or_model_id: '/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-7B/snapshots/6602cadec947dbb53e64f3d8d6425320b2197247'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [16]:
BATCH_SIZE = 8
train_dataset = Text2CypherDataset(dataset, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [17]:
def generate_cypher(batch_questions, batch_schemas, model, tokenizer):
    batch_inputs = [f"Convert this natural language question to a Cypher query.\nQuestion: {q}\nSchema: {s}\nCypher Query:"
                    for q, s in zip(batch_questions, batch_schemas)]
    
    inputs = tokenizer(batch_inputs, return_tensors="pt", truncation=True, padding=True, max_length=512).to("cuda")

    with torch.no_grad():
        output_tokens = model.generate(**inputs, max_new_tokens=256, do_sample=False)

    return tokenizer.batch_decode(output_tokens, skip_special_tokens=True)

In [18]:
def generate_baseline(train_loader, model, tokenizer, name):
    results = []
    correct_predictions = 0
    total_samples = 0

    for batch in tqdm(train_loader, desc=f"Generating Cypher Queries - {name}"):
        batch_questions = []
        batch_schemas = []
        batch_true_cyphers = []

        for i in range(len(batch["input_ids"])):
            batch_questions.append(dataset[i]["question"])
            batch_schemas.append(dataset[i]["schema"])
            batch_true_cyphers.append(dataset[i]["cypher"])

        predicted_cyphers = generate_cypher(batch_questions, batch_schemas, model, tokenizer)


        for true_cypher, predicted_cypher in zip(batch_true_cyphers, predicted_cyphers):
            is_correct = predicted_cypher.strip().lower() == true_cypher.strip().lower()
            if is_correct:
                correct_predictions += 1
            total_samples += 1

    new_result = {
        "Model Name": name,
        "Correctly Predicted": correct_predictions,
        "Total Samples": total_samples,
        "Accuracy": f"{(correct_predictions/total_samples)*100:.2f}%"
    }

    OUTPUT_PATH = os.path.join(DATASET_PATH, "zero_shot_baseline_results.csv")
    if os.path.exists(OUTPUT_PATH):
        df = pd.read_csv(OUTPUT_PATH)
        df = pd.concat([df, pd.DataFrame([new_result])], ignore_index=True)
    else:
        df = pd.DataFrame([new_result])
    
    df.to_csv(OUTPUT_PATH, index=False)


    print(f"Results saved to {OUTPUT_PATH} for model {name}")
    print(f"Accuracy: {correct_predictions}/{total_samples} ({(correct_predictions/total_samples)*100:.2f}%)")

Generating Cypher Queries:   0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/

Results saved to /work/pi_wenlongzhao_umass_edu/9/dpatel/zero_shot_baseline_results.csv
Accuracy: 0/10 (0.00%)





In [None]:
for model_name, name in (MODEL_LIST):
    tokenizer = AutoTokenizer.from_pretrained(DEEPSEEK_DISTILL_QWEN_7B)
    model = AutoModelForCausalLM.from_pretrained(DEEPSEEK_DISTILL_QWEN_7B, torch_dtype=torch.float16, device_map="auto")
    model = torch.compile(model)  
    generate_baseline(train_loader, model, tokenizer, name)

In [None]:
import pandas as pd
df = pd.read_csv(OUTPUT_PATH)
print(df.head())