In [1]:
pip install -U datasets huggingface_hub fsspec transformers

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.33.5-py3-none-any.whl.metadata (14 kB)
Collecting transformers
  Downloading transformers-4.53.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.33.5-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.7/515.7 kB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m16.0 MB/s[0m eta

## Local Inference on GPU
Model page: https://huggingface.co/dataguy50/finetuned_llama_text2cypher

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/dataguy50/finetuned_llama_text2cypher)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [2]:
# Import libraries
from transformers import pipeline
from datasets import load_dataset
import random

In [3]:
# Function to format test set
def format_test_prompts(examples):

    prompt = '''
                You are an assistant designed to generate cypher statements to allow querying of a graph database. Use only the provided relationship types and properties in the provided schema.

                ### Schema:
                {}

                ### Question:
                {}

                ### Response:
            '''

    schemas = examples["schema"]
    questions = examples["question"]

    texts = []

    for schema, question, in zip(schemas, questions):

        # Must add EOS_TOKEN, otherwise your generation will go on forever
        text = prompt.format(schema, question) + '<|eot_id|>'
        texts.append(text)

    return {"prompt": texts}

In [4]:
# Load the data
ds = load_dataset("neo4j/text2cypher-2025v1")

README.md:   0%|          | 0.00/109 [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/4.52M [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/594k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/35946 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4438 [00:00<?, ? examples/s]

In [5]:
# Grab a prompt for model sampling
sample = random.choice([i for i in range(len(ds['test']))])

In [6]:
ds.column_names

{'train': ['question',
  'schema',
  'cypher',
  'data_source',
  'instance_id',
  'database_reference_alias'],
 'test': ['question',
  'schema',
  'cypher',
  'data_source',
  'instance_id',
  'database_reference_alias']}

In [7]:
# Apply function to convert test set for modeling
test =  ds['test'].map(format_test_prompts, batched = True,)

Map:   0%|          | 0/4438 [00:00<?, ? examples/s]

In [8]:
obs = test['prompt'][sample]
print(obs)


                You are an assistant designed to generate cypher statements to allow querying of a graph database. Use only the provided relationship types and properties in the provided schema.

                ### Schema:
                Node properties:
- **Question**
  - `link`: STRING Example: "https://stackoverflow.com/q/65697972"
  - `accepted_answer_id`: INTEGER Min: 61447621, Max: 69272967
  - `creation_date`: INTEGER Min: 1587746198, Max: 1632249176
  - `view_count`: INTEGER Min: 4, Max: 1851
  - `answer_count`: INTEGER Min: 0, Max: 4
  - `body_markdown`: STRING Example: "I want to create a graph from my recursive JSON fi"
  - `uuid`: INTEGER Min: 61413144, Max: 69273945
  - `title`: STRING Example: "Create graph from recursive JSON data using apoc.l"
- **User**
  - `uuid`: INTEGER Min: deleted, Max: 16922964
  - `display_name`: STRING Example: "schernichkin"
- **Tag**
  - `name`: STRING Example: "neo4j"
  - `link`: STRING Example: "https://stackoverflow.com/questions/tagged

In [9]:
# Use a pipeline as a high-level helper
pipe = pipeline("text-generation", model = "dataguy50/finetuned_llama_text2cypher")
messages = [{"role": "user", "content": obs},]
pipe(messages)

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Device set to use cuda:0


[{'generated_text': [{'role': 'user',
    'content': '\n                You are an assistant designed to generate cypher statements to allow querying of a graph database. Use only the provided relationship types and properties in the provided schema.\n\n                ### Schema:\n                Node properties:\n- **Question**\n  - `link`: STRING Example: "https://stackoverflow.com/q/65697972"\n  - `accepted_answer_id`: INTEGER Min: 61447621, Max: 69272967\n  - `creation_date`: INTEGER Min: 1587746198, Max: 1632249176\n  - `view_count`: INTEGER Min: 4, Max: 1851\n  - `answer_count`: INTEGER Min: 0, Max: 4\n  - `body_markdown`: STRING Example: "I want to create a graph from my recursive JSON fi"\n  - `uuid`: INTEGER Min: 61413144, Max: 69273945\n  - `title`: STRING Example: "Create graph from recursive JSON data using apoc.l"\n- **User**\n  - `uuid`: INTEGER Min: deleted, Max: 16922964\n  - `display_name`: STRING Example: "schernichkin"\n- **Tag**\n  - `name`: STRING Example: "neo4j"