<a href="https://colab.research.google.com/github/elizabethavargas/Dataset-Description-Generation/blob/main/testing_prompts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Testing Prompts
The paper used GPT-4o-mini and LLaMA-3.1-8B-Instruct. However, in order to make generation and


models = [
    "unsloth/Meta-Llama-3.1-8B-Instruct",
    "unsloth/Meta-Llama-3.1-70B-Instruct",
    "unsloth/Qwen2-72B-Instruct",
    "unsloth/Qwen2-7B-Instruct",
]


### Setup LLMs

In [1]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git@31b667b54139962832ea2de890383eed14a0a17d"
import unsloth
from unsloth import FastLanguageModel
import torch
import pandas as pd
from tqdm import tqdm

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git@31b667b54139962832ea2de890383eed14a0a17d (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git@31b667b54139962832ea2de890383eed14a0a17d)
  Using cached unsloth-2025.10.10-py3-none-any.whl
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Create Objects

In [2]:
# Checkpoints HFGenerator is allowed to load.
generation_models = [
    "unsloth/Meta-Llama-3.1-70B-Instruct",
    "unsloth/Meta-Llama-3.1-8B-Instruct",
    "unsloth/Qwen2-7B-Instruct",
    "unsloth/Qwen2-72B-Instruct",
]


class HFGenerator:
    """Generates descriptions using a Hugging Face model.

    Attributes:
        model_name: Name of the loaded checkpoint.
        model: Model returned by ``FastLanguageModel.from_pretrained``.
        tokenizer: Tokenizer returned alongside the model.
        eos_id: The tokenizer's own end-of-sequence token id.
        eot_id: Id of the end-of-turn token (``<|eot_id|>`` for Llama,
            ``<|im_end|>`` for Qwen); may be ``None`` if the tokenizer
            does not know that token.
    """

    def __init__(self, model_name="unsloth/Meta-Llama-3.1-8B-Instruct"):
        """Load ``model_name`` in 4-bit and prepare it for inference.

        Args:
            model_name: One of the entries in ``generation_models``.

        Raises:
            ValueError: If ``model_name`` is not in ``generation_models``.
        """
        if model_name not in generation_models:
            raise ValueError(f"Model '{model_name}' is not in the list of available models. Please choose from: {generation_models}")

        # Load the model and tokenizer from Hugging Face
        self.model_name = model_name
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=4096,
            dtype=None,
            load_in_4bit=True,
        )

        # Prepare model for inference
        FastLanguageModel.for_inference(self.model)

        # Keep the special tokens the checkpoint ships with; only look up the
        # chat end-of-turn marker so generation can stop on it as well.
        end_of_turn = "<|im_end|>" if "Qwen" in model_name else "<|eot_id|>"
        self.eos_id = self.tokenizer.eos_token_id
        self.eot_id = self.tokenizer.convert_tokens_to_ids(end_of_turn)

    def generate_description(self, prompt, temperature=0.0):
        """Generates a description given a prompt and temperature.

        Args:
            prompt: Full prompt text passed to the tokenizer.
            temperature: Sampling temperature. Values ``<= 0`` select greedy
                decoding; positive values enable sampling.

        Returns:
            The newly generated text only (prompt tokens excluded), stripped
            of surrounding whitespace.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # De-duplicate (Llama's EOS can already be <|eot_id|>) and drop ids
        # the tokenizer could not resolve.
        stop_ids = [
            token_id
            for token_id in dict.fromkeys((self.eos_id, self.eot_id))
            if token_id is not None
        ]

        generate_kwargs = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "max_new_tokens": 200,
            "do_sample": temperature > 0,
            "num_beams": 1,
            "eos_token_id": stop_ids or None,
            "pad_token_id": self.eos_id,
            "use_cache": True,
        }
        # Temperature only applies when sampling; leaving it out for greedy
        # decoding avoids handing generate() an unused, invalid 0.0.
        if temperature > 0:
            generate_kwargs["temperature"] = temperature

        with torch.no_grad():
            outputs = self.model.generate(**generate_kwargs)

        # Slice by token count rather than by len(prompt): decoding does not
        # always reproduce the prompt text character for character.
        prompt_length = inputs.input_ids.shape[1]
        completion_ids = outputs[0][prompt_length:]
        return self.tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

In [None]:
generation_models = [
    "unsloth/Meta-Llama-3.1-70B-Instruct",
    "unsloth/Meta-Llama-3.1-8B-Instruct",
    "unsloth/Qwen2-7B-Instruct",
    "unsloth/Qwen2-72B-Instruct",
]


class HFGenerator:
    """Generates descriptions using a Hugging Face model.

    Attributes:
        model_name: Name of the loaded checkpoint.
        model: Model returned by ``FastLanguageModel.from_pretrained``.
        tokenizer: Tokenizer returned alongside the model.
        eos_ids: Token ids that end generation (the tokenizer's EOS plus the
            chat end-of-turn token), without duplicates or unresolved ids.
    """

    def __init__(self, model_name):
        """Load ``model_name`` in 4-bit and switch it to inference mode.

        Args:
            model_name: One of the entries in ``generation_models``.

        Raises:
            ValueError: If ``model_name`` is not in ``generation_models``.
        """
        if model_name not in generation_models:
            raise ValueError(f"Model '{model_name}' is not in the list of available models. "
                             f"Choose from: {generation_models}")

        self.model_name = model_name

        # Load model + tokenizer
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=4096,
            dtype=None,
            load_in_4bit=True,
        )

        FastLanguageModel.for_inference(self.model)

        # Do not overwrite the tokenizer's special tokens: a token string that
        # is missing from the vocabulary yields an unusable id. Only look up
        # the chat end-of-turn marker for the model family.
        turn_end = "<|im_end|>" if "Qwen" in model_name else "<|eot_id|>"
        candidates = (
            self.tokenizer.eos_token_id,
            self.tokenizer.convert_tokens_to_ids(turn_end),
        )
        self.eos_ids = [tid for tid in dict.fromkeys(candidates) if tid is not None]

    def generate_description(self, prompt, temperature=0.0):
        """Generates a description given a prompt and temperature.

        Args:
            prompt: Full prompt text passed to the tokenizer.
            temperature: Sampling temperature. Values ``<= 0`` select greedy
                decoding; positive values enable sampling.

        Returns:
            The newly generated text only (prompt tokens excluded), stripped
            of surrounding whitespace.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        do_sample = temperature > 0

        # Fall back to EOS when the tokenizer defines no padding token.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id

        # One code path for every model family; only the stop ids differ.
        generate_kwargs = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "max_new_tokens": 200,
            "do_sample": do_sample,
            "num_beams": 1,
            "eos_token_id": self.eos_ids or None,
            "pad_token_id": pad_id,
            "use_cache": True,
        }
        # Temperature is meaningful only when sampling.
        if do_sample:
            generate_kwargs["temperature"] = temperature

        with torch.no_grad():
            outputs = self.model.generate(**generate_kwargs)

        # Drop the prompt by token count; slicing the decoded text by
        # len(prompt) breaks whenever decoding does not round-trip the prompt.
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


In [None]:
# Instantiate the generator for the 8B Llama checkpoint (HFGenerator loads it in 4-bit).
llama_generator = HFGenerator("unsloth/Meta-Llama-3.1-8B-Instruct")


In [None]:
# NOTE: `test_prompt` is assigned in the "Basic Testing" cell further down;
# run that cell before this one.
llama_generator.generate_description(test_prompt)

In [None]:
# Instantiate the generator for the 7B Qwen2 checkpoint and run the same test prompt.
# NOTE(review): the recorded output for this cell ends in a ValueError about
# modules being dispatched to CPU/disk on a Tesla T4 — presumably the Llama
# model loaded earlier still occupies GPU memory; TODO confirm and free it first.
qwen_generator = HFGenerator("unsloth/Qwen2-7B-Instruct")
qwen_generator.generate_description(test_prompt)

==((====))==  Unsloth 2025.10.10: Fast Qwen2 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

## Basic Testing

In [None]:
# Hand-written smoke-test prompt for one NYC Open Data dataset. The dashes and
# curly quotes are real Unicode punctuation (previously mis-encoded), so the
# model receives clean text.
test_prompt = """You are a data documentation expert.
Your task is to rewrite the dataset description so it sounds professional, informative, and engaging — suitable for the NYC Open Data catalog.

Dataset title: 2019 For Hire Vehicles Trip Data
Category: Transportation
Agency: Taxi and Limousine Commission (TLC)
Tags: ['taxi', 'trip data', 'fhv', 'trip', 'base', 'high volume', 'uber', 'lyft', 'via']

Current description:
"These records are generated from the For-Hire Vehicle (“FHV”) Trip Record submissions made by traditional livery, luxury, and black car bases. The FHV trip records include fields capturing the dispatching base license number and the pick-up date, time, and taxi zone location ID, which correspond with the NYC Taxi Zones open dataset. Each row represents a single trip in an FHV."

Example row:
{
  "dispatching_base_num": "B01239",
  "pickup_datetime": "2019-01-01T00:10:37.000",
  "dropoff_datetime": "2019-01-01T00:26:19.000",
  "dolocationid": "265"
}

Column definitions:
dispatching_base_num: The TLC Base License Number of the base that dispatched the trip
pickup_datetime: The date and time of the trip pick-up
dropOff_datetime: The date and time of the trip dropoff
PUlocationID: TLC Taxi Zone in which the trip began
DOlocationID: TLC Taxi Zone in which the trip ended
SR_Flag: Indicates if the trip was a part of a shared ride chain offered by a High Volume FHV company (e.g. Uber Pool, Lyft Line).
Affiliated_base_number: Base number of the base with which the vehicle is affiliated.

When improving the description:
- Do NOT restate or list individual column definitions.
- Expand on what the dataset enables — such as transportation planning, ride-share regulation, equity analysis, or urban mobility research.
- Include *context* (why this data matters, who uses it, what insights it offers).
- Use confident, clear, natural language.
- Keep it concise (1–2 paragraphs).
- Write as if it were the official NYC Open Data description.

**Improved description:**
"""


## Create & Apply Prompt Templates
The first prompt is adapted from the AutoDDG prompts.

In [None]:
# Per-dataset metadata slots. They start out empty and are filled in for each
# dataset before a prompt is built.
description = None
dataset_sample = None
title = None
agency = None
category = None
column_definitions = None
tags = None


# The sections below are deferred `str.format` templates, not f-strings: an
# f-string would be evaluated right here, while every slot above is still
# None, and would bake the text "None" into the prompt permanently.
system_message = """You are an assistant for a dataset search engine. Your goal
is to improve the readability of dataset descriptions for dataset search engine users."""

introduction = """Answer the question using the following information.

    First, consider the dataset sample:

    {dataset_sample}"""

initial_description = """The initial description is {description}."""

title_agency_cat = """Additionally the dataset title is {title}, the agency is {agency} and the category is
{category}. Based on this topic and agency, please add sentence(s) describing what this
dataset can be used for."""

tag = """The tags are {tags}."""

column_defs = """Additionally, the column definitions are {column_definitions}."""

closing_instruction = """Question: Based on the information above and the
requirements, provide a dataset description in sentences. Use only natural,
readable sentences without special formatting."""


def build_prompt(dataset_sample=None, description=None, title=None, agency=None,
                 category=None, column_definitions=None, tags=None):
    """Assemble one prompt from the section templates.

    A section is included only when its driving value is not ``None``; the
    title/agency/category section is driven by ``title``. The system message
    always comes first and the closing instruction last.

    Args:
        dataset_sample: Example row(s) of the dataset.
        description: Existing dataset description.
        title: Dataset title.
        agency: Publishing agency (used only together with ``title``).
        category: Dataset category (used only together with ``title``).
        column_definitions: Column documentation.
        tags: Dataset tags.

    Returns:
        The prompt text, with sections separated by a blank line.
    """
    sections = [system_message]
    if dataset_sample is not None:
        sections.append(introduction.format(dataset_sample=dataset_sample))
    if description is not None:
        sections.append(initial_description.format(description=description))
    if title is not None:
        sections.append(
            title_agency_cat.format(title=title, agency=agency, category=category)
        )
    if tags is not None:
        sections.append(tag.format(tags=tags))
    if column_definitions is not None:
        sections.append(column_defs.format(column_definitions=column_definitions))
    sections.append(closing_instruction)
    return "\n\n".join(sections)



In [None]:
# read datasets.pkl
# NOTE: unpickling can execute arbitrary code — only load a datasets.pkl
# that comes from a trusted source.
import pandas as pd
datasets = pd.read_pickle("datasets.pkl")
# Display the record at index 1 to inspect which keys are available.
datasets[1]

{'dataset_id': 'npwk-bcm6',
 'data_example': {'school_year': '2006-2007',
  'report_type': 'Citywide',
  'program': 'GENERAL EDUCATION',
  'grade_or_service_category': 'Kindergarten',
  'average_class_size': '20.7'},
 'dataset_name': 'Class Size Report (2006-2007)',
 'category': 'Education',
 'description': 'For schools with students in any grades between Kindergarten and 9th grade (where 9th grade is the termination grade for the school), class size is reported by four program areas: general education, special education self-contained class, collaborative team teaching and gifted and talented self-contained class. Within each program area class size is reported by grade or service category, which indicates how a special education self-contained class is delivered. Class size is calculated by dividing the number of students in a program and grade by the number of official classes in that program and grade.\nThe following data is excluded from all the reports: District 75 schools, bridg

In [None]:
# HFGenerator loads the model and tokenizer itself and takes a model name, so
# loading them separately and passing the objects in would both fail and
# load the weights twice.
generator = HFGenerator("unsloth/Meta-Llama-3.1-8B-Instruct")

# One entry per dataset: {"dataset_id": ..., "description": <generated text>}.
new_descriptions = []

for dataset in datasets:
    # The displayed record uses 'data_example', 'dataset_name', 'category' and
    # 'description'. 'agency', 'column_info' and 'tags' are not visible in the
    # displayed record — presumably present; `.get` makes a missing key skip
    # its section instead of raising KeyError.
    dataset_sample = dataset.get("data_example")
    description = dataset.get("description")
    title = dataset.get("dataset_name")
    agency = dataset.get("agency")
    category = dataset.get("category")
    column_definitions = dataset.get("column_info")
    tags = dataset.get("tags")

    # Render every section here, with this dataset's values, so no section
    # carries values from an earlier dataset or from before the loop.
    sections = [system_message]
    if dataset_sample is not None:
        sections.append(f"""Answer the question using the following information.

    First, consider the dataset sample:

    {dataset_sample}""")
    if description is not None:
        sections.append(f"""The initial description is {description}.""")
    if title is not None:
        sections.append(f"""Additionally the dataset title is {title}, the agency is {agency} and the category is
{category}. Based on this topic and agency, please add sentence(s) describing what this
dataset can be used for.""")
    if tags is not None:
        sections.append(f"""The tags are {tags}.""")
    if column_definitions is not None:
        sections.append(f"""Additionally, the column definitions are {column_definitions}.""")
    sections.append(closing_instruction)
    prompt = "\n\n".join(sections)

    # No `openAIGenerator` is defined in this notebook, so only the Hugging
    # Face generator is used; its output is kept rather than discarded.
    new_description = generator.generate_description(prompt)
    new_descriptions.append({
        "dataset_id": dataset.get("dataset_id"),
        "description": new_description,
    })