<a href="https://colab.research.google.com/github/elizabethavargas/Dataset-Description-Generation/blob/main/generate_autoddg_descriptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## AutoDDG Descriptions
Here we generate descriptions for a random sample of 500 NYC Open Data datasets using AutoDDG; these descriptions will serve as the baseline for our model.

In [1]:
!pip install git+https://github.com/VIDA-NYU/AutoDDG@main

import pandas as pd
from openai import OpenAI

from autoddg import AutoDDG, GPTEvaluator
from autoddg.utils import get_sample

Collecting git+https://github.com/VIDA-NYU/AutoDDG@main
  Cloning https://github.com/VIDA-NYU/AutoDDG (to revision main) to /tmp/pip-req-build-go3xv_jt
  Running command git clone --filter=blob:none --quiet https://github.com/VIDA-NYU/AutoDDG /tmp/pip-req-build-go3xv_jt
  Resolved https://github.com/VIDA-NYU/AutoDDG to commit 5f26a43b216d2ee4079b8e4969397a12d414db95
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


### Run AutoDDG on a random sample of datasets


In [3]:
import random
import pickle

# Take a reproducible random sample of datasets and persist it so later cells
# (and later sessions) work on exactly the same entries.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42  # fixed seed: Restart & Run All must select the same datasets

# NOTE(review): assumes datasets.pkl holds a list (a sequence) of dataset
# entries, as random.sample requires — confirm against how the file is built.
datasets = pd.read_pickle('datasets.pkl')

# A dedicated Random instance keeps the draw deterministic without touching the
# global `random` state that other code may rely on.
sampled_datasets = random.Random(SAMPLE_SEED).sample(datasets, SAMPLE_SIZE)

with open('sampled_datasets.pkl', 'wb') as f:
    pickle.dump(sampled_datasets, f)

In [6]:
# create model object
import os
from getpass import getpass

# Resolve the API key without hard-coding it in the notebook. Order of precedence:
#   1. a `KEY` variable already defined in this session (backward compatible),
#   2. the DEEPINFRA_API_KEY environment variable (this notebook's convention),
#   3. an interactive, non-echoing prompt.
key = (
    globals().get("KEY")
    or os.environ.get("DEEPINFRA_API_KEY")
    or getpass("DeepInfra API key: ")
)

# OpenAI-compatible client pointed at the DeepInfra endpoint.
client = OpenAI(
    api_key=key,
    base_url="https://api.deepinfra.com/v1/openai",
)
auto_ddg = AutoDDG(
    client=client,
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    description_temperature=0.0,
)

In [26]:
import concurrent.futures
import pandas as pd
import time
from openai import OpenAI
from autoddg import AutoDDG
from autoddg.utils import get_sample


def process_single_dataset(dataset_entry, api_key, max_retries=3, retry_delay=2):
    """Generate an AutoDDG description for one dataset, retrying on failure.

    Every attempt builds its own OpenAI client and AutoDDG instance, wraps the
    entry's ``data_example`` in a one-row DataFrame, profiles it, asks the
    model for a topic and then for a description. Any ``Exception`` raised
    during an attempt is printed and the attempt is repeated after a pause;
    once the attempts are exhausted the failure is reported in the returned
    dict instead of being raised.

    Args:
        dataset_entry: Mapping that is read for the keys ``dataset_name``,
            ``description``, ``data_example`` and ``dataset_id``.
            NOTE(review): ``data_example`` is presumably one example row as a
            column -> value dict — confirm against how the entries are built.
        api_key: API key handed to the OpenAI-compatible DeepInfra endpoint.
        max_retries: Total number of attempts. Values below 1 are treated as
            1 so that a result dict is always returned.
        retry_delay: Seconds to sleep between attempts (a fixed delay, not an
            increasing one).

    Returns:
        ``{"id", "prompt", "description"}`` on success, or
        ``{"id", "error", "prompt": None, "description": None}`` after the
        last failed attempt. ``id`` is ``None`` when the entry has no
        ``dataset_id``.
    """
    # Read the id once and tolerantly: the failure path has to be able to
    # report an entry even when the id itself is what is missing.
    dataset_id = dataset_entry.get("dataset_id")

    # Guarantee at least one attempt so the function never falls off the end
    # of the loop and returns None.
    total_attempts = max(1, max_retries)

    # retry loop to handle rate-limit errors or corrupted responses
    for attempt in range(1, total_attempts + 1):
        try:
            # Create fresh, isolated client
            local_client = OpenAI(
                api_key=api_key,
                base_url="https://api.deepinfra.com/v1/openai"
            )

            local_auto_ddg = AutoDDG(
                client=local_client,
                model_name="meta-llama/Meta-Llama-3-8B-Instruct",
                description_temperature=0.0
            )

            # extract dataset info (a missing key is reported as a failure)
            title = dataset_entry["dataset_name"]
            original_description = dataset_entry["description"]
            data_dict = dataset_entry["data_example"]

            # one-row dataframe built from the single example record
            data_df = pd.DataFrame([data_dict])

            # sample rows
            sample_df, dataset_sample = get_sample(data_df, sample_size=1)

            # profiles
            basic_profile, structural_profile = local_auto_ddg.profile_dataframe(data_df)
            semantic_profile_details = local_auto_ddg.analyze_semantics(sample_df)

            # join only the non-empty profile sections
            semantic_profile = "\n".join(
                section for section in [structural_profile, semantic_profile_details]
                if section
            )

            # Topic
            data_topic = local_auto_ddg.generate_topic(
                title=title,
                original_description=original_description,
                dataset_sample=dataset_sample,
            )

            # Description
            prompt, description = local_auto_ddg.describe_dataset(
                dataset_sample=dataset_sample,
                dataset_profile=basic_profile,
                use_profile=True,
                semantic_profile=semantic_profile,
                use_semantic_profile=True,
                data_topic=data_topic,
                use_topic=True,
            )

            # SUCCESS
            return {
                "id": dataset_id,
                "prompt": prompt,
                "description": description,
            }

        except Exception as e:
            print(
                f"[{dataset_id}] Attempt {attempt}/{total_attempts} failed: {e}"
            )
            if attempt == total_attempts:
                # give up
                return {
                    "id": dataset_id,
                    "error": str(e),
                    "prompt": None,
                    "description": None
                }

            time.sleep(retry_delay)  # fixed pause before the next attempt


processed_results = []

# Fan the sampled datasets out over worker processes; every task receives the
# dataset entry and the API key and returns a result dict.
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    future_to_dataset = {
        executor.submit(process_single_dataset, dataset, key): dataset
        for dataset in sampled_datasets
    }

    # Collect results in completion order (not submission order).
    for future in concurrent.futures.as_completed(future_to_dataset):
        dataset = future_to_dataset[future]
        try:
            result = future.result()
        except Exception as exc:
            # Exceptions inside process_single_dataset are already turned into
            # error dicts; this branch only fires when the worker itself fails
            # (e.g. the process died or the task/result could not be pickled).
            # Record it instead of aborting the loop and losing the other results.
            result = {
                "id": dataset.get("dataset_id") if isinstance(dataset, dict) else None,
                "error": f"worker failed: {exc!r}",
                "prompt": None,
                "description": None,
            }
        processed_results.append(result)

# Summary output
print(f"Processed {len(processed_results)} datasets.\n")

# Show first successful result
first_ok = next((r for r in processed_results if r.get("prompt")), None)
if first_ok:
    print("First processed dataset with valid results:")
    print(first_ok)
else:
    print("No successful dataset found.")


[r6ub-zhff] Attempt 1/3 failed: 'dict' object has no attribute 'lower'
[7umd-hdjb] Attempt 1/3 failed: 'bool' object has no attribute 'get'




[rdjw-z878] Attempt 1/3 failed: 'bool' object has no attribute 'get'
[r6ub-zhff] Attempt 2/3 failed: 'str' object has no attribute 'get'
[m64b-i6yz] Attempt 1/3 failed: 'bool' object has no attribute 'get'




[vf4p-p8ui] Attempt 1/3 failed: 'bool' object has no attribute 'get'
[ya22-5bh7] Attempt 1/3 failed: 'NoneType' object has no attribute 'get'
[ucdy-byxd] Attempt 1/3 failed: 'dict' object has no attribute 'lower'
[wtqm-fd2z] Attempt 1/3 failed: 'str' object has no attribute 'get'
[rhe8-mgbb] Attempt 1/3 failed: 'dict' object has no attribute 'lower'
[pjs3-c3z5] Attempt 1/3 failed: 'bool' object has no attribute 'get'
[r6ub-zhff] Attempt 3/3 failed: Error code: 400 - {'error': {'message': 'Input too long'}}
[usc3-8zwd] Attempt 1/3 failed: 'str' object has no attribute 'get'
[yeba-ynb5] Attempt 1/3 failed: Error code: 400 - {'error': {'message': 'Input too long'}}
[5kqf-fg3n] Attempt 1/3 failed: 'dict' object has no attribute 'lower'
[usc3-8zwd] Attempt 2/3 failed: 'dict' object has no attribute 'lower'
[tyv9-j3ti] Attempt 1/3 failed: 'dict' object has no attribute 'lower'
[yeba-ynb5] Attempt 2/3 failed: Error code: 400 - {'error': {'message': 'Input too long'}}
[m8p6-tp4b] Attempt 1/3 f

In [27]:
# Persist the collected per-dataset results (successes and failures alike)
# as a pickle file for later analysis.
with open('autoddg_results.pkl', 'wb') as results_file:
    pickle.dump(processed_results, results_file)