In [1]:
# Uncomment the following line to install all dependencies into the kernel's environment
# %pip install openai instructor pydantic tqdm

# Parsing NcML using GPT4

This notebook demonstrates how to use GPT4 to extract metadata from `ncml` files drawn from Dataverse datasets. Each `ncml` file has been obtained from the [Harvard Dataverse](https://dataverse.harvard.edu/) repository, a platform that facilitates sharing and preservation of research data. For extraction, we've utilized the [Dataverse API](https://guides.dataverse.org/en/latest/api/index.html) and saved the output as an XML file.

Our goal with this notebook is to leverage GPT4 to extract metadata from `ncml` data and convert it into a structured format using [`instructor`](https://jxnl.github.io/instructor/), compliant with a questionnaire model implemented through [`pydantic`](https://docs.pydantic.dev/latest/) classes. This approach allows us to obtain metadata from the varying key-value data found in `ncml`, which can then be analyzed further and scaled with ease.

Please note: to use this notebook, you must provide an OpenAI API token via the `OPENAI_API_KEY` environment variable and have sufficient credits to use GPT-4.

In [2]:
import openai
import instructor
import glob
import os
import json
import tqdm

from model.questionaires import ResponseModel

# Read the API key once; a missing variable fails fast with a KeyError.
api_key = os.environ["OPENAI_API_KEY"]

# Kept for backward compatibility with code that reads the module-level key.
openai.api_key = api_key

# `instructor.patch` returns the patched client. Keep a reference to it so the
# patched `chat.completions.create` (which accepts `response_model`) can be
# reused, instead of discarding the patched instance.
client = instructor.patch(openai.OpenAI(api_key=api_key))

In [3]:
# Load every NcML (XML) document into memory, keyed by its file name.
# Each entry is a dict so that later cells can attach further fields
# (e.g. the model response) next to the raw content.
ncmls = {}
# Sort the matches so the processing order (and the key order of the saved
# JSON) is the same on every run; `glob` itself gives no ordering guarantee.
for path in sorted(glob.glob("../data/ncml-files/*.xml")):
    fname = os.path.basename(path)
    # The context manager closes the handle deterministically; the explicit
    # encoding avoids depending on the platform's locale default.
    with open(path, encoding="utf-8") as ncml_file:
        ncmls[fname] = {
            "content": ncml_file.read(),
        }


In [5]:
system_msg = "You are a helpful assistant who understands geospatial sciences, NetCDF Markup Language (NcML) and how to parse XML."

# Build the instructor-patched client in this cell so the request below goes
# through the same v1 client API (`openai.OpenAI`) that is patched, rather
# than through the legacy module-level `openai.ChatCompletion` interface.
client = instructor.patch(openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]))

for key, entry in tqdm.tqdm(ncmls.items()):

    # Skip files that already carry a response so the cell can be re-run
    # without repeating requests that have already completed.
    if entry.get("response") is not None:
        continue

    user_msg = (
        "Extract metadata from the following NetCDF Markup Language file (XML encoded): "
        + entry["content"]
    )

    # `response_model` is handled by the instructor patch, which returns the
    # reply parsed into a `ResponseModel` instance.
    response: ResponseModel = client.chat.completions.create(
        model="gpt-4",
        response_model=ResponseModel,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
    )

    # Store only the fields that were explicitly set on the response.
    entry["response"] = response.model_dump(exclude_unset=True)

  0%|          | 0/134 [00:00<?, ?it/s]

100%|██████████| 134/134 [21:39<00:00,  9.69s/it]


In [None]:
# Persist the collected NcML contents and responses as indented JSON.
with open("../data/gpt4_on_ncml.json", mode="w") as results_file:
    json.dump(obj=ncmls, fp=results_file, indent=2)