Skip to content

feat: add dataset tools and libraries #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
test.gpt
bin/
.idea/
.vscode/
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# build compiles the Go module in the current directory into
# bin/gptscript-go-tool with cgo disabled. Optional build tags are taken from
# the GO_TAGS variable, and the "-s -w" linker flags omit the symbol table and
# DWARF debug information from the resulting binary.
.PHONY: build
build:
CGO_ENABLED=0 go build -o bin/gptscript-go-tool -tags "${GO_TAGS}" -ldflags "-s -w" .
11 changes: 11 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
module github.com/gptscript-ai/datasets

go 1.23.2

require github.com/stretchr/testify v1.9.0

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
10 changes: 10 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
38 changes: 38 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package main

import (
"fmt"
"os"

"github.com/gptscript-ai/datasets/pkg/tools"
)

// main is the entry point of the gptscript-go-tool binary.
//
// The first command-line argument selects the subcommand to run; every other
// input is read from environment variables. GPTSCRIPT_WORKSPACE_DIR is always
// required, and each subcommand additionally reads the variables passed to the
// corresponding tools function below (DATASETID, ELEMENT, DATASETNAME,
// DATASETDESCRIPTION, ELEMENTNAME, ELEMENTDESCRIPTION, ELEMENTCONTENT).
//
// The process exits with status 1 when no subcommand is given, when the
// workspace directory is not set, or when the subcommand is unknown.
func main() {
	if len(os.Args) < 2 {
		fmt.Println(`usage: gptscript-go-tool <command>
subcommands: listDatasets, listElements, getElement, createDataset, addElement
env vars: GPTSCRIPT_WORKSPACE_DIR`)
		// Stop here: without a subcommand the os.Args[1] lookup below would
		// panic with an index-out-of-range error.
		os.Exit(1)
	}

	workspace := os.Getenv("GPTSCRIPT_WORKSPACE_DIR")
	if workspace == "" {
		fmt.Println("missing GPTSCRIPT_WORKSPACE_DIR")
		os.Exit(1)
	}

	switch os.Args[1] {
	case "listDatasets":
		tools.ListDatasets(workspace)
	case "listElements":
		tools.ListElements(workspace, os.Getenv("DATASETID"))
	case "getElement":
		tools.GetElement(workspace, os.Getenv("DATASETID"), os.Getenv("ELEMENT"))
	case "createDataset":
		tools.CreateDataset(workspace, os.Getenv("DATASETNAME"), os.Getenv("DATASETDESCRIPTION"))
	case "addElement":
		tools.AddElement(workspace, os.Getenv("DATASETID"), os.Getenv("ELEMENTNAME"), os.Getenv("ELEMENTDESCRIPTION"), []byte(os.Getenv("ELEMENTCONTENT")))
	default:
		fmt.Printf("unknown command: %s\n", os.Args[1])
		os.Exit(1)
	}
}
110 changes: 110 additions & 0 deletions pkg/dataset/dataset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package dataset

import (
"encoding/json"
"fmt"
"os"
"path/filepath"

"github.com/gptscript-ai/datasets/pkg/util"
)

// ElementMeta is the descriptive information about a dataset element: its
// name and a free-form description. It is what Dataset.ListElements returns
// for each element.
type ElementMeta struct {
// Name is the element's name; Dataset uses it as the key of its Elements map.
Name string `json:"name"`
// Description is the human-readable description supplied when the element was added.
Description string `json:"description"`
}

// Element is a dataset entry: its metadata plus the name of the file that
// holds its contents.
type Element struct {
// ElementMeta is embedded, so encoding/json emits its fields at the same
// level as File. NOTE(review): "inline" is not an option defined by
// encoding/json; the flattening comes from the embedding itself.
ElementMeta `json:",inline"`
// File is the element's file name, resolved against Dataset.BaseDir when
// the contents are read or written.
File string `json:"file"`
}

// DatasetMeta is the descriptive information about a dataset: its identifier,
// name and description.
type DatasetMeta struct {
// ID identifies the dataset; it is the value returned by Dataset.GetID.
ID string `json:"id"`
// Name is the dataset's name.
Name string `json:"name"`
// Description is the human-readable description of the dataset.
Description string `json:"description"`
}

// Dataset is a named collection of elements whose contents are stored as
// files under BaseDir. The struct itself is what gets serialized to JSON when
// the dataset is saved.
type Dataset struct {
// DatasetMeta is embedded, so encoding/json emits its fields at the top
// level of the dataset object. NOTE(review): "inline" is not an option
// defined by encoding/json; the flattening comes from the embedding itself.
DatasetMeta `json:",inline"`
// BaseDir is the directory holding the element files. The dataset's own
// JSON file is written to BaseDir with the package's extension appended.
BaseDir string `json:"baseDir,omitempty"`
// Elements maps an element's name to its record.
Elements map[string]Element `json:"elements"`
}

// GetID returns the identifier stored in the dataset's metadata.
func (d *Dataset) GetID() string {
	return d.DatasetMeta.ID
}

// GetName returns the name stored in the dataset's metadata.
func (d *Dataset) GetName() string {
	return d.DatasetMeta.Name
}

// GetDescription returns the description stored in the dataset's metadata.
func (d *Dataset) GetDescription() string {
	return d.DatasetMeta.Description
}

// GetLength reports how many elements the dataset currently holds.
func (d *Dataset) GetLength() int {
	count := len(d.Elements)
	return count
}

// ListElements returns the metadata of every element in the dataset.
//
// The result is built by ranging over the Elements map, so its order is
// unspecified and may differ between calls. A dataset without elements yields
// a nil slice.
func (d *Dataset) ListElements() []ElementMeta {
	var metas []ElementMeta
	for key := range d.Elements {
		metas = append(metas, d.Elements[key].ElementMeta)
	}
	return metas
}

// GetElement looks up the element registered under name and reads its
// contents from the dataset's base directory.
//
// It returns the file contents together with the element's record. An error
// is returned when no element with that name exists or when the element's
// file cannot be read; in both cases the other results are zero values.
func (d *Dataset) GetElement(name string) ([]byte, Element, error) {
	e, exists := d.Elements[name]
	if !exists {
		return nil, Element{}, fmt.Errorf("element %s not found", name)
	}

	// Build the path with filepath.Join, the same way AddElement does. Manual
	// concatenation with the separator would turn an empty BaseDir into a
	// path rooted at the filesystem root.
	contents, err := os.ReadFile(filepath.Join(d.BaseDir, e.File))
	if err != nil {
		return nil, Element{}, fmt.Errorf("failed to read element %s: %w", name, err)
	}

	return contents, e, nil
}

// AddElement stores contents as a new element called name and persists the
// updated dataset.
//
// The contents are written to a file inside BaseDir whose name is derived
// from name via util.ToFileName and util.EnsureUniqueFilename; the element is
// then registered in the Elements map and the dataset is saved.
//
// It returns the new element's record. An error is returned when an element
// with that name already exists, when a file name cannot be generated, when
// the contents cannot be written, or when saving the dataset fails. On any
// error the returned Element is the zero value.
func (d *Dataset) AddElement(name, description string, contents []byte) (Element, error) {
	if _, exists := d.Elements[name]; exists {
		return Element{}, fmt.Errorf("element %s already exists", name)
	}

	fileName, err := util.EnsureUniqueFilename(d.BaseDir, util.ToFileName(name))
	if err != nil {
		return Element{}, fmt.Errorf("failed to generate unique file name: %w", err)
	}

	loc := filepath.Join(d.BaseDir, fileName)
	if err := os.WriteFile(loc, contents, 0644); err != nil {
		return Element{}, fmt.Errorf("failed to write element %s: %w", name, err)
	}

	e := Element{
		ElementMeta: ElementMeta{
			Name:        name,
			Description: description,
		},
		File: fileName,
	}

	// A zero-value Dataset, or one decoded from JSON with a null "elements"
	// field, has a nil map; assigning into it would panic.
	if d.Elements == nil {
		d.Elements = make(map[string]Element)
	}
	d.Elements[name] = e

	if err := d.save(); err != nil {
		// Roll back so the in-memory dataset does not list an element that
		// was never persisted. Removing the content file is best effort: the
		// save error is the one worth reporting.
		// NOTE(review): assumes EnsureUniqueFilename returned a name that did
		// not exist before this call, so the file removed here is the one
		// written above - confirm against pkg/util.
		delete(d.Elements, name)
		_ = os.Remove(loc)
		return Element{}, err
	}

	return e, nil
}

// save encodes the dataset as JSON and writes it, with mode 0644, to the path
// formed by appending the package-level extension to BaseDir.
//
// It returns a wrapped error when either the encoding or the write fails.
func (d *Dataset) save() error {
	encoded, marshalErr := json.Marshal(d)
	if marshalErr != nil {
		return fmt.Errorf("failed to marshal dataset: %w", marshalErr)
	}

	metaPath := d.BaseDir + extension
	writeErr := os.WriteFile(metaPath, encoded, 0644)
	if writeErr == nil {
		return nil
	}

	return fmt.Errorf("failed to write dataset file: %w", writeErr)
}
110 changes: 110 additions & 0 deletions pkg/dataset/dataset_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package dataset

import (
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/require"
)

// testWorkspace is the name of the fixture workspace directory; the tests
// join it onto the current working directory to locate the workspace.
const testWorkspace = "testworkspace"

// TestDatasetsRead checks that the two fixture datasets in the test workspace
// can be listed and loaded, and that each of their elements reads back with
// the expected contents.
func TestDatasetsRead(t *testing.T) {
	cwd, err := os.Getwd()
	require.NoError(t, err)

	manager, err := NewManager(filepath.Join(cwd, testWorkspace))
	require.NoError(t, err)

	metas, err := manager.ListDatasets()
	require.NoError(t, err)
	require.Len(t, metas, 2)

	type elementFixture struct {
		name     string
		contents string
	}

	// Each fixture dataset is looked up by the same string that is expected
	// as its name.
	fixtures := []struct {
		id          string
		description string
		elements    []elementFixture
	}{
		{
			id:          "one",
			description: "The first test dataset",
			elements: []elementFixture{
				{name: "file1", contents: "This is dataset 1, file 1.\n"},
				{name: "file2", contents: "This is dataset 1, file 2.\n"},
			},
		},
		{
			id:          "two",
			description: "The second test dataset",
			elements: []elementFixture{
				{name: "file1", contents: "This is dataset 2, file 1.\n"},
				{name: "file2", contents: "This is dataset 2, file 2.\n"},
			},
		},
	}

	for _, fixture := range fixtures {
		ds, err := manager.GetDataset(fixture.id)
		require.NoError(t, err)
		require.Equal(t, fixture.id, ds.GetName())
		require.Equal(t, fixture.description, ds.GetDescription())
		require.Equal(t, len(fixture.elements), ds.GetLength())

		require.Len(t, ds.ListElements(), len(fixture.elements))

		for _, want := range fixture.elements {
			got, _, err := ds.GetElement(want.name)
			require.NoError(t, err)
			require.Equal(t, want.contents, string(got))
		}
	}
}

// TestDatasetWrite creates a third dataset in the test workspace, adds two
// elements to it, reloads it through the manager and checks that the metadata
// and element contents round-trip. Everything it creates is removed again in
// a cleanup function.
func TestDatasetWrite(t *testing.T) {
	wd, err := os.Getwd()
	require.NoError(t, err)

	workspaceDir := filepath.Join(wd, testWorkspace)
	m, err := NewManager(workspaceDir)
	require.NoError(t, err)

	t.Cleanup(func() {
		// os.RemoveAll deletes the directory with everything in it and does
		// not treat a missing path as an error, so this is safe even when the
		// test failed before creating anything. Failures are reported rather
		// than ignored: a leftover third dataset would break the dataset
		// count asserted by TestDatasetsRead on the next run.
		leftovers := []string{
			filepath.Join(workspaceDir, "datasets", "three"),
			filepath.Join(workspaceDir, "datasets", "three.dataset.json"),
		}
		for _, leftover := range leftovers {
			if err := os.RemoveAll(leftover); err != nil {
				t.Errorf("failed to clean up %s: %v", leftover, err)
			}
		}
	})

	datasetThree, err := m.NewDataset("three", "The third test dataset")
	require.NoError(t, err)
	require.Equal(t, "three", datasetThree.GetName())
	require.Equal(t, "The third test dataset", datasetThree.GetDescription())
	require.Equal(t, 0, datasetThree.GetLength())

	// Let's add a couple elements.
	_, err = datasetThree.AddElement("file1", "The first file", []byte("This is dataset 3, file 1.\n"))
	require.NoError(t, err)
	require.Equal(t, 1, datasetThree.GetLength())

	_, err = datasetThree.AddElement("file2", "The second file", []byte("This is dataset 3, file 2.\n"))
	require.NoError(t, err)
	require.Equal(t, 2, datasetThree.GetLength())

	// Let's read it back.
	datasetThree, err = m.GetDataset(datasetThree.GetID())
	require.NoError(t, err)
	require.Equal(t, "three", datasetThree.GetName())
	require.Equal(t, "The third test dataset", datasetThree.GetDescription())

	threeMetas := datasetThree.ListElements()
	require.Len(t, threeMetas, 2)

	threeOneBytes, _, err := datasetThree.GetElement("file1")
	require.NoError(t, err)
	require.Equal(t, "This is dataset 3, file 1.\n", string(threeOneBytes))

	threeTwoBytes, _, err := datasetThree.GetElement("file2")
	require.NoError(t, err)
	require.Equal(t, "This is dataset 3, file 2.\n", string(threeTwoBytes))
}
Loading