Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Let notebook detection code use underlying metadata if available #1574

Merged
merged 5 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 66 additions & 10 deletions libs/notebook/detect.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,41 +12,97 @@ import (
"github.com/databricks/databricks-sdk-go/service/workspace"
)

// FileInfoWithWorkspaceObjectInfo is satisfied by [fs.FileInfo] values that
// also carry the [workspace.ObjectInfo] of the file they describe.
//
// A [filer.Filer] backed by the workspace API may return such values.
// For those files there is no need to read the file's header to find out
// whether it is a notebook; the [workspace.ObjectInfo] can be used directly.
type FileInfoWithWorkspaceObjectInfo interface {
	// WorkspaceObjectInfo returns the file's underlying [workspace.ObjectInfo].
	WorkspaceObjectInfo() workspace.ObjectInfo
}

// headerLength is the maximum number of bytes read from the start of a file
// when looking for the notebook header.
const headerLength = 32

// file wraps an fs.File and implements a few helper methods such that
// they don't need to be inlined in the [DetectWithFS] function below.
type file struct {
	f fs.File
}

// openFile opens name in fsys and wraps the resulting handle.
//
// The handle is deliberately left open: ownership passes to the caller,
// who must call close once done with it. It returns a nil *file together
// with the error from fsys.Open if the file cannot be opened.
func openFile(fsys fs.FS, name string) (*file, error) {
	f, err := fsys.Open(name)
	if err != nil {
		return nil, err
	}

	return &file{f: f}, nil
}

// close closes the underlying file and returns the error from doing so.
func (f file) close() error {
	return f.f.Close()
}

// readHeader returns the first line of the file, looking at no more than
// the first headerLength bytes. A longer first line is truncated to
// headerLength bytes and an empty file yields the empty string.
//
// It returns an error only if reading from the underlying file fails for
// a reason other than reaching the end of the file.
func (f file) readHeader() (string, error) {
	// Scan header line with some padding.
	buf := make([]byte, headerLength)

	// A single Read may return fewer bytes than requested even when more
	// are available, so keep reading until the buffer is full or the file
	// ends. Running out of data early is expected for short files.
	n, err := io.ReadFull(f.f, buf)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}

	// Read the first line from the bytes that were actually read.
	scanner := bufio.NewScanner(bytes.NewReader(buf[:n]))
	scanner.Scan()
	return scanner.Text(), nil
}

// getObjectInfo stats the file and reports the [workspace.ObjectInfo]
// attached to the resulting [fs.FileInfo], if there is one.
//
// ok is true only when the [fs.FileInfo] implements
// [FileInfoWithWorkspaceObjectInfo]; otherwise oi is the zero value.
// A failing Stat call is returned as err.
func (f file) getObjectInfo() (oi workspace.ObjectInfo, ok bool, err error) {
	info, statErr := f.f.Stat()
	if statErr != nil {
		return workspace.ObjectInfo{}, false, statErr
	}

	// Without the extra interface there is no object info to report.
	withObjectInfo, supported := info.(FileInfoWithWorkspaceObjectInfo)
	if !supported {
		return workspace.ObjectInfo{}, false, nil
	}

	return withObjectInfo.WorkspaceObjectInfo(), true, nil
}

// DetectWithFS returns whether the file called name in fsys is a Databricks notebook.
// If it is, it also returns the notebook language.
func DetectWithFS(fsys fs.FS, name string) (notebook bool, language workspace.Language, err error) {
header := ""

buf, err := readHeader(fsys, name)
f, err := openFile(fsys, name)
if err != nil {
return false, "", err
}

defer f.close()

// Use object info if available.
oi, ok, err := f.getObjectInfo()
if err != nil {
return false, "", err
}
if ok {
return oi.ObjectType == workspace.ObjectTypeNotebook, oi.Language, nil
}

// Read the first line of the file.
fileHeader, err := f.readHeader()
if err != nil {
return false, "", err
}
scanner := bufio.NewScanner(bytes.NewReader(buf))
scanner.Scan()
fileHeader := scanner.Text()

// Determine which header to expect based on filename extension.
ext := strings.ToLower(filepath.Ext(name))
Expand Down
18 changes: 18 additions & 0 deletions libs/notebook/detect_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,21 @@ func TestDetectFileWithLongHeader(t *testing.T) {
require.NoError(t, err)
assert.False(t, nb)
}

func TestDetectWithObjectInfo(t *testing.T) {
fakeFS := &fakeFS{
fakeFile{
fakeFileInfo{
workspace.ObjectInfo{
ObjectType: workspace.ObjectTypeNotebook,
Language: workspace.LanguagePython,
},
},
},
}

nb, lang, err := DetectWithFS(fakeFS, "doesntmatter")
require.NoError(t, err)
assert.True(t, nb)
assert.Equal(t, workspace.LanguagePython, lang)
}
77 changes: 77 additions & 0 deletions libs/notebook/fakefs_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package notebook

import (
"fmt"
"io/fs"
"time"

"github.com/databricks/databricks-sdk-go/service/workspace"
)

// fakeFS is a file system test double. It embeds the one fakeFile that its
// Open method returns for every name.
type fakeFS struct {
	fakeFile
}

// fakeFile is a minimal [fs.File] test double. It has no contents; the only
// useful thing it does is report its embedded fakeFileInfo through Stat.
type fakeFile struct {
	fakeFileInfo
}

// Close is a no-op that always succeeds.
func (fakeFile) Close() error {
	return nil
}

// Read always fails because the fake has no contents to read.
func (fakeFile) Read(p []byte) (n int, err error) {
	return 0, fmt.Errorf("not implemented")
}

// Stat returns the embedded fakeFileInfo and never fails.
func (ff fakeFile) Stat() (fs.FileInfo, error) {
	return ff.fakeFileInfo, nil
}

// fakeFileInfo is an [fs.FileInfo] test double that additionally exposes a
// [workspace.ObjectInfo] through WorkspaceObjectInfo. Every [fs.FileInfo]
// method returns the zero value of its result type.
type fakeFileInfo struct {
	oi workspace.ObjectInfo
}

// WorkspaceObjectInfo returns the object info the fake was created with.
func (info fakeFileInfo) WorkspaceObjectInfo() workspace.ObjectInfo {
	return info.oi
}

// Name returns the empty string.
func (fakeFileInfo) Name() string {
	return ""
}

// Size returns 0.
func (fakeFileInfo) Size() int64 {
	return 0
}

// Mode returns a zero file mode.
func (fakeFileInfo) Mode() fs.FileMode {
	return fs.FileMode(0)
}

// ModTime returns the zero time.
func (fakeFileInfo) ModTime() time.Time {
	var zero time.Time
	return zero
}

// IsDir returns false.
func (fakeFileInfo) IsDir() bool {
	return false
}

// Sys returns nil.
func (fakeFileInfo) Sys() any {
	return nil
}

// Open ignores name and always returns the embedded fakeFile.
func (fsys fakeFS) Open(name string) (fs.File, error) {
	return fsys.fakeFile, nil
}

// Stat is not implemented; it panics when called.
func (fakeFS) Stat(name string) (fs.FileInfo, error) {
	panic("not implemented")
}

// ReadDir is not implemented; it panics when called.
func (fakeFS) ReadDir(name string) ([]fs.DirEntry, error) {
	panic("not implemented")
}

// ReadFile is not implemented; it panics when called.
func (fakeFS) ReadFile(name string) ([]byte, error) {
	panic("not implemented")
}
Loading