Exporter: command-line option to control output format for notebooks (#2569)

The new command-line option `-notebooksFormat` allows exporting notebooks in DBC and IPython (Jupyter) formats.

This fixes #2568
alexott committed Aug 24, 2023
1 parent 4ed877c commit 337b1e8
Showing 6 changed files with 105 additions and 28 deletions.
1 change: 1 addition & 0 deletions docs/guides/experimental-exporter.md
@@ -47,6 +47,7 @@ All arguments are optional and they tune what code is being generated.
* `-importAllUsers` - optionally include all users and service principals, even if they are only members of the `users` group.
* `-incremental` - experimental option for incremental export of modified resources and merging with existing resources. *Please note that only a limited set of resources (notebooks, SQL queries/dashboards/alerts, ...) provides information about the last modified date - all other resources will be re-exported! Also, it's not possible to detect deletion of resources, so you will need to do a periodic full export if resources are deleted!* **Requires** the `-updated-since` option if no `exporter-run-stats.json` file exists in the output directory.
* `-updated-since` - timestamp (in the ISO8601 format supported by the Go language) for exporting resources modified since the given timestamp, i.e. `2023-07-24T00:00:00Z`. If not specified, the exporter will try to load the last run timestamp from the `exporter-run-stats.json` file generated during a previous export, and use it.
* `-notebooksFormat` - optional format for exporting notebooks. Supported values are `SOURCE` (default), `DBC`, and `JUPYTER`. This can be used, for example, to export notebooks with embedded dashboards.

## Services

2 changes: 2 additions & 0 deletions exporter/command.go
@@ -110,6 +110,8 @@ func Run(args ...string) error {
flags.BoolVar(&ic.mounts, "mounts", false, "List DBFS mount points.")
flags.BoolVar(&ic.generateDeclaration, "generateProviderDeclaration", true,
"Generate Databricks provider declaration.")
flags.StringVar(&ic.notebooksFormat, "notebooksFormat", "SOURCE",
"Format to export notebooks: SOURCE, DBC, JUPYTER. Default: SOURCE")
services, listing := ic.allServicesAndListing()
flags.StringVar(&ic.services, "services", services,
"Comma-separated list of services to import. By default all services are imported.")
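For orientation, here is a minimal, self-contained sketch of how a string option such as `-notebooksFormat` is registered with Go's standard `flag` package. The flag-set name and the `main` wrapper are illustrative only; just the flag name, default, and help text mirror the diff above.

```go
package main

import (
	"flag"
	"fmt"
)

func main() {
	// Hypothetical stand-in for the exporter's flag registration.
	flags := flag.NewFlagSet("exporter", flag.ExitOnError)
	var notebooksFormat string
	flags.StringVar(&notebooksFormat, "notebooksFormat", "SOURCE",
		"Format to export notebooks: SOURCE, DBC, JUPYTER. Default: SOURCE")

	// e.g. exporter -notebooksFormat JUPYTER
	_ = flags.Parse([]string{"-notebooksFormat", "JUPYTER"})
	fmt.Println("notebooks will be exported as", notebooksFormat)
}
```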
8 changes: 8 additions & 0 deletions exporter/context.go
@@ -91,6 +91,7 @@ type importContext struct {
prefix string
accountLevel bool
shImports map[string]bool
notebooksFormat string
}

type mount struct {
@@ -153,6 +154,7 @@ func newImportContext(c *common.DatabricksClient) *importContext {
allWorkspaceObjects: []workspace.ObjectStatus{},
workspaceConfKeys: workspaceConfKeys,
shImports: make(map[string]bool),
notebooksFormat: "SOURCE",
}
}

@@ -201,6 +203,12 @@ func (ic *importContext) Run() error {
log.Printf("[INFO] Importing %s module into %s directory Databricks resources of %s services",
ic.Module, ic.Directory, ic.services)

ic.notebooksFormat = strings.ToUpper(ic.notebooksFormat)
_, supportedFormat := fileExtensionFormatMapping[ic.notebooksFormat]
if !supportedFormat && ic.notebooksFormat != "SOURCE" {
return fmt.Errorf("unsupported notebook format: '%s'", ic.notebooksFormat)
}

info, err := os.Stat(ic.Directory)
if os.IsNotExist(err) {
err = os.MkdirAll(ic.Directory, 0755)
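The validation added to `Run` above follows a common pattern: normalize the user-supplied value to upper case, then reject anything that is neither `SOURCE` nor a key of the format-to-extension map. A standalone sketch of that pattern, with an abbreviated map and a hypothetical helper name:

```go
package main

import (
	"fmt"
	"strings"
)

// Abbreviated stand-in for fileExtensionFormatMapping (see importables.go below).
var supportedFormats = map[string]string{
	"DBC":     ".dbc",
	"JUPYTER": ".ipynb",
}

// validateNotebooksFormat is a hypothetical helper mirroring the check in Run().
func validateNotebooksFormat(format string) (string, error) {
	format = strings.ToUpper(format)
	if _, ok := supportedFormats[format]; !ok && format != "SOURCE" {
		return "", fmt.Errorf("unsupported notebook format: '%s'", format)
	}
	return format, nil
}

func main() {
	for _, f := range []string{"source", "Jupyter", "html5"} {
		if v, err := validateNotebooksFormat(f); err != nil {
			fmt.Println(err)
		} else {
			fmt.Println("using format", v)
		}
	}
}
```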
67 changes: 40 additions & 27 deletions exporter/importables.go
@@ -33,21 +33,33 @@ import (
)

var (
adlsGen2Regex = regexp.MustCompile(`^(abfss?)://([^@]+)@([^.]+)\.(?:[^/]+)(/.*)?$`)
adlsGen1Regex = regexp.MustCompile(`^(adls?)://([^.]+)\.(?:[^/]+)(/.*)?$`)
wasbsRegex = regexp.MustCompile(`^(wasbs?)://([^@]+)@([^.]+)\.(?:[^/]+)(/.*)?$`)
s3Regex = regexp.MustCompile(`^(s3a?)://([^/]+)(/.*)?$`)
gsRegex = regexp.MustCompile(`^gs://([^/]+)(/.*)?$`)
globalWorkspaceConfName = "global_workspace_conf"
nameNormalizationRegex = regexp.MustCompile(`\W+`)
jobClustersRegex = regexp.MustCompile(`^((job_cluster|task)\.[0-9]+\.new_cluster\.[0-9]+\.)`)
dltClusterRegex = regexp.MustCompile(`^(cluster\.[0-9]+\.)`)
uuidRegex = regexp.MustCompile(`^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
predefinedClusterPolicies = []string{"Personal Compute", "Job Compute", "Power User Compute", "Shared Compute"}
secretPathRegex = regexp.MustCompile(`^\{\{secrets\/([^\/]+)\/([^}]+)\}\}$`)
sqlParentRegexp = regexp.MustCompile(`^folders/(\d+)$`)
dltDefaultStorageRegex = regexp.MustCompile(`^dbfs:/pipelines/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
ignoreIdeFolderRegex = regexp.MustCompile(`^/Users/[^/]+/\.ide/.*$`)
adlsGen2Regex = regexp.MustCompile(`^(abfss?)://([^@]+)@([^.]+)\.(?:[^/]+)(/.*)?$`)
adlsGen1Regex = regexp.MustCompile(`^(adls?)://([^.]+)\.(?:[^/]+)(/.*)?$`)
wasbsRegex = regexp.MustCompile(`^(wasbs?)://([^@]+)@([^.]+)\.(?:[^/]+)(/.*)?$`)
s3Regex = regexp.MustCompile(`^(s3a?)://([^/]+)(/.*)?$`)
gsRegex = regexp.MustCompile(`^gs://([^/]+)(/.*)?$`)
globalWorkspaceConfName = "global_workspace_conf"
nameNormalizationRegex = regexp.MustCompile(`\W+`)
jobClustersRegex = regexp.MustCompile(`^((job_cluster|task)\.[0-9]+\.new_cluster\.[0-9]+\.)`)
dltClusterRegex = regexp.MustCompile(`^(cluster\.[0-9]+\.)`)
uuidRegex = regexp.MustCompile(`^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
predefinedClusterPolicies = []string{"Personal Compute", "Job Compute", "Power User Compute", "Shared Compute"}
secretPathRegex = regexp.MustCompile(`^\{\{secrets\/([^\/]+)\/([^}]+)\}\}$`)
sqlParentRegexp = regexp.MustCompile(`^folders/(\d+)$`)
dltDefaultStorageRegex = regexp.MustCompile(`^dbfs:/pipelines/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
ignoreIdeFolderRegex = regexp.MustCompile(`^/Users/[^/]+/\.ide/.*$`)
fileExtensionLanguageMapping = map[string]string{
"SCALA": ".scala",
"PYTHON": ".py",
"SQL": ".sql",
"R": ".r",
}
fileExtensionFormatMapping = map[string]string{
"HTML": ".html",
"JUPYTER": ".ipynb",
"DBC": ".dbc",
"R_MARKDOWN": ".Rmd",
}
)

func generateMountBody(ic *importContext, body *hclwrite.Body, r *resource) error {
@@ -1215,18 +1227,20 @@ var resourcesMap map[string]importable = map[string]importable{
Import: func(ic *importContext, r *resource) error {
ic.emitUserOrServicePrincipalForPath(r.ID, "/Users")
notebooksAPI := workspace.NewNotebooksAPI(ic.Context, ic.Client)
contentB64, err := notebooksAPI.Export(r.ID, "SOURCE")
contentB64, err := notebooksAPI.Export(r.ID, ic.notebooksFormat)
if err != nil {
return err
}
language := r.Data.Get("language").(string)
ext := map[string]string{
"SCALA": ".scala",
"PYTHON": ".py",
"SQL": ".sql",
"R": ".r",
var fileExtension string
if ic.notebooksFormat == "SOURCE" {
language := r.Data.Get("language").(string)
fileExtension = fileExtensionLanguageMapping[language]
r.Data.Set("language", "")
} else {
fileExtension = fileExtensionFormatMapping[ic.notebooksFormat]
}
name := r.ID[1:] + ext[language] // todo: replace non-alphanum+/ with _
r.Data.Set("format", ic.notebooksFormat)
name := r.ID[1:] + fileExtension // todo: replace non-alphanum+/ with _
content, _ := base64.StdEncoding.DecodeString(contentB64)
fileName, err := ic.createFileIn("notebooks", name, []byte(content))
if err != nil {
@@ -1240,7 +1254,8 @@ var resourcesMap map[string]importable = map[string]importable{
})
}

// TODO: it's not completely correct condition - we need to make emit smarter - emit only if permissions are different from their parent's permission.
// TODO: it's not completely correct condition - we need to make emit smarter -
// emit only if permissions are different from their parent's permission.
if ic.meAdmin {
directorySplits := strings.Split(r.ID, "/")
directorySplits = directorySplits[:len(directorySplits)-1]
@@ -1252,9 +1267,7 @@ var resourcesMap map[string]importable = map[string]importable{
})
}

log.Printf("Creating %s for %s", fileName, r)
r.Data.Set("source", fileName)
return r.Data.Set("language", "")
return r.Data.Set("source", fileName)
},
Depends: []reference{
{Path: "source", File: true},
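The net effect of the `Import` changes is that the exported file's extension now depends on the chosen format: `SOURCE` keeps the per-language extension, while every other format uses its own extension and is recorded in the resource's `format` attribute. A compact sketch of just the file-naming step (the `notebookFileName` helper is hypothetical; the maps echo the ones defined earlier in this diff):

```go
package main

import "fmt"

var (
	fileExtensionLanguageMapping = map[string]string{
		"SCALA": ".scala", "PYTHON": ".py", "SQL": ".sql", "R": ".r",
	}
	fileExtensionFormatMapping = map[string]string{
		"HTML": ".html", "JUPYTER": ".ipynb", "DBC": ".dbc", "R_MARKDOWN": ".Rmd",
	}
)

// notebookFileName is an illustrative helper, not part of the exporter.
func notebookFileName(workspacePath, language, format string) string {
	ext := fileExtensionLanguageMapping[language]
	if format != "SOURCE" {
		ext = fileExtensionFormatMapping[format]
	}
	// Workspace paths are absolute ("/First/Second"); drop the leading slash
	// before appending the extension, as the Import function does.
	return workspacePath[1:] + ext
}

func main() {
	fmt.Println(notebookFileName("/First/Second", "PYTHON", "SOURCE"))  // First/Second.py
	fmt.Println(notebookFileName("/First/Second", "PYTHON", "JUPYTER")) // First/Second.ipynb
}
```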
52 changes: 51 additions & 1 deletion exporter/importables_test.go
@@ -803,9 +803,9 @@ func TestNotebookGeneration(t *testing.T) {
},
},
}, "notebooks", func(ic *importContext) {
ic.notebooksFormat = "SOURCE"
err := resourcesMap["databricks_notebook"].List(ic)
assert.NoError(t, err)

ic.generateHclForResources(nil)
assert.Equal(t, commands.TrimLeadingWhitespace(`
resource "databricks_notebook" "first_second_123" {
@@ -815,6 +815,56 @@
})
}

func TestNotebookGenerationJupyter(t *testing.T) {
testGenerate(t, []qa.HTTPFixture{
{
Method: "GET",
Resource: "/api/2.0/workspace/list?path=%2F",
Response: workspace.ObjectList{
Objects: []workspace.ObjectStatus{
{
Path: "/Repos/Foo/Bar",
ObjectType: "NOTEBOOK",
},
{
Path: "/First/Second",
ObjectType: "NOTEBOOK",
},
},
},
},
{
Method: "GET",
Resource: "/api/2.0/workspace/get-status?path=%2FFirst%2FSecond",
Response: workspace.ObjectStatus{
ObjectID: 123,
ObjectType: "NOTEBOOK",
Path: "/First/Second",
Language: "PYTHON",
},
},
{
Method: "GET",
Resource: "/api/2.0/workspace/export?format=JUPYTER&path=%2FFirst%2FSecond",
Response: workspace.ExportPath{
Content: "YWJj",
},
},
}, "notebooks", func(ic *importContext) {
ic.notebooksFormat = "JUPYTER"
err := resourcesMap["databricks_notebook"].List(ic)
assert.NoError(t, err)
ic.generateHclForResources(nil)
assert.Equal(t, commands.TrimLeadingWhitespace(`
resource "databricks_notebook" "first_second_123" {
source = "${path.module}/notebooks/First/Second.ipynb"
path = "/First/Second"
language = "PYTHON"
format = "JUPYTER"
}`), string(ic.Files["notebooks"].Bytes()))
})
}

func TestDirectoryGeneration(t *testing.T) {
testGenerate(t, []qa.HTTPFixture{
{
3 changes: 3 additions & 0 deletions exporter/util.go
@@ -630,6 +630,9 @@ func createListWorkspaceObjectsFunc(objType string, resourceType string, objName
modifiedAt, updatedSinceMs)
continue
}
if !ic.MatchesName(object.Path) {
continue
}
ic.Emit(&resource{
Resource: resourceType,
ID: object.Path,
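The `util.go` change adds a name filter before workspace objects are emitted. A rough sketch of the idea, assuming `MatchesName` behaves like a case-insensitive substring match (the `matchesName` helper below is only an illustration, not the exporter's implementation):

```go
package main

import (
	"fmt"
	"strings"
)

// matchesName is an illustrative stand-in for ic.MatchesName: an empty
// filter matches everything, otherwise match case-insensitively.
func matchesName(filter, name string) bool {
	if filter == "" {
		return true
	}
	return strings.Contains(strings.ToLower(name), strings.ToLower(filter))
}

func main() {
	filter := "first"
	for _, path := range []string{"/First/Second", "/Repos/Foo/Bar"} {
		if !matchesName(filter, path) {
			continue // skipped, never emitted
		}
		fmt.Println("emitting notebook at", path)
	}
}
```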
