Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exporter: command-line option to control output format for notebooks #2569

Merged
merged 1 commit into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/guides/experimental-exporter.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ All arguments are optional and they tune what code is being generated.
* `-importAllUsers` - optionally include all users and service principals even if they are only part of the `users` group.
* `-incremental` - experimental option for incremental export of modified resources and merging with existing resources. *Please note that only a limited set of resources (notebooks, SQL queries/dashboards/alerts, ...) provides information about the last modified date - all other resources will be re-exported again! Also, it's not possible to detect deletion of resources, so you will need to do a periodic full export if resources are deleted!* **Requires** the `-updated-since` option if no `exporter-run-stats.json` file exists in the output directory.
* `-updated-since` - timestamp (in ISO8601 format supported by the Go language) for exporting resources modified since a given timestamp. E.g., `2023-07-24T00:00:00Z`. If not specified, the exporter will try to load the last run timestamp from the `exporter-run-stats.json` file generated during the export, and use it.
* `-notebooksFormat` - optional format for exporting notebooks. Supported values are `SOURCE` (default), `DBC`, `JUPYTER`. This can be used to export notebooks with embedded dashboards.

## Services

Expand Down
2 changes: 2 additions & 0 deletions exporter/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ func Run(args ...string) error {
flags.BoolVar(&ic.mounts, "mounts", false, "List DBFS mount points.")
flags.BoolVar(&ic.generateDeclaration, "generateProviderDeclaration", true,
"Generate Databricks provider declaration.")
flags.StringVar(&ic.notebooksFormat, "notebooksFormat", "SOURCE",
"Format to export notebooks: SOURCE, DBC, JUPYTER. Default: SOURCE")
services, listing := ic.allServicesAndListing()
flags.StringVar(&ic.services, "services", services,
"Comma-separated list of services to import. By default all services are imported.")
Expand Down
8 changes: 8 additions & 0 deletions exporter/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ type importContext struct {
prefix string
accountLevel bool
shImports map[string]bool
notebooksFormat string
}

type mount struct {
Expand Down Expand Up @@ -153,6 +154,7 @@ func newImportContext(c *common.DatabricksClient) *importContext {
allWorkspaceObjects: []workspace.ObjectStatus{},
workspaceConfKeys: workspaceConfKeys,
shImports: make(map[string]bool),
notebooksFormat: "SOURCE",
}
}

Expand Down Expand Up @@ -201,6 +203,12 @@ func (ic *importContext) Run() error {
log.Printf("[INFO] Importing %s module into %s directory Databricks resources of %s services",
ic.Module, ic.Directory, ic.services)

ic.notebooksFormat = strings.ToUpper(ic.notebooksFormat)
_, supportedFormat := fileExtensionFormatMapping[ic.notebooksFormat]
if !supportedFormat && ic.notebooksFormat != "SOURCE" {
return fmt.Errorf("unsupported notebook format: '%s'", ic.notebooksFormat)
}

info, err := os.Stat(ic.Directory)
if os.IsNotExist(err) {
err = os.MkdirAll(ic.Directory, 0755)
Expand Down
67 changes: 40 additions & 27 deletions exporter/importables.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,33 @@ import (
)

var (
adlsGen2Regex = regexp.MustCompile(`^(abfss?)://([^@]+)@([^.]+)\.(?:[^/]+)(/.*)?$`)
adlsGen1Regex = regexp.MustCompile(`^(adls?)://([^.]+)\.(?:[^/]+)(/.*)?$`)
wasbsRegex = regexp.MustCompile(`^(wasbs?)://([^@]+)@([^.]+)\.(?:[^/]+)(/.*)?$`)
s3Regex = regexp.MustCompile(`^(s3a?)://([^/]+)(/.*)?$`)
gsRegex = regexp.MustCompile(`^gs://([^/]+)(/.*)?$`)
globalWorkspaceConfName = "global_workspace_conf"
nameNormalizationRegex = regexp.MustCompile(`\W+`)
jobClustersRegex = regexp.MustCompile(`^((job_cluster|task)\.[0-9]+\.new_cluster\.[0-9]+\.)`)
dltClusterRegex = regexp.MustCompile(`^(cluster\.[0-9]+\.)`)
uuidRegex = regexp.MustCompile(`^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
predefinedClusterPolicies = []string{"Personal Compute", "Job Compute", "Power User Compute", "Shared Compute"}
secretPathRegex = regexp.MustCompile(`^\{\{secrets\/([^\/]+)\/([^}]+)\}\}$`)
sqlParentRegexp = regexp.MustCompile(`^folders/(\d+)$`)
dltDefaultStorageRegex = regexp.MustCompile(`^dbfs:/pipelines/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
ignoreIdeFolderRegex = regexp.MustCompile(`^/Users/[^/]+/\.ide/.*$`)
adlsGen2Regex = regexp.MustCompile(`^(abfss?)://([^@]+)@([^.]+)\.(?:[^/]+)(/.*)?$`)
adlsGen1Regex = regexp.MustCompile(`^(adls?)://([^.]+)\.(?:[^/]+)(/.*)?$`)
wasbsRegex = regexp.MustCompile(`^(wasbs?)://([^@]+)@([^.]+)\.(?:[^/]+)(/.*)?$`)
s3Regex = regexp.MustCompile(`^(s3a?)://([^/]+)(/.*)?$`)
gsRegex = regexp.MustCompile(`^gs://([^/]+)(/.*)?$`)
globalWorkspaceConfName = "global_workspace_conf"
nameNormalizationRegex = regexp.MustCompile(`\W+`)
jobClustersRegex = regexp.MustCompile(`^((job_cluster|task)\.[0-9]+\.new_cluster\.[0-9]+\.)`)
dltClusterRegex = regexp.MustCompile(`^(cluster\.[0-9]+\.)`)
uuidRegex = regexp.MustCompile(`^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
predefinedClusterPolicies = []string{"Personal Compute", "Job Compute", "Power User Compute", "Shared Compute"}
secretPathRegex = regexp.MustCompile(`^\{\{secrets\/([^\/]+)\/([^}]+)\}\}$`)
sqlParentRegexp = regexp.MustCompile(`^folders/(\d+)$`)
dltDefaultStorageRegex = regexp.MustCompile(`^dbfs:/pipelines/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
ignoreIdeFolderRegex = regexp.MustCompile(`^/Users/[^/]+/\.ide/.*$`)
// fileExtensionLanguageMapping maps a notebook language to the file extension
// appended to the exported file name when notebooks are exported in SOURCE
// format. A language that is not listed here yields an empty extension.
fileExtensionLanguageMapping = map[string]string{
"SCALA": ".scala",
"PYTHON": ".py",
"SQL": ".sql",
"R": ".r",
}
// fileExtensionFormatMapping maps a non-SOURCE notebook export format to the
// file extension appended to the exported file name. Its keys are also the
// formats that the notebooksFormat validation accepts in addition to SOURCE.
// NOTE(review): HTML and R_MARKDOWN pass that validation through this map, but
// the flag help text and the documentation list only SOURCE, DBC and JUPYTER -
// confirm whether accepting them is intended.
fileExtensionFormatMapping = map[string]string{
"HTML": ".html",
"JUPYTER": ".ipynb",
"DBC": ".dbc",
"R_MARKDOWN": ".Rmd",
}
)

func generateMountBody(ic *importContext, body *hclwrite.Body, r *resource) error {
Expand Down Expand Up @@ -1215,18 +1227,20 @@ var resourcesMap map[string]importable = map[string]importable{
Import: func(ic *importContext, r *resource) error {
ic.emitUserOrServicePrincipalForPath(r.ID, "/Users")
notebooksAPI := workspace.NewNotebooksAPI(ic.Context, ic.Client)
contentB64, err := notebooksAPI.Export(r.ID, "SOURCE")
contentB64, err := notebooksAPI.Export(r.ID, ic.notebooksFormat)
if err != nil {
return err
}
language := r.Data.Get("language").(string)
ext := map[string]string{
"SCALA": ".scala",
"PYTHON": ".py",
"SQL": ".sql",
"R": ".r",
var fileExtension string
if ic.notebooksFormat == "SOURCE" {
language := r.Data.Get("language").(string)
fileExtension = fileExtensionLanguageMapping[language]
r.Data.Set("language", "")
} else {
fileExtension = fileExtensionFormatMapping[ic.notebooksFormat]
}
name := r.ID[1:] + ext[language] // todo: replace non-alphanum+/ with _
r.Data.Set("format", ic.notebooksFormat)
name := r.ID[1:] + fileExtension // todo: replace non-alphanum+/ with _
content, _ := base64.StdEncoding.DecodeString(contentB64)
fileName, err := ic.createFileIn("notebooks", name, []byte(content))
if err != nil {
Expand All @@ -1240,7 +1254,8 @@ var resourcesMap map[string]importable = map[string]importable{
})
}

// TODO: it's not completely correct condition - we need to make emit smarter - emit only if permissions are different from their parent's permission.
// TODO: it's not completely correct condition - we need to make emit smarter -
// emit only if permissions are different from their parent's permission.
if ic.meAdmin {
directorySplits := strings.Split(r.ID, "/")
directorySplits = directorySplits[:len(directorySplits)-1]
Expand All @@ -1252,9 +1267,7 @@ var resourcesMap map[string]importable = map[string]importable{
})
}

log.Printf("Creating %s for %s", fileName, r)
r.Data.Set("source", fileName)
return r.Data.Set("language", "")
return r.Data.Set("source", fileName)
},
Depends: []reference{
{Path: "source", File: true},
Expand Down
52 changes: 51 additions & 1 deletion exporter/importables_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -803,9 +803,9 @@ func TestNotebookGeneration(t *testing.T) {
},
},
}, "notebooks", func(ic *importContext) {
ic.notebooksFormat = "SOURCE"
err := resourcesMap["databricks_notebook"].List(ic)
assert.NoError(t, err)

ic.generateHclForResources(nil)
assert.Equal(t, commands.TrimLeadingWhitespace(`
resource "databricks_notebook" "first_second_123" {
Expand All @@ -815,6 +815,56 @@ func TestNotebookGeneration(t *testing.T) {
})
}

// TestNotebookGenerationJupyter checks notebook export when notebooksFormat is
// set to "JUPYTER": the export API must be called with format=JUPYTER, the
// generated source path must end in .ipynb, and the generated HCL must carry
// both the language and the format attributes.
func TestNotebookGenerationJupyter(t *testing.T) {
testGenerate(t, []qa.HTTPFixture{
{
// Workspace listing returns two notebooks. Only /First/Second has
// further fixtures and appears in the expected HCL below;
// /Repos/Foo/Bar is presumably skipped by the lister - verify.
Method: "GET",
Resource: "/api/2.0/workspace/list?path=%2F",
Response: workspace.ObjectList{
Objects: []workspace.ObjectStatus{
{
Path: "/Repos/Foo/Bar",
ObjectType: "NOTEBOOK",
},
{
Path: "/First/Second",
ObjectType: "NOTEBOOK",
},
},
},
},
{
// Status lookup supplies the object ID (used in the resource name
// suffix "_123") and the notebook language.
Method: "GET",
Resource: "/api/2.0/workspace/get-status?path=%2FFirst%2FSecond",
Response: workspace.ObjectStatus{
ObjectID: 123,
ObjectType: "NOTEBOOK",
Path: "/First/Second",
Language: "PYTHON",
},
},
{
// The export request must carry format=JUPYTER rather than SOURCE.
// "YWJj" is the base64 encoding of "abc".
Method: "GET",
Resource: "/api/2.0/workspace/export?format=JUPYTER&path=%2FFirst%2FSecond",
Response: workspace.ExportPath{
Content: "YWJj",
},
},
}, "notebooks", func(ic *importContext) {
// Select the non-default export format before listing notebooks.
ic.notebooksFormat = "JUPYTER"
err := resourcesMap["databricks_notebook"].List(ic)
assert.NoError(t, err)
ic.generateHclForResources(nil)
// Unlike the SOURCE case, the expected HCL keeps the language attribute
// and adds the format attribute.
assert.Equal(t, commands.TrimLeadingWhitespace(`
resource "databricks_notebook" "first_second_123" {
source = "${path.module}/notebooks/First/Second.ipynb"
path = "/First/Second"
language = "PYTHON"
format = "JUPYTER"
}`), string(ic.Files["notebooks"].Bytes()))
})
}

func TestDirectoryGeneration(t *testing.T) {
testGenerate(t, []qa.HTTPFixture{
{
Expand Down
3 changes: 3 additions & 0 deletions exporter/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,9 @@ func createListWorkspaceObjectsFunc(objType string, resourceType string, objName
modifiedAt, updatedSinceMs)
continue
}
if !ic.MatchesName(object.Path) {
continue
}
Comment on lines +633 to +635
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's this change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We didn't support -match for notebooks/workspace/directories, so I added it... But it will be superseded by #2574

ic.Emit(&resource{
Resource: resourceType,
ID: object.Path,
Expand Down
Loading