# [Integration of lakeFS with Unity Catalog](https://docs.lakefs.io/integrations/unity-catalog.html)

## Use Case: Isolated Development and Testing Environment

# Setup

#### Create lakeFS Python client

In [0]:
import lakefs
from lakefs.client import Client

# Connection settings for the lakeFS server. Replace the placeholders before
# running; the access/secret key pair is also passed to the hook arguments in
# the action definition further down in this notebook.
lakefsEndPoint = '<lakeFS Endpoint URL>'
lakefsAccessKey = '<lakeFS Access Key>'
lakefsSecretKey = '<lakeFS Secret Key>'

# lakeFS client built from the settings above; it is handed to
# lakefs.Repository when the repository is created.
clt = Client(
   host=lakefsEndPoint,
   username=lakefsAccessKey,
   password=lakefsSecretKey,
)

#### Create lakeFS Repository

In [0]:
# Repository name; it is also used as the last path segment of the storage
# namespace and in the lakefs:// URIs written to by Spark later on.
repositoryName = "unity-catalog-demo"
# Replace <bucket-name> with a real bucket before running.
storageNamespace = 's3://<bucket-name>/' + repositoryName
# Default branch of the repository; the demo treats it as the source branch.
sourceBranch = "main"

# Create the repository through the client defined earlier.
# NOTE(review): exist_ok=True is presumably what lets this cell be re-run
# against an already existing repository - confirm against the SDK version in use.
repo = lakefs.Repository(
 repositoryName,
 client=clt).create(
   storage_namespace=storageNamespace,
   default_branch=sourceBranch,
   exist_ok=True)
# Handle to the source branch, used for uploads and the first commit.
branchMain = repo.branch(sourceBranch)

#### Let’s define the table descriptor and upload it to lakeFS

In [0]:
import yaml
# Delta table name and the Unity Catalog catalog it is registered under.
# Both names are reused by the hook arguments and the SQL queries below.
table_name = "famous_people"
unity_catalog_name = 'lakefs_unity_catalog_demo'

# Descriptor of the table: its name, format, location inside the repository
# and target catalog.
table_descriptor = dict(
   name=table_name,
   type='delta',
   path=f'tables/{table_name}',
   catalog=unity_catalog_name,
)

# Serialize the descriptor as YAML under _lakefs_tables/ on the source branch;
# the Lua exporter script is given the same "_lakefs_tables" prefix.
descriptor_path = f'_lakefs_tables/{table_name}.yaml'
with branchMain.object(path=descriptor_path).writer() as out:
   yaml.safe_dump(table_descriptor, out)

#### Upload the Unity Catalog exporter script to lakeFS

In [0]:
# Path inside the repository where the Lua hook script is stored; the action
# definition refers to it through 'script_path'.
luaScriptName = "scripts/unity_export.lua"

# Source of the Lua hook. It exports the Delta log of the tables listed in
# args.table_defs and then registers the exported tables in Unity Catalog,
# printing one status line per table.
# This is a regular (non-raw) Python string, so the doubled backslashes below
# reach lakeFS as the Lua escapes \" and \n.
lua_script = """

local aws = require("aws")
local formats = require("formats")
local databricks = require("databricks")
local delta_export = require("lakefs/catalogexport/delta_exporter")
local unity_export = require("lakefs/catalogexport/unity_exporter")


local sc = aws.s3_client(args.aws.access_key_id, args.aws.secret_access_key, args.aws.region)


-- Export Delta Lake tables export:
local delta_client = formats.delta_client(args.lakefs.access_key_id, args.lakefs.secret_access_key, args.aws.region)
local delta_table_locations = delta_export.export_delta_log(action, args.table_defs, sc.put_object, delta_client, "_lakefs_tables")


-- Register the exported table in Unity Catalog:
local databricks_client = databricks.client(args.databricks_host, args.databricks_token)
local registration_statuses = unity_export.register_tables(action, "_lakefs_tables", delta_table_locations, databricks_client, args.warehouse_id)
for t, status in pairs(registration_statuses) do
   print("Unity catalog registration for table \\"" .. t .. "\\" completed with commit schema status : " .. status .. "\\n")
end

"""

# Upload the script to the source branch; it becomes part of the next commit.
branchMain.object(path=luaScriptName).upload(data=lua_script, mode='wb')

#### Define an action configuration that runs the above script after a commit to `main` or any `dev*` branch, or after a `dev*` branch is created, and upload it to lakeFS

In [0]:
# Prefix of the development branches; it is only used below to build the
# 'dev*' branch pattern. A later cell reassigns newBranch to the concrete
# branch name, so re-running this cell afterwards would change the pattern.
newBranch = "dev"

# Databricks connection settings passed to the Lua script as hook arguments.
databricks_host = 'https://<instance-name>.cloud.databricks.com'
databricks_token = '<Databricks personal access token>'
warehouse_id = '<Databricks SQL Warehouse ID>'

# AWS settings passed to the Lua script as hook arguments.
aws_region = '<AWS Region>'
aws_access_key_id = '<AWS Access Key>'
aws_secret_access_key = '<AWS Secret Key>'

# Action definition: which events trigger the hook and which arguments the
# Lua script receives.
# NOTE(review): the lakeFS, AWS and Databricks credentials are embedded in
# this structure and written in plain text to a YAML object in the
# repository - acceptable for a demo, but confirm before using real secrets.
hook_definition = {
   'name': 'unity_exporter',
   'on': {
       # Run after commits on the source branch and on any branch matching 'dev*'.
       'post-commit': {
           'branches': [sourceBranch, newBranch+'*']
       },
       # Run when a branch matching 'dev*' is created.
       'post-create-branch': {
           'branches': [newBranch+'*']
       }
   },
   'hooks': [
       {
           'id': 'Unity-Registration',
           'type': 'lua',
           'properties': {
               # Path of the Lua script uploaded in the previous cell.
               'script_path': luaScriptName,
               # Exposed to the script as the global `args` table.
               'args': {
                   'aws': {
                     'access_key_id': aws_access_key_id,
                     'secret_access_key': aws_secret_access_key,
                     'region': aws_region
                   },
                   'lakefs': {
                       'access_key_id': lakefsAccessKey,
                       'secret_access_key': lakefsSecretKey 
                   },
                   # Names of the table descriptors to export.
                   'table_defs': [table_name],
                   'databricks_host': databricks_host,
                   'databricks_token': databricks_token,
                   'warehouse_id': warehouse_id
               }
           }
       }
   ]
}

# Store the action definition as YAML under _lakefs_actions/ on the source branch.
with branchMain.object(path='_lakefs_actions/unity_exporter_action.yaml').writer() as out:
   yaml.safe_dump(hook_definition, out)

#### Create the Delta Table in source branch

In [0]:
# Column names, in the same order as the fields of each tuple below.
columns = ["firstname","lastname","country","category"]
# Sample rows for the demo table.
data = [
   ('James','Bond','England','intelligence'),
   ('Robbie','Williams','England','music'),
   ('Hulk','Hogan','USA','entertainment'),
   ('Mister','T','USA','entertainment'),
   ('Rafael','Nadal','Spain','professional athlete'),
   ('Paul','Haver','Belgium','music'),
]
df = spark.createDataFrame(data=data, schema=columns)

# Location of the table on the source branch; the 'tables/<name>' suffix
# matches the 'path' entry of the table descriptor.
table_path = f"lakefs://{repositoryName}/{sourceBranch}/tables/{table_name}"

# Write the rows as a Delta table partitioned by category and country,
# replacing whatever is already stored at that location.
(
   df.write
   .format("delta")
   .mode("overwrite")
   .partitionBy("category", "country")
   .save(table_path)
)
df.show()

#### Commit changes and attach some metadata

In [0]:
# Commit the table descriptor, Lua script, action definition and Delta table
# on the source branch. The action definition lists this branch under
# 'post-commit', so the hook is configured to run for this commit.
branchMain.commit(message='Added configuration files and Delta table!', 
        metadata={'using': 'python_api'})

# Demo Starts

#### Go to the SQL Editor, click on the "All" tab, and refresh the schema
##### You will notice the "main" schema under the "lakefs_unity_catalog_demo" catalog.

#### Run SQL to read the data from the main branch

In [0]:
# Query the table through its three-part name: catalog, then the branch name
# used as the schema, then the table name.
df = spark.sql(f"SELECT * FROM `{unity_catalog_name}`.`{sourceBranch}`.`{table_name}`")
df.show()

#### Create a new branch

In [0]:
# Name of the isolated development branch. This reassigns newBranch, which the
# action-definition cell set to "dev" to build its 'dev*' pattern; "dev1"
# matches that pattern.
newBranch = "dev1"
# exist_ok=True mirrors the repository creation earlier in the notebook, so
# re-running this cell does not fail when the branch is already there.
# An existing branch is reused as-is and may already contain changes from a
# previous run.
branchDev = repo.branch(newBranch).create(source_reference=sourceBranch, exist_ok=True)

#### Go back to the SQL Editor and refresh the schema
##### You will notice a new schema for the branch created in the previous step

#### Run SQL to read the data from the new branch

In [0]:
# Same query as for the source branch, but with the new branch name as the
# schema. The result is kept in df and filtered in the next cell.
df = spark.sql(f"SELECT * FROM `{unity_catalog_name}`.`{newBranch}`.`{table_name}`")
df.show()

#### Update Delta Table in the new branch

In [0]:
from pyspark.sql.functions import col

# Location of the table on the development branch.
dev_table_path = f"lakefs://{repositoryName}/{newBranch}/tables/{table_name}"

# Keep only the rows whose country is "USA"; df is the result of the
# preceding query against the development branch.
df_us = df.filter(col("country") == "USA")

# Overwrite the table on the development branch only; the source branch is
# not written to here.
df_us.write.format("delta").mode("overwrite").save(dev_table_path)
df_us.show()

#### Commit changes in the new branch

In [0]:
# Commit the overwritten table on the development branch. The branch name
# matches the 'dev*' pattern listed under 'post-commit' in the action definition.
branchDev.commit(message='Updated delta table!', 
        metadata={'using': 'python_api'})

#### Run SQL to read the data from the new branch

In [0]:
# Read the table again through the development branch schema, after the
# commit above.
df = spark.sql(f"SELECT * FROM `{unity_catalog_name}`.`{newBranch}`.`{table_name}`")
df.show()

#### Run SQL to read the data from the main branch

In [0]:
# Read the table through the source branch schema for comparison with the
# development branch output above.
df = spark.sql(f"SELECT * FROM `{unity_catalog_name}`.`{sourceBranch}`.`{table_name}`")
df.show()

# Demo Completes

## More Questions?

###### Join the [lakeFS Slack group](https://lakefs.io/slack)