# Integration of lakeFS with Glue Catalog and Athena

[📚 Docs](https://docs.lakefs.io/integrations/glue_hive_metastore.html)

## Use Case: Isolated Dev/Test Environments

## Config

### Glue session configuration

In [None]:
%stop_session
%session_id_prefix 'hive-notebook-demo'
%idle_timeout 120
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

%additional_python_modules 'lakefs-client'

### lakeFS endpoint and credentials

In [None]:
# lakeFS server URL, e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsEndPoint = "<lakeFS Endpoint URL>"
# lakeFS key pair; later used as the API client's username/password and as the S3A access/secret keys
lakefsAccessKey = "<lakeFS Access Key>"
lakefsSecretKey = "<lakeFS Secret Key>"

### Object Storage

In [None]:
# Base object-store URI, e.g. "s3://bucket"; the repository is created under "<this value>/<repo name>"
storageNamespace = "s3://<Bucket Name>"

### Glue Catalog Information

In [None]:
# Name of the Glue database; this notebook will create it
glueDatabaseName = 'glue_hive_demo'
# Value passed as --catalog-id in the printed lakectl commands
glueCatalogId = '<Glue Catalog ID or AWS Account ID>'

### Athena Information

In [None]:
# Region used for the Athena client, e.g. "us-east-1"
awsRegion = "<AWS Region>"
# Athena writes query output to s3://<bucket>/<folder>/
BucketNameForAthenaOutput = "<Bucket Name to store Athena Output>"
FolderNameForAthenaOutput = "lakefs-glue-demo"

### Install and configure lakectl (lakeFS command-line tool): https://docs.lakefs.io/reference/cli.html

### Add the following to .lakectl.yaml (change the AWS region and profile name)

In [None]:
metastore:
  type: glue
  glue:
    region: <AWS Region>
    profile: <Profile Name for AWS CLI>

---

## Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository this demo looks up or creates
repo_name = 'glue-hive-demo'

### Create lakeFSClient

In [None]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint
# Build the API configuration from the endpoint and key pair defined in the Config section
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey

# Client object used for all lakeFS API calls in the rest of the notebook
lakefs = LakeFSClient(configuration)

### Define lakeFS Repository

In [None]:
from lakefs_client.exceptions import NotFoundException

# Reuse the repository when it already exists; otherwise create it under
# "<storageNamespace>/<repo_name>". Any other API error stops the cell.
try:
    repo = lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException:
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    try:
        repo = lakefs.repositories.create_repository(
            repository_creation=RepositoryCreation(
                name=repo_name,
                storage_namespace=f"{storageNamespace}/{repo_name}"))
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        print(f"Error creating repo {repo_name}. Error is {e}")
        # Re-raise so the cell fails with the real API error. The previous
        # os._exit(00) referenced `os`, which this notebook never imports, and
        # would have terminated the process with a success exit code.
        raise
except lakefs_client.ApiException as e:
    print(f"Error getting repo {repo_name}: {e}")
    # Same reasoning as above: surface the original error instead of exiting.
    raise

### Set up Spark

In [None]:
# Handle s3:// URIs with the S3A filesystem implementation
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
# Point S3A at the lakeFS endpoint and authenticate with the lakeFS key pair
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", lakefsEndPoint)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", lakefsAccessKey)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", lakefsSecretKey)
# Use path-style addressing for requests to the endpoint
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")

### Import some libraries

In [None]:
from pyspark.sql.types import ByteType, IntegerType, LongType, StringType, StructType, StructField
from pyspark.sql.functions import *

### Versioning Information

In [None]:
# lakeFS branches used by the demo
mainBranch = 'main'
glueTestBranch = 'glue_test_branch'
# Table names, also used as the table directory under each branch
customersTable = 'customers'
ordersTable = 'orders'

### Create Glue Database

In [None]:
# Create the Glue database that holds the main-branch tables (no-op if it already exists)
spark.sql(f"CREATE DATABASE IF NOT EXISTS {glueDatabaseName}").show()

### Define some helper functions

In [None]:
def print_diff_refs(diff_refs):
    """Return a lazy iterator of ``[path, path_type, size_bytes, type]`` rows.

    One row is produced per entry of ``diff_refs.results``. Despite the name,
    nothing is printed here; the caller consumes the returned iterator.
    """
    def as_row(entry):
        return [entry.path, entry.path_type, entry.size_bytes, entry.type]

    return map(as_row, diff_refs.results)

In [None]:
import boto3
import pandas as pd

# Athena client (region from the Config section) and S3 client used by the query helpers below
athena_client = boto3.client('athena', region_name=awsRegion)
s3_client = boto3.client('s3')
# S3 resource handle
s3 = boto3.resource('s3')

def run_query(client, query):
    """Start ``query`` via ``client.start_query_execution`` and return the raw response.

    The query runs in the ``glueDatabaseName`` database, and its output location
    is ``s3://<BucketNameForAthenaOutput>/<FolderNameForAthenaOutput>/``.
    """
    output_location = 's3://{}/{}/'.format(BucketNameForAthenaOutput, FolderNameForAthenaOutput)
    return client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': glueDatabaseName},
        ResultConfiguration={'OutputLocation': output_location},
    )

def validate_query(client, query_id, poll_interval=1.0):
    """Block until the query reaches a terminal state and return that state.

    Args:
        client: Object exposing ``get_query_execution`` (the notebook passes
            its boto3 Athena client).
        query_id: Query execution id to poll.
        poll_interval: Seconds to wait between status checks while the query
            has not finished yet.

    Returns:
        One of ``"FAILED"``, ``"SUCCEEDED"`` or ``"CANCELLED"``.
    """
    import time  # local import keeps this notebook cell self-contained

    terminal_states = ("FAILED", "SUCCEEDED", "CANCELLED")
    while True:
        response = client.get_query_execution(QueryExecutionId=query_id)
        state = response["QueryExecution"]["Status"]["State"]
        if state in terminal_states:
            return state
        # Pause before asking again; polling back-to-back issues a burst of
        # status requests for as long as the query is running.
        time.sleep(poll_interval)

def execute_athena_query(query):
    """Run ``query`` through Athena and return its result as a pandas DataFrame.

    The query is started with ``run_query``, awaited with ``validate_query``,
    and the result is read from ``<FolderNameForAthenaOutput>/<query id>.csv``
    in the ``BucketNameForAthenaOutput`` bucket.

    Raises:
        RuntimeError: if the query ends in any state other than ``SUCCEEDED``.
    """
    print('start query: {}\n'.format(query))
    qe = run_query(athena_client, query)
    query_id = qe["QueryExecutionId"]
    qstate = validate_query(athena_client, query_id)
    print('query state: {}\n'.format(qstate))

    if qstate != "SUCCEEDED":
        # Stop here with a clear message instead of failing later on the S3
        # read of a result file that a failed/cancelled query is not expected
        # to have produced.
        raise RuntimeError(f"Athena query {query_id} finished with state {qstate}")

    file_name = "{}/{}.csv".format(FolderNameForAthenaOutput, query_id)
    obj = s3_client.get_object(Bucket=BucketNameForAthenaOutput, Key=file_name)
    return pd.read_csv(obj['Body'])

### Define CUSTOMER.csv data file schema

In [None]:
# Spark schema for CUSTOMER.csv, built from (column name, Spark type, nullable) triples
customersSchema = StructType([
    StructField(column, spark_type(), nullable)
    for column, spark_type, nullable in (
        ("Customer_ID", IntegerType, False),
        ("Country", StringType, False),
        ("Gender", StringType, False),
        ("Personal_ID", IntegerType, True),
        ("Customer_Name", StringType, False),
        ("Customer_FirstName", StringType, False),
        ("Customer_LastName", StringType, False),
        ("Birth_Date", StringType, False),
        ("Customer_Address", StringType, False),
        ("Street_ID", LongType, False),
        ("Street_Number", IntegerType, False),
        ("Customer_Type_ID", IntegerType, False),
    )
])

In [None]:
# Column list for the customers CREATE EXTERNAL TABLE statements.
# Adjacent literals are concatenated; the spacing inside each piece is kept as is.
customersSchemaForGlue = (
    "Customer_ID int, "
    "  Country string, "
    "  Gender string, "
    "  Personal_ID int, "
    "  Customer_Name string, "
    "  Customer_FirstName string, "
    "  Customer_LastName string, "
    "  Birth_Date string, "
    "  Customer_Address string, "
    "  Street_ID long, "
    "  Street_Number int, "
    "  Customer_Type_ID int"
)

### Define ORDER_FACT.csv data file schema

In [None]:
# Spark schema for ORDER_FACT.csv, built from (column name, Spark type, nullable) triples
ordersSchema = StructType([
    StructField(column, spark_type(), nullable)
    for column, spark_type, nullable in (
        ("Customer_ID", IntegerType, False),
        ("Employee_ID", IntegerType, False),
        ("Street_ID", LongType, False),
        ("Order_Date", StringType, False),
        ("Delivery_Date", StringType, False),
        ("Order_ID", LongType, True),
        ("Order_Type", ByteType, False),
        ("Product_ID", LongType, False),
        ("Quantity", ByteType, False),
        ("Total_Retail_Price", StringType, False),
        ("CostPrice_Per_Unit", StringType, False),
        ("Discount", LongType, False),
    )
])

In [None]:
# Column list for the orders CREATE EXTERNAL TABLE statements.
# Adjacent literals are concatenated; the spacing inside each piece is kept as is.
ordersSchemaForGlue = (
    "Customer_ID int, "
    "  Employee_ID int, "
    "  Street_ID long, "
    "  Order_Date string, "
    "  Delivery_Date string, "
    "  Order_ID long, "
    "  Order_Type int, "
    "  Product_ID long, "
    "  Quantity int, "
    "  Total_Retail_Price string, "
    "  CostPrice_Per_Unit string, "
    "  Discount long"
)

---

# Main demo starts here 🚦 👇🏻

For this demo - we'll be utilizing a dataset - [Orion Star - Sports and outdoors RDBMS dataset](https://www.kaggle.com/datasets/chethanp11/orion-star-sports-and-outdoors-rdbms-dataset) from [Kaggle](https://www.kaggle.com/).

## Run the following command on your computer to clone the lakeFS samples repo along with the sample data used by this notebook:

### git clone https://github.com/treeverse/lakeFS-samples.git

## Print the command and run it on your computer to upload sample data to lakeFS repository

In [None]:
# Print a shell command that uploads ./data/OrionStar to the repository's main branch and commits it.
# The command is only printed here; it has to be run on a machine where lakectl is configured.
# NOTE(review): later cells read <branch>/data/OrionStar/CUSTOMER.csv and ORDER_FACT.csv —
# confirm that this upload destination ("/main/") results in that prefix.
print(f"cd lakeFS-samples && lakectl fs upload -s ./data/OrionStar lakefs://{repo.id}/main/ --recursive && lakectl commit lakefs://{repo.id}/main -m 'Uploaded sample data'")

## Create Customers table in the main branch (using [CUSTOMER.csv](./data/samples/OrionStar/CUSTOMER.csv) file)

In [None]:
# Location of the customers table on the main branch: s3a://<repo>/<branch>/<table>
customersTablePath = f"s3a://{repo.id}/{mainBranch}/{customersTable}"
print(customersTablePath)

#### Register table in Glue catalog

In [None]:
# Register the customers table in the Glue database, with its data located at customersTablePath
spark.sql(f" \
          CREATE EXTERNAL TABLE IF NOT EXISTS {glueDatabaseName}.{customersTable}( \
              {customersSchemaForGlue} \
          ) \
          LOCATION \
              '{customersTablePath}' \
          ").show()

#### Read CSV file and create Hive table

In [None]:
# Read CUSTOMER.csv from the main branch with the explicit schema (first line is a header),
# append its rows to the registered customers table, then preview the first 10 rows
df = spark.read.csv(f"s3a://{repo.id}/{mainBranch}/data/OrionStar/CUSTOMER.csv",header=True,schema=customersSchema)
df.write.format("hive").mode("append").saveAsTable(f"{glueDatabaseName}.{customersTable}")
df.show(10)

## Create Orders table in the main branch (using [ORDER_FACT.csv](./data/samples/OrionStar/ORDER_FACT.csv) file)

In [None]:
# Location of the orders table on the main branch: s3a://<repo>/<branch>/<table>
ordersTablePath = f"s3a://{repo.id}/{mainBranch}/{ordersTable}"
print(ordersTablePath)

#### Register table in Glue catalog

In [None]:
# Register the orders table in the Glue database, with its data located at ordersTablePath
spark.sql(f" \
          CREATE EXTERNAL TABLE IF NOT EXISTS {glueDatabaseName}.{ordersTable}( \
              {ordersSchemaForGlue} \
          ) \
          LOCATION \
              '{ordersTablePath}' \
          ").show()

#### Read CSV file and create Hive table

In [None]:
# Read ORDER_FACT.csv from the main branch with the explicit schema (first line is a header),
# append its rows to the registered orders table, then preview the first 10 rows
df = spark.read.csv(f"s3a://{repo.id}/{mainBranch}/data/OrionStar/ORDER_FACT.csv",header=True,schema=ordersSchema)
df.write.format("hive").mode("append").saveAsTable(f"{glueDatabaseName}.{ordersTable}")
df.show(10)

## Commit changes and attach some metadata

In [None]:
# Commit the customers and orders table data written to the main branch,
# attaching a metadata entry that records how the commit was made
lakefs.commits.commit(
    repository=repo.id,
    branch=mainBranch,
    commit_creation=CommitCreation(
        # The tables are written with format("hive"), so the message says Hive, not Delta
        message='Added customers and orders Hive tables!',
        metadata={'using': 'python_api'}))

## Print the commands and run them on your computer to create symlinks so you can query the tables via Athena

In [None]:
# Print the lakectl create-symlink command for the customers table on the main branch.
# Source and target schema/table are the same Glue database and table.
print(f"lakectl metastore create-symlink \
--repo {repo.id} \
--branch {mainBranch} \
--path {customersTable} \
--from-client-type glue \
--catalog-id {glueCatalogId} \
--from-schema {glueDatabaseName} \
--from-table {customersTable} \
--to-schema {glueDatabaseName} \
--to-table {customersTable} \
")

In [None]:
# Print the lakectl create-symlink command for the orders table on the main branch.
# Source and target schema/table are the same Glue database and table.
print(f"lakectl metastore create-symlink \
--repo {repo.id} \
--branch {mainBranch} \
--path {ordersTable} \
--from-client-type glue \
--catalog-id {glueCatalogId} \
--from-schema {glueDatabaseName} \
--from-table {ordersTable} \
--to-schema {glueDatabaseName} \
--to-table {ordersTable} \
")

## Execute Athena query to read the data

In [None]:
# Query every row of the main-branch customers table through Athena and print the resulting DataFrame
print(execute_athena_query(f'SELECT * FROM "{glueDatabaseName}"."{customersTable}"'))

# 🟢 ETL Job Starts

## Create a new branch

In [None]:
# Create the test branch in the demo repository, using the main branch as its source
lakefs.branches.create_branch(
    repository=repo.id, 
    branch_creation=BranchCreation(
        name=glueTestBranch, source=mainBranch))

### Create Glue Database for the new branch

In [None]:
# Create a separate Glue database for the test branch, named "<database>_<branch>" (no-op if it already exists)
spark.sql(f"CREATE DATABASE IF NOT EXISTS {glueDatabaseName}_{glueTestBranch}").show()

## Create external tables for the new branch

In [None]:
# Location of the customers table on the test branch: s3a://<repo>/<branch>/<table>
customersTablePathETLBranch = f"s3a://{repo.id}/{glueTestBranch}/{customersTable}"
print(customersTablePathETLBranch)

In [None]:
# Register the customers table in the test-branch Glue database, with its data located at customersTablePathETLBranch
spark.sql(f" \
          CREATE EXTERNAL TABLE IF NOT EXISTS {glueDatabaseName}_{glueTestBranch}.{customersTable}( \
              {customersSchemaForGlue} \
          ) \
          LOCATION \
              '{customersTablePathETLBranch}' \
          ").show()

In [None]:
# Location of the orders table on the test branch: s3a://<repo>/<branch>/<table>
ordersTablePathETLBranch = f"s3a://{repo.id}/{glueTestBranch}/{ordersTable}"
print(ordersTablePathETLBranch)

In [None]:
# Register the orders table in the test-branch Glue database, with its data located at ordersTablePathETLBranch
spark.sql(f" \
          CREATE EXTERNAL TABLE IF NOT EXISTS {glueDatabaseName}_{glueTestBranch}.{ordersTable}( \
              {ordersSchemaForGlue} \
          ) \
          LOCATION \
              '{ordersTablePathETLBranch}' \
          ").show()

## Create symlinks for the tables in the new branch (print the commands and run them on your computer)

In [None]:
# Print the lakectl create-symlink command for the customers table on the test branch.
# Source and target schema are both the "<database>_<branch>" Glue database.
print(f"lakectl metastore create-symlink \
--repo {repo.id} \
--branch {glueTestBranch} \
--path {customersTable} \
--from-client-type glue \
--catalog-id {glueCatalogId} \
--from-schema {glueDatabaseName}_{glueTestBranch} \
--from-table {customersTable} \
--to-schema {glueDatabaseName}_{glueTestBranch} \
--to-table {customersTable} \
")

In [None]:
# Print the lakectl create-symlink command for the orders table on the test branch.
# Source and target schema are both the "<database>_<branch>" Glue database.
print(f"lakectl metastore create-symlink \
--repo {repo.id} \
--branch {glueTestBranch} \
--path {ordersTable} \
--from-client-type glue \
--catalog-id {glueCatalogId} \
--from-schema {glueDatabaseName}_{glueTestBranch} \
--from-table {ordersTable} \
--to-schema {glueDatabaseName}_{glueTestBranch} \
--to-table {ordersTable} \
")

## Execute Athena query to read the data from the new branch

In [None]:
# Query every row of the test-branch customers table through Athena and print the resulting DataFrame
print(execute_athena_query(f'SELECT * FROM "{glueDatabaseName}_{glueTestBranch}"."{customersTable}"'))

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack