# Integration of lakeFS with Labelbox

## Use Case: ML Reproducibility

## Setup Task: Import required Python packages

In [None]:
%xmode Minimal
import os
import requests
from pathlib import Path
import os
import pandas as pd
import labelbox
import datetime
from tabulate import tabulate
from uuid import uuid4 ## to generate unique IDs
import json
from labelbox.schema.ontology import OntologyBuilder, Tool, Classification,Option
import random
from labelbox.data.annotation_types import (
    Label,
    Point,
    LabelList,
    ImageData,
    Rectangle,
    ObjectAnnotation,
)
from labelbox.data.serialization import NDJsonConverter
import time
from labelbox.schema.annotation_import import LabelImport

## Setup Task: lakeFS Upload Objects Function

In [None]:
def upload_files(repo, branch, path, files):
    """Upload local files to a lakeFS branch, one object per file.

    Each file is stored at ``<path>/<basename of file>``. Uses the
    module-level ``lakefs`` client, which must exist before this is called.

    Args:
        repo: Repository identifier, passed to lakeFS as ``repository``.
        branch: Branch to upload to.
        path: Destination prefix inside the branch (joined with ``/``).
        files: Iterable of local file paths to upload.
    """
    for file in files:
        print(file)
        # One file per upload call, passed as the "content" argument.
        # The context manager closes the handle even if the upload raises.
        with open(file, 'rb') as content_to_upload:
            lakefs.objects.upload_object(
                repository=repo,
                branch=branch,
                path=path + '/' + os.path.basename(file),
                content=content_to_upload)

## Setup Task: lakeFS Import Function

In [None]:
def lakefs_import(repo, branch, paths, commitMessage):
    """Start a lakeFS import into ``branch`` and block until it finishes.

    Uses the module-level ``lakefs`` client and polls the import status
    every 2 seconds, printing each status response.

    Args:
        repo: Repository identifier to import into.
        branch: Destination branch for the import.
        paths: Import locations, forwarded to ``ImportCreation``.
        commitMessage: Message for the commit created by the import.

    Raises:
        Exception: If the import status reports an error.
    """
    # Use the ``repo`` argument rather than a notebook-level global so the
    # function imports into the repository the caller actually asked for.
    create_resp = lakefs.import_api.import_start(
        repo,
        branch,
        ImportCreation(paths, CommitCreation(message=commitMessage)))

    # Wait for import to finish
    while True:
        status_resp = lakefs.import_api.import_status(repo, branch, create_resp.id)
        print(status_resp)
        # NOTE(review): the lakeFS API spec names the failure field ``error``;
        # confirm against the installed lakefs_client version. getattr with a
        # default tolerates responses where the field is not set.
        import_error = getattr(status_resp, "error", None)
        if import_error is not None:
            raise Exception(f"Error in import: {import_error}")
        if status_resp.completed:
            print("Import completed Successfully. Data imported into branch:", branch)
            break
        time.sleep(2)

## Setup Task: Create S3 client

In [None]:
import boto3

# S3 client bound to the regional endpoint https://s3.<awsRegion>.amazonaws.com,
# authenticated with the AWS key pair defined earlier in the notebook.
s3 = boto3.client(
    's3',
    endpoint_url=''.join(('https://s3.', awsRegion, '.amazonaws.com')),
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

## Setup Task: Create lakeFS Python client

In [None]:
import lakefs_client
# Wildcard import: presumably the source of the model classes other cells use
# unqualified (e.g. RepositoryCreation, ImportCreation, CommitCreation).
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint
# lakefsAccessKey, lakefsSecretKey and lakefsEndPoint must already be defined
# when this cell runs; they are not set anywhere in this section.
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint

# Module-level client shared by the helper functions and the cells below.
lakefs = LakeFSClient(configuration)

## Setup Task: Verify lakeFS credentials by getting lakeFS version

In [None]:
# Smoke-test the lakeFS client: fetch the server configuration and report the
# version. Failure is reported but does not stop the notebook (best-effort).
print("Verifying lakeFS credentials…")
try:
    v=lakefs.config.get_config()
except Exception as e:
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt/SystemExit still propagate, and show the cause.
    print("🛑 failed to get lakeFS version")
    print(f"Error is {e}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v['version_config']['version']}")

## Setup Task: Define lakeFS Repository

In [None]:
from lakefs_client.exceptions import NotFoundException

# Get the repository named ``repo_name``; if lakeFS reports it as missing,
# create it under ``<storageNamespace>/<repo_name>``. Any other API error
# terminates the process so later cells do not run against a missing repo.
try:
    repo=lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException:
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    try:
        repo=lakefs.repositories.create_repository(repository_creation=RepositoryCreation(name=repo_name,
                                                                                                storage_namespace=f"{storageNamespace}/{repo_name}"))
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        # os._exit() skips stdio flushing, so flush explicitly or the message
        # may be lost; exit non-zero because this is a failure path.
        print(f"Error creating repo {repo_name}. Error is {e}", flush=True)
        os._exit(1)
except lakefs_client.ApiException as e:
    # Same as above: flush before the hard exit and report a failure status.
    print(f"Error getting repo {repo_name}: {e}", flush=True)
    os._exit(1)

## Setup Task: S3A Gateway configuration

##### Note: lakeFS can be configured to work with Spark in two ways:
###### * Access lakeFS using the S3A gateway (the approach configured in the cell below): https://docs.lakefs.io/integrations/spark.html#access-lakefs-using-the-s3a-gateway
###### * Access lakeFS using the lakeFS-specific Hadoop FileSystem: https://docs.lakefs.io/integrations/spark.html#access-lakefs-using-the-lakefs-specific-hadoop-filesystem

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session whose S3A filesystem points at the lakeFS
# endpoint and authenticates with the lakeFS key pair. The "s3" scheme is
# mapped to the S3A filesystem implementation as well.
spark = (
    SparkSession.builder.appName("lakeFS / Jupyter")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

# Bare expression so the notebook renders the session as the cell output.
spark

## Setup Task: Create Labelbox Python client

In [None]:
# Labelbox client authenticated with the API key defined earlier in the notebook.
lb_client = labelbox.Client(api_key=LB_API_KEY)