<img src="./images/logo.svg" alt="lakeFS logo" width="300"/>

# lakeFS Role-Based Access Control (RBAC) Demo

## Use Case: Data Governance - secure your data lake

## Prerequisites

###### This Notebook requires connecting to lakeFS Cloud.
###### Register for the lakeFS Cloud: https://lakefs.cloud/register

## The following image explains the steps you will run in this notebook:

![RBAC](./images/RBAC.png)

## Change your lakeFS credentials

In [13]:
# Connection settings for the lakeFS server. The key pair below looks like a
# sample/placeholder pair -- replace all three values with your own.
# The endpoint is written as a complete URL (explicit scheme), matching the
# form shown in the example comment.
lakefsEndPoint = 'http://127.0.0.1:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

## You can change the lakeFS repo name (use an existing repo or provide a new repo name)

In [14]:
# Name of the repository every later cell operates on.
repo = 'rbac-repo'

## Storage Information
#### Change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [15]:
# Object-store location backing the repository. The value is a placeholder:
# substitute your own bucket, e.g. "s3://username-lakefs-cloud/".
storageNamespace = "s3://<S3 Bucket Name>/"

## Versioning Information

In [16]:
# Branch names and the sample data file name used by the notebook.
sourceBranch, newBranch = 'main', 'experiment1'
fileName = 'lakefs_test.csv'

## Working with the lakeFS Python client API

In [17]:
%xmode Minimal
if not 'superUserClient' in locals():
    import lakefs_client
    from lakefs_client import models
    from lakefs_client.client import LakeFSClient

    # lakeFS credentials and endpoint
    configuration = lakefs_client.Configuration()
    configuration.username = lakefsAccessKey
    configuration.password = lakefsSecretKey
    configuration.host = lakefsEndPoint

    superUserClient = LakeFSClient(configuration)
    print("Created lakeFS client.")

Exception reporting mode: Minimal


## Super User creates "admin1" user

In [18]:
# Describe the new user first, then submit the request as the super user.
admin1Spec = models.UserCreation(id='admin1')
superUserClient.auth.create_user(user_creation=admin1Spec)

ProtocolError: ('Connection aborted.', InvalidURL("URL can't contain control characters. '<lakefs endpoint url>' (found at least ' ')"))

## Super User adds "admin1" user to lakeFS created "Admins" group

In [None]:
# Make "admin1" a member of the "Admins" group.
superUserClient.auth.add_group_membership(group_id='Admins', user_id='admin1')

## Create credentials for "admin1" user

In [None]:
# Issue a key pair for "admin1", show it, and keep both halves so a client
# can be configured with them.
credentials = superUserClient.auth.create_credentials(user_id='admin1')
print(credentials)
admin1AccessKey, admin1SecretKey = (
    credentials.access_key_id,
    credentials.secret_access_key,
)

## Create lakeFS Python client for "admin1" user

In [None]:
# Fresh configuration aimed at the same endpoint, authenticating with the
# key pair issued for "admin1".
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = admin1AccessKey
configuration.password = admin1SecretKey

admin1Client = LakeFSClient(configuration)
print("Created lakeFS client for admin1.")

## Verify user for "admin1Client" Python client

In [None]:
# Sanity check: ask the server which user this client authenticates as
# (expected to be "admin1", whose keys were used to build admin1Client).
admin1Client.auth.get_current_user()

## Now "admin1" will do the rest of the setup
#### "admin1" creates "developer1" user

In [None]:
# Describe the new user first, then submit the request as "admin1".
developer1Spec = models.UserCreation(id='developer1')
admin1Client.auth.create_user(user_creation=developer1Spec)

## "admin1" adds "developer1" user to lakeFS created "Developers" group

In [None]:
# Make "developer1" a member of the "Developers" group.
admin1Client.auth.add_group_membership(group_id='Developers', user_id='developer1')

## Create credentials for "developer1" user

In [None]:
# Issue a key pair for "developer1", show it, and keep both halves so a
# client can be configured with them.
credentials = admin1Client.auth.create_credentials(user_id='developer1')
print(credentials)
developer1AccessKey, developer1SecretKey = (
    credentials.access_key_id,
    credentials.secret_access_key,
)

## Create lakeFS Python client for "developer1" user

In [None]:
# Fresh configuration aimed at the same endpoint, authenticating with the
# key pair issued for "developer1".
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = developer1AccessKey
configuration.password = developer1SecretKey

developer1Client = LakeFSClient(configuration)
print("Created lakeFS client for developer1.")

## Verify user for "developer1Client" Python client

In [None]:
# Sanity check: ask the server which user this client authenticates as
# (expected to be "developer1", whose keys were used to build developer1Client).
developer1Client.auth.get_current_user()

## "admin1" creates "DataScientists" group

In [None]:
# Describe the new group first, then submit the request as "admin1".
dataScientistsGroupSpec = models.GroupCreation(id='DataScientists')
admin1Client.auth.create_group(group_creation=dataScientistsGroupSpec)

## "admin1" attaches lakeFS created "AuthManageOwnCredentials" policy to "DataScientists" group

In [None]:
# Attach the "AuthManageOwnCredentials" policy to the "DataScientists" group.
admin1Client.auth.attach_policy_to_group(group_id='DataScientists', policy_id='AuthManageOwnCredentials')

## "admin1" attaches lakeFS created "FSReadWriteAll" policy to "DataScientists" group

In [None]:
# Attach the "FSReadWriteAll" policy to the "DataScientists" group.
admin1Client.auth.attach_policy_to_group(group_id='DataScientists', policy_id='FSReadWriteAll')

## "admin1" attaches lakeFS created "RepoManagementReadAll" policy to "DataScientists" group

In [None]:
# Attach the "RepoManagementReadAll" policy to the "DataScientists" group.
admin1Client.auth.attach_policy_to_group(group_id='DataScientists', policy_id='RepoManagementReadAll')

## "admin1" creates "data_scientist1" user

In [None]:
# Describe the new user first, then submit the request as "admin1".
dataScientist1Spec = models.UserCreation(id='data_scientist1')
admin1Client.auth.create_user(user_creation=dataScientist1Spec)

## "admin1" adds "data_scientist1" user to "DataScientists" group

In [None]:
# Make "data_scientist1" a member of the "DataScientists" group.
admin1Client.auth.add_group_membership(group_id='DataScientists', user_id='data_scientist1')

## Create credentials for "data_scientist1" user

In [None]:
# Issue a key pair for "data_scientist1", show it, and keep both halves so a
# client can be configured with them.
credentials = admin1Client.auth.create_credentials(user_id='data_scientist1')
print(credentials)
data_scientist1AccessKey, data_scientist1SecretKey = (
    credentials.access_key_id,
    credentials.secret_access_key,
)

## Create lakeFS Python client for "data_scientist1" user

In [None]:
# Fresh configuration aimed at the same endpoint, authenticating with the
# key pair issued for "data_scientist1".
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = data_scientist1AccessKey
configuration.password = data_scientist1SecretKey

data_scientist1Client = LakeFSClient(configuration)
print("Created lakeFS client for data_scientist1.")

## Verify user for "data_scientist1Client" Python client

In [None]:
# Sanity check: ask the server which user this client authenticates as
# (expected to be "data_scientist1", whose keys were used to build
# data_scientist1Client).
data_scientist1Client.auth.get_current_user()

## "admin1" creates "FSBlockMergingToMain" policy to prevent commits to the main branch

In [None]:
# Deny commit creation on the `sourceBranch` branch of every repository.
# The branch segment of the resource ARN is built from `sourceBranch` (the
# branch the notebook later merges into) rather than a hard-coded "main", so
# the policy keeps guarding the right branch if that setting is changed.
admin1Client.auth.create_policy(
    policy=models.Policy(
        id='FSBlockMergingToMain',
        statement=[
            models.Statement(
                effect="deny",
                resource="arn:lakefs:fs:::repository/*/branch/" + sourceBranch,
                action=["fs:CreateCommit"],
            ),
        ],
    )
)

## "admin1" attaches "FSBlockMergingToMain" policy to "DataScientists" group

In [None]:
# Attach the "FSBlockMergingToMain" policy to the "DataScientists" group.
admin1Client.auth.attach_policy_to_group(group_id='DataScientists', policy_id='FSBlockMergingToMain')

## "admin1" creates "FSBlockAccessToPIIData" policy which denies access to any objects in "PII" folder

In [None]:
# Single deny statement covering every fs action on objects under the
# "PII/" prefix of the configured repository.
piiDenyStatement = models.Statement(
    effect="deny",
    resource=f"arn:lakefs:fs:::repository/{repo}/object/PII/*",
    action=["fs:*"],
)
admin1Client.auth.create_policy(
    policy=models.Policy(id='FSBlockAccessToPIIData', statement=[piiDenyStatement])
)

## "admin1" attaches "FSBlockAccessToPIIData" policy to "Developers" group

In [None]:
# Attach the "FSBlockAccessToPIIData" policy to the "Developers" group.
admin1Client.auth.attach_policy_to_group(group_id='Developers', policy_id='FSBlockAccessToPIIData')

## If the repo already exists on your lakeFS server, you can skip the following step; otherwise "admin1" creates a new repo

In [None]:
# Repository definition: name, backing storage location and default branch
# all come from the settings cells.
repoSpec = models.RepositoryCreation(
    name=repo,
    storage_namespace=storageNamespace,
    default_branch=sourceBranch,
)
admin1Client.repositories.create_repository(repository_creation=repoSpec)

## "admin1" protects main branch so no one can write directly to main branch and any subsequent writes must be done via the merge of a branch

In [None]:
# One protection rule whose pattern is the `sourceBranch` name.
protectionRules = [models.BranchProtectionRule(pattern=sourceBranch)]
admin1Client.repositories.set_branch_protection_rules(
    repository=repo,
    branch_protection_rule=protectionRules,
)

## "admin1" tries to upload a file to the "PII" folder on the main branch, but it fails because the main branch is protected

In [None]:
import os
# Open the sample file in a context manager so the handle is closed even when
# the upload raises (this attempt is expected to be rejected because the
# target branch is protected).
# Only a single file per upload, which must be named "content".
with open('/data/'+fileName, 'rb') as contentToUpload:
    uploadResult = admin1Client.objects.upload_object(
        repository=repo,
        branch=sourceBranch,
        path='PII/'+fileName, content=contentToUpload)
# Keep the result as the cell's last expression so the notebook displays it.
uploadResult

## "admin1" creates "ingest-pii-data" branch

In [None]:
# New branch "ingest-pii-data", created from `sourceBranch`.
ingestBranchSpec = models.BranchCreation(name='ingest-pii-data', source=sourceBranch)
admin1Client.branches.create_branch(repository=repo, branch_creation=ingestBranchSpec)

## "admin1" uploads the file to "PII" folder in "ingest-pii-data" branch

In [None]:
import os
# Open the sample file in a context manager so the handle is always closed
# once the upload finishes or fails.
# Only a single file per upload, which must be named "content".
with open('/data/'+fileName, 'rb') as contentToUpload:
    uploadResult = admin1Client.objects.upload_object(
        repository=repo,
        branch='ingest-pii-data',
        path='PII/'+fileName, content=contentToUpload)
# Keep the result as the cell's last expression so the notebook displays it.
uploadResult

## "admin1" commits changes and attaches some metadata

In [None]:
# Commit payload: message plus a small metadata map.
piiCommit = models.CommitCreation(
    message='Added PII file!',
    metadata={'using': 'python_api'},
)
admin1Client.commits.commit(
    repository=repo,
    branch='ingest-pii-data',
    commit_creation=piiCommit,
)

## "admin1" merges "ingest-pii-data" branch to main branch

In [None]:
# Merge "ingest-pii-data" into `sourceBranch` as "admin1".
admin1Client.refs.merge_into_branch(
    repository=repo, source_ref='ingest-pii-data', destination_branch=sourceBranch
)

## "admin1" reads object under "PII" folder successfully

In [None]:
# Read the PII object back as "admin1". The ref follows `sourceBranch`
# (the branch the file was merged into) instead of a hard-coded 'main', so the
# read still targets the right branch if that setting is changed.
admin1Client.objects.get_object(
    repository=repo,
    ref=sourceBranch,
    path='PII/'+fileName).read()

## "data_scientist1" reads object under "PII" folder successfully

In [None]:
# Read the PII object as "data_scientist1". The ref follows `sourceBranch`
# (the branch the file was merged into) instead of a hard-coded 'main', so the
# read still targets the right branch if that setting is changed.
data_scientist1Client.objects.get_object(
    repository=repo,
    ref=sourceBranch,
    path='PII/'+fileName).read()

## But "developer1" can't read object under "PII" folder due to "FSBlockAccessToPIIData" policy attached to "Developers" group

In [None]:
# Attempt the same read as "developer1"; it is expected to be refused because
# of the "FSBlockAccessToPIIData" policy on the "Developers" group. The ref
# follows `sourceBranch` instead of a hard-coded 'main', so the request still
# targets the branch the file was merged into if that setting is changed.
developer1Client.objects.get_object(
    repository=repo,
    ref=sourceBranch,
    path='PII/'+fileName).read()

## "data_scientist1" creates "ds_branch" branch

In [None]:
# New branch "ds_branch", created from `sourceBranch` by "data_scientist1".
dsBranchSpec = models.BranchCreation(name='ds_branch', source=sourceBranch)
data_scientist1Client.branches.create_branch(repository=repo, branch_creation=dsBranchSpec)

## "data_scientist1" uploads a new file to "ds_branch" branch

In [None]:
import os
# Open the data file in a context manager so the handle is always closed
# once the upload finishes or fails.
# Only a single file per upload, which must be named "content".
with open('/data/lakefs_test_new.csv', 'rb') as contentToUpload:
    uploadResult = data_scientist1Client.objects.upload_object(
        repository=repo,
        branch='ds_branch',
        path='ds/lakefs_test_new.csv', content=contentToUpload)
# Keep the result as the cell's last expression so the notebook displays it.
uploadResult

## "data_scientist1" commits changes and attaches some metadata

In [None]:
# Commit payload: message plus a small metadata map.
dsCommit = models.CommitCreation(
    message='Added new data file!',
    metadata={'using': 'python_api'},
)
data_scientist1Client.commits.commit(
    repository=repo,
    branch='ds_branch',
    commit_creation=dsCommit,
)

## But "data_scientist1" can't merge "ds_branch" branch to main branch due to "FSBlockMergingToMain" policy attached to "DataScientists" group

In [None]:
# Attempt to merge "ds_branch" into `sourceBranch` as "data_scientist1"; the
# notebook expects this to be refused by the "FSBlockMergingToMain" policy.
data_scientist1Client.refs.merge_into_branch(
    repository=repo, source_ref='ds_branch', destination_branch=sourceBranch
)

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack