<img src="./images/logo.svg" alt="lakeFS logo" width=300/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<img src="./images/langchain.jpeg" alt="LangChain logo" width=300/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<img src="./images/openai-lockup-black.svg" alt="OpenAI logo" width=250/>

# Integration of lakeFS with LangChain and OpenAI

Use Case: Reproducibility and Data version control for LangChain and LLM/OpenAI Models

See also the [accompanying blog](https://lakefs.io/blog/lakefs-langchain-loader/)

## Config

### lakeFS endpoint and credentials

Change these if you are using a lakeFS server other than the one provided in the samples repo.

In [None]:
# Connection settings for the lakeFS server: endpoint URL plus the access/secret key pair.
lakefsEndPoint = "http://lakefs:8000"  # e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsAccessKey = "AKIAIOSFOLKFSSAMPLES"
lakefsSecretKey = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

### Storage Information

If you're not using the lakeFS server from the samples repo, change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [None]:
# Base location in the underlying object store; the repository name is appended to it
# when the repository is created.
storageNamespace = "s3://example"  # e.g. "s3://bucket"

### OpenAI API Key
##### If you do not have an API key then create a free OpenAI account and API key here: https://platform.openai.com/api-keys

In [None]:
# Placeholder value: replace it with your own OpenAI API key before running the notebook.
openai_api_key = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'

## Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository used throughout this notebook.
repo_name = 'llm-openai-langchain-repo'

### Versioning Information 

In [None]:
# Branch names: the default branch plus one branch per document version.
sourceBranch = 'main'
version1Branch = 'version1'
version2Branch = 'version2'

# Object name of the document that is uploaded to each version branch.
documentName = 'lakeFS Brochure.pdf'

# Path (under a branch) of the Delta table that stores the model's answers.
responsesTable = 'responses'

### Import libraries

In [None]:
import os
import lakefs
from assets.lakefs_demo import print_commit, print_diff

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.document_loaders import LakeFSLoader

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores.faiss import FAISS

### Create a function to load documents from lakeFS repository by using an [official lakeFS document loader for LangChain](https://python.langchain.com/docs/integrations/document_loaders/lakefs)
##### Split documents into smaller chunks, convert documents into OpenAI embeddings and store them in an in-memory vector database (Meta’s [FAISS](https://ai.meta.com/tools/faiss/))

In [None]:
def load_document(repo: str, ref: str, path: str,
                  chunk_size: int = 1000, chunk_overlap: int = 100) -> FAISS:
    """Load a document from lakeFS and index it in a FAISS vector store.

    The document is read with ``LakeFSLoader`` (using the notebook-level
    ``lakefsAccessKey`` / ``lakefsSecretKey`` / ``lakefsEndPoint`` settings),
    split into chunks, embedded with ``OpenAIEmbeddings`` (using the
    notebook-level ``openai_api_key``) and stored in FAISS.

    Args:
        repo: lakeFS repository name passed to the loader.
        ref: lakeFS reference (e.g. a branch name) to read from.
        path: Path of the object inside the repository.
        chunk_size: ``chunk_size`` given to the text splitter (default 1000,
            the value that used to be hard-coded).
        chunk_overlap: ``chunk_overlap`` given to the text splitter
            (default 100, the value that used to be hard-coded).

    Returns:
        A FAISS vector store built from the document chunks.

    Raises:
        ValueError: If loading and splitting produced no chunks to index.
    """
    lakefs_loader = LakeFSLoader(
        lakefs_access_key=lakefsAccessKey,
        lakefs_secret_key=lakefsSecretKey,
        lakefs_endpoint=lakefsEndPoint,
    )
    lakefs_loader.set_repo(repo)
    lakefs_loader.set_ref(ref)
    lakefs_loader.set_path(path)
    docs = lakefs_loader.load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(docs)
    if not chunks:
        # Fail with an explicit message instead of handing an empty list to the vector store.
        raise ValueError(f"No content loaded from lakeFS for repo={repo!r}, ref={ref!r}, path={path!r}")

    return FAISS.from_documents(chunks, embedding=OpenAIEmbeddings(openai_api_key=openai_api_key))

### Create a function to query this data using OpenAI
#### Set up a model and a prompt, into which you will feed documents that are related to the user’s question

In [None]:
def query_document(db: FAISS, document_name: str, query: str,
                   model: str = 'gpt-3.5-turbo-instruct', k: int = 4) -> str:
    """Answer a question about a document using its most similar chunks.

    The ``k`` chunks most similar to ``query`` are fetched from ``db``, joined
    with spaces and inserted into a prompt that is sent to an OpenAI
    completion model (authenticated with the notebook-level ``openai_api_key``).

    Args:
        db: Vector store holding the document chunks.
        document_name: Name of the document, inserted into the prompt.
        query: The user's question.
        model: OpenAI completion model name. The previously hard-coded
            ``text-davinci-003`` has been retired by OpenAI, so the default is
            now ``gpt-3.5-turbo-instruct``; pass another name to override.
        k: Number of similar chunks to retrieve (default 4, the value that
            used to be hard-coded).

    Returns:
        The text returned by the LLM chain.
    """
    related_docs = db.similarity_search(query, k=k)
    docs_content = ' '.join(d.page_content for d in related_docs)
    llm = OpenAI(model=model, temperature=0, openai_api_key=openai_api_key)
    prompt = PromptTemplate(
        input_variables=['question', 'docs', 'document_name'],
        template="""
        You are a helpful document assistant that can answer questions about a document based on the text it contains.
        
        The name of the document is: {document_name}
        Answer the following question: {question}
        By searching the following document: {docs}
        
        Only use factual information from the document to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        Your answers should be detailed.
        """
    )

    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run(question=query, docs=docs_content, document_name=document_name)

### lakeFS S3 gateway config for the Delta table

In [None]:
import pandas as pd
import deltalake

# Storage options passed to the deltalake calls below: the lakeFS endpoint and
# credentials are supplied through the AWS-style option names.
storage_options = {
    "AWS_ACCESS_KEY_ID": lakefsAccessKey,
    "AWS_SECRET_ACCESS_KEY": lakefsSecretKey,
    "AWS_ENDPOINT": lakefsEndPoint,
    "AWS_REGION": "us-east-1",
    "AWS_ALLOW_HTTP": "true",
    "AWS_S3_ALLOW_UNSAFE_RENAME": "true",
}

### Set environment variables

In [None]:
# Export the lakeFS endpoint and credentials through the LAKECTL_* environment variables.
os.environ.update({
    "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
    "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
    "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
})

### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Sanity-check the connection settings by asking the lakeFS server for its version.
print("Verifying lakeFS credentials…")
try:
    v = lakefs.client.Client().version
except Exception as err:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and show the cause so a bad endpoint or key can be diagnosed.
    print("🛑 failed to get lakeFS version")
    print(f"   reason: {err!r}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

### Define lakeFS Repository

In [None]:
# Create the repository under "<storageNamespace>/<repo_name>" with sourceBranch as
# its default branch (exist_ok=True is passed so re-running the cell is tolerated),
# then keep a handle to that default branch.
repo = lakefs.Repository(repo_name).create(
    storage_namespace=f"{storageNamespace}/{repo_name}",
    default_branch=sourceBranch,
    exist_ok=True,
)
branchMain = repo.branch(sourceBranch)
print(repo)

# Main demo starts here 🚦 👇🏻

### Create version1 branch

In [None]:
# Create the version1 branch from the source branch and print the id of the
# commit it points at.
branchVersion1 = repo.branch(version1Branch).create(
    source_reference=sourceBranch,
)
print(
    f"{version1Branch} ref:",
    branchVersion1.get_commit().id,
)

### Upload "lakeFS Brochure.pdf" document to version1 branch

In [None]:
# Read the local copy of the document and upload it to the version1 branch.
# The context manager closes the file handle as soon as the bytes are read
# (the previous open(...).read() left it open).
with open(f"/data/{version1Branch}/{documentName}", 'rb') as source_file:
    contentToUpload = source_file.read()
print(branchVersion1.object(documentName).upload(data=contentToUpload, mode='wb', pre_sign=False))

### Commit changes and attach some metadata

In [None]:
# Commit the uploaded document on the version1 branch, tagging the commit with
# version metadata, then print the resulting commit.
ref = branchVersion1.commit(
    message='Uploaded lakeFS Brochure',
    metadata={'version': 'version1'},
)
print_commit(ref.get_commit())

### Load "lakeFS Brochure.pdf" (version 1) document to vector database

In [None]:
# Build the vector store from the document as it exists on the version1 branch.
db = load_document(repo=repo_name, ref=version1Branch, path=documentName)

### Let's ask these 2 questions

In [None]:
# The two questions that are asked against each version of the document.
question1 = "why lakefs"
question2 = "trusted by?"

### Ask 1st question

In [None]:
# Ask the first question against the currently loaded vector store and show the answer.
question1Response = query_document(db=db, document_name=documentName, query=question1)
print(question1Response)

### Ask 2nd question

In [None]:
# Ask the second question against the currently loaded vector store and show the answer.
question2Response = query_document(db=db, document_name=documentName, query=question2)
print(question2Response)

### Save the responses to a Delta table

In [None]:
# One row per question: document name, version branch, question and the model's answer.
df = pd.DataFrame(
    {
        'Document Name': [documentName] * 2,
        'Version': [version1Branch] * 2,
        'Question': [question1, question2],
        'Answer': [question1Response, question2Response],
    }
)

# Append the rows to the responses Delta table on the version1 branch.
deltalake.write_deltalake(
    table_or_uri=f"s3a://{repo.id}/{version1Branch}/{responsesTable}",
    data=df,
    mode='append',
    storage_options=storage_options,
)

### Commit changes and attach some metadata

In [None]:
# Commit the saved responses on the version1 branch with version metadata,
# then print the resulting commit.
ref = branchVersion1.commit(
    message='Saved responses for the questions',
    metadata={'version': 'version1'},
)
print_commit(ref.get_commit())

### Merge version1 branch to main

In [None]:
# Merge the version1 branch into the main branch and print what merge_into returns.
res = branchVersion1.merge_into(branchMain)
print(res)

### Create version2 branch

In [None]:
# Create the version2 branch from the source branch and print the id of the
# commit it points at.
branchVersion2 = repo.branch(version2Branch).create(
    source_reference=sourceBranch,
)
print(
    f"{version2Branch} ref:",
    branchVersion2.get_commit().id,
)

### Upload 2nd version of the "lakeFS Brochure.pdf" document

In [None]:
# Read the local copy of the second document version and upload it to the
# version2 branch. The context manager closes the file handle as soon as the
# bytes are read (the previous open(...).read() left it open).
with open(f"/data/{version2Branch}/{documentName}", 'rb') as source_file:
    contentToUpload = source_file.read()
print(branchVersion2.object(documentName).upload(data=contentToUpload, mode='wb', pre_sign=False))

### Commit changes and attach some metadata

In [None]:
# Commit the uploaded document on the version2 branch, tagging the commit with
# version metadata, then print the resulting commit.
ref = branchVersion2.commit(
    message='Uploaded lakeFS Brochure',
    metadata={'version': 'version2'},
)
print_commit(ref.get_commit())

### Load "lakeFS Brochure.pdf" (version 2) document to vector database

In [None]:
# Rebuild the vector store from the document as it exists on the version2 branch.
db = load_document(repo=repo_name, ref=version2Branch, path=documentName)

### Ask 1st question by using version2 document

In [None]:
# Ask the first question again, now against the version2 vector store, and show the answer.
question1Response = query_document(db=db, document_name=documentName, query=question1)
print(question1Response)

### Ask 2nd question by using version2 document

In [None]:
# Ask the second question again, now against the version2 vector store, and show the answer.
question2Response = query_document(db=db, document_name=documentName, query=question2)
print(question2Response)

### Save the responses to a Delta table

In [None]:
# One row per question: document name, version branch, question and the model's answer.
df = pd.DataFrame(
    {
        'Document Name': [documentName] * 2,
        'Version': [version2Branch] * 2,
        'Question': [question1, question2],
        'Answer': [question1Response, question2Response],
    }
)

# Append the rows to the responses Delta table on the version2 branch.
deltalake.write_deltalake(
    table_or_uri=f"s3a://{repo.id}/{version2Branch}/{responsesTable}",
    data=df,
    mode='append',
    storage_options=storage_options,
)

### Commit changes and attach some metadata

In [None]:
# Commit the saved responses on the version2 branch with version metadata,
# then print the resulting commit.
ref = branchVersion2.commit(
    message='Saved responses for the questions',
    metadata={'version': 'version2'},
)
print_commit(ref.get_commit())

### Merge version2 branch to main

In [None]:
# Merge the version2 branch into the main branch and print what merge_into returns.
res = branchVersion2.merge_into(branchMain)
print(res)

### Review responses for both versions

In [None]:
# Widen pandas' column display so long answers are not truncated.
pd.set_option('max_colwidth', 2000)

# Open the responses Delta table on the source (main) branch and display it as a
# DataFrame; the bare expression on the last line is what the notebook renders.
responses = deltalake.DeltaTable(
    f"s3a://{repo.id}/{sourceBranch}/{responsesTable}",
    storage_options=storage_options,
)
responses.to_pandas()

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack