In [None]:
%%capture --no-stderr
!pip3 install -q google-cloud-aiplatform
!pip3 install -q langchain-google-vertexai
!pip3 install -q langchain-google-genai
!pip3 install -q wikipedia
!pip3 install -q chromadb==0.5.3
!pip3 install -q langchain-community

In [None]:
# Restart the kernel so that the freshly installed libraries are picked up.
# Everything defined before this point is discarded by the restart.
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

# Initial Setup

In [None]:
import os
import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown

key_name = !gcloud services api-keys list --filter="gemini-api-key" --format="value(name)"
key_name = key_name[0]

api_key = !gcloud services api-keys get-key-string $key_name --location="us-central1" --format="value(keyString)"
api_key = api_key[0]

os.environ["GOOGLE_API_KEY"] = api_key

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

## Import Required Libraries

In [None]:
# Silence LangChain deprecation warnings for the rest of the session; this
# filter is installed before the LangChain imports below run.
import warnings
from langchain._api import LangChainDeprecationWarning
warnings.simplefilter("ignore", category=LangChainDeprecationWarning)

# LangChain building blocks used by the tasks below: prompt templating,
# Wikipedia loading, text splitting, output parsing, runnables and the
# Chroma vector store.
from langchain import PromptTemplate
from langchain import hub
from langchain.docstore.document import Document
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import Chroma

In [None]:
# Project information: the project ID is read from the active gcloud
# configuration, the location is a fixed (form-editable) value.
import sys
import subprocess

_project_lookup_cmd = ["gcloud", "config", "get-value", "project"]
_project_lookup_output = subprocess.check_output(_project_lookup_cmd, text=True)

PROJECT_ID = _project_lookup_output.strip()
LOCATION = "us-central1"  # @param {type:"string"}

print(f"Your project ID is: {PROJECT_ID}")

## Task 1. Load `Documents` from Wikipedia

In [None]:
# Load Wikipedia articles matching the query below.
# Parameters required by the task:
#  * query: "Gemini GPT-4"
#  * load_max_docs: 10
# Reference: https://python.langchain.com/docs/integrations/document_loaders/wikipedia

query = "Gemini GPT-4"
max_docs = 10

wikipedia_loader = WikipediaLoader(query=query, load_max_docs=max_docs)
documents = wikipedia_loader.load()

# Show how many documents were loaded.
len(documents)

## Task 2. Use `RecursiveCharacterTextSplitter` to split Documents

In [None]:
# Split the loaded documents into smaller, overlapping chunks for indexing.
# Reference: https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,    # upper bound on the size of each chunk
    chunk_overlap=1000,  # overlap shared between consecutive chunks
)
docs = text_splitter.split_documents(documents)

print(f"# of documents = {len(docs)}")

## Task 3. Index Documents in Chroma DB Vector Store

In [None]:
# Create the Vertex AI embedding model used to index and query the documents.
# Model names: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#models
# Warning messages emitted while running this cell can be ignored.

from langchain_google_vertexai import VertexAIEmbeddings

embeddings = VertexAIEmbeddings(
    model_name="text-embedding-004",
)


In [None]:
# Index the chunks loaded from Wikipedia into Chroma DB as embeddings and
# persist the collection to disk.
# https://python.langchain.com/docs/integrations/vectorstores/chroma

# Give every chunk a stable ID derived from its position. Without explicit IDs
# each run of this cell stores the same chunks again under freshly generated
# IDs, so the persisted collection accumulates duplicates and the retriever
# starts returning repeated documents. With stable IDs a re-run overwrites the
# existing entries instead.
# NOTE: if a later run produces fewer chunks, entries left over from the
# longer run remain in ./chroma_db until the directory is removed.
chunk_ids = [f"chunk-{position}" for position in range(len(docs))]

vectorstore = Chroma.from_documents(
    documents=docs,                 # Data
    embedding=embeddings,           # Embedding model
    ids=chunk_ids,                  # Stable IDs keep re-runs idempotent
    persist_directory="./chroma_db" # Directory to save data
)

In [None]:
# Open the collection persisted under ./chroma_db, using the same embedding
# model for queries that was used for indexing.
vectorstore_disk = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

## Task 4. Set Up a Retriever

In [None]:
# Expose the persisted Chroma DB collection as a `Retriever` for querying the
# documents, with the k value set to 10.
# https://python.langchain.com/docs/integrations/vectorstores/chroma#retriever-options

retriever_search_kwargs = {"k": 10}
retriever = vectorstore_disk.as_retriever(search_kwargs=retriever_search_kwargs)

In [None]:
# Smoke-test the retriever with a sample query and display what it returns.
retriever_test_query = "Google Gemini"
doc = retriever.invoke(retriever_test_query)
doc

## Task 5. Set Up the Model and Build a LangChain `Chain`

In [None]:
# Create the Gemini chat model used to answer questions.
# https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
# The output must be the least random configurable, so the sampling
# temperature is set to 0 (it was previously 0.9, which contradicted that
# requirement).
# NOTE(review): confirm that the "gemini-1.0-pro" model ID is still served by
# the Gemini API; update it to a currently available model if it is not.
from langchain_google_genai import ChatGoogleGenerativeAI

model = ChatGoogleGenerativeAI(model="gemini-1.0-pro", temperature=0, top_p=0.85)

In [None]:
# Prompt sent to Gemini. It has two placeholders: {question} (the user's
# question) and {context} (the text of the retrieved documents).
llm_prompt_template = """You are an assistant for question-answering tasks.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Use five sentences maximum and keep the answer concise.\n
Question: {question} \nContext: {context} \nAnswer:"""

prompt = PromptTemplate.from_template(template=llm_prompt_template)

# Show the resulting PromptTemplate object.
print(prompt)

In [None]:
def format_docs(docs):
    """Join the text of several documents into a single string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    page_texts = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [None]:
# Assemble the question-answering chain. The incoming question is passed
# through unchanged as "question", and is also sent to the retriever, whose
# documents are flattened by `format_docs` to form "context". Both values fill
# `prompt`, which is sent to `model`; the reply is parsed into a plain string.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

chain = chain_inputs | prompt | model | StrOutputParser()

In [None]:
# Ask the chain a question; the answer is displayed as the cell's output.
question = "What is Gemini?"
chain.invoke(question)