# Retrieval Augmented Generation with Google Gemini and BigQuery


## Install libraries

In [None]:
%pip install --upgrade --quiet  langchain langchain-google-vertexai google-cloud-bigquery unstructured beautifulsoup4

## Set up

In [1]:
!gcloud config set project derrick-doit-sandbox --quiet

Updated property [core/project].


### Create BigQuery Dataset

In [2]:
PROJECT_ID = "derrick-doit-sandbox"
REGION = "US"
DATASET = "vector_search"
TABLE = "doc_and_vectors"
SITEMAP='https://docs.flutter.dev/sitemap.xml'

In [3]:
from google.cloud import bigquery

client = bigquery.Client(project=PROJECT_ID, location=REGION)
client.create_dataset(dataset=DATASET, exists_ok=True)

Dataset(DatasetReference('derrick-doit-sandbox', 'vector_search'))

### Embeddings

Embeddings are a way to store data of all types (including images, audio files, text, documents, etc.) in number arrays called vectors.

Vertex AI Embeddings for Text has an embedding space with 768 dimensions.

Let's visualize the embedding space of the 8 million Stack Overflow questions!

![link text](https://storage.googleapis.com/gweb-cloudblog-publish/images/4._Nomic_AI_Atlas.max-2200x2200.png)
Credit to:
- https://atlas.nomic.ai/
- https://cloud.google.com/blog/products/ai-machine-learning/how-to-use-grounding-for-your-llms-with-text-embeddings

### Use Vertex AI Embeddings model

In [4]:
from langchain_google_vertexai import VertexAIEmbeddings

embedding = VertexAIEmbeddings(
    model_name="textembedding-gecko@001", project=PROJECT_ID
)

### Use BigQuery as Vector Store

In [5]:
from langchain.vectorstores.utils import DistanceStrategy
from langchain_community.vectorstores import BigQueryVectorSearch

In [6]:
### Patch BigQueryVectorSearch
from typing import Any, Dict, List, Optional, Tuple
from langchain_core.documents import Document
import json

DEFAULT_TOP_K = 4  # default number of documents returned from similarity search

class PatchedBigQueryVectorSearch(BigQueryVectorSearch):
    def _search_with_score_and_embeddings_by_vector(
        self,
        embedding: List[float],
        k: int = DEFAULT_TOP_K,
        filter: Optional[Dict[str, Any]] = None,
        brute_force: bool = False,
        fraction_lists_to_search: Optional[float] = None,
    ) -> List[Tuple[Document, List[float], float]]:
        from google.cloud import bigquery

        # Create an index if no index exists.
        if not self._have_index and not self._creating_index:
            self._initialize_vector_index()
        # Prepare filter
        filter_expr = "TRUE"
        if filter:
            filter_expressions = []
            for i in filter.items():
                if isinstance(i[1], float):
                    expr = (
                        "ABS(CAST(JSON_VALUE("
                        f"base.`{self.metadata_field}`,'$.{i[0]}') "
                        f"AS FLOAT64) - {i[1]}) "
                        f"<= {sys.float_info.epsilon}"
                    )
                else:
                    val = str(i[1]).replace('"', '\\"')
                    expr = (
                        f"JSON_VALUE(base.`{self.metadata_field}`,'$.{i[0]}')"
                        f' = "{val}"'
                    )
                filter_expressions.append(expr)
            filter_expression_str = " AND ".join(filter_expressions)
            filter_expr += f" AND ({filter_expression_str})"
        # Configure and run a query job.
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ArrayQueryParameter("v", "FLOAT64", embedding),
            ],
            use_query_cache=False,
            priority=bigquery.QueryPriority.BATCH,
        )
        if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
            distance_type = "EUCLIDEAN"
        elif self.distance_strategy == DistanceStrategy.COSINE:
            distance_type = "COSINE"
        # Default to EUCLIDEAN_DISTANCE
        else:
            distance_type = "EUCLIDEAN"
        if brute_force:
            options_string = ",options => '{\"use_brute_force\":true}'"
        elif fraction_lists_to_search:
            if fraction_lists_to_search == 0 or fraction_lists_to_search >= 1.0:
                raise ValueError(
                    "`fraction_lists_to_search` must be between " "0.0 and 1.0"
                )
            options_string = (
                ',options => \'{"fraction_lists_to_search":'
                f"{fraction_lists_to_search}}}'"
            )
        else:
            options_string = ""
        query = f"""
            SELECT
                base.*,
                distance AS _vector_search_distance
            FROM VECTOR_SEARCH(
                TABLE `{self.full_table_id}`,
                "{self.text_embedding_field}",
                (SELECT @v AS {self.text_embedding_field}),
                distance_type => "{distance_type}",
                top_k => {k}
                {options_string}
            )
            WHERE {filter_expr}
            LIMIT {k}
        """
        document_tuples: List[Tuple[Document, List[float], float]] = []
        # TODO(vladkol): Use jobCreationMode=JOB_CREATION_OPTIONAL when available.
        job = self.bq_client.query(
            query, job_config=job_config, api_method=bigquery.enums.QueryApiMethod.QUERY
        )
        # Process job results.
        for row in job:
            metadata = row[self.metadata_field]
            if metadata:
                metadata = json.loads(json.dumps(metadata))
            else:
                metadata = {}
            metadata["__id"] = row[self.doc_id_field]
            metadata["__job_id"] = job.job_id
            doc = Document(page_content=row[self.content_field], metadata=metadata)
            document_tuples.append(
                (doc, row[self.text_embedding_field], row["_vector_search_distance"])
            )
        return document_tuples

In [7]:
store = PatchedBigQueryVectorSearch(
    project_id=PROJECT_ID,
    dataset_name=DATASET,
    table_name=TABLE,
    location=REGION,
    embedding=embedding,
    distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
)

# Document processing


## Parse the sitemap

In [8]:
import requests
from bs4 import BeautifulSoup

def parse_sitemap(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "xml")
    urls = [element.text for element in soup.find_all("loc")]
    return urls

sites = parse_sitemap(SITEMAP)

In [9]:
sites_filtered = [url for url in sites if '.css' not in url and '.json' not in url]
sites_filtered

['https://docs.flutter.dev',
 'https://docs.flutter.dev/404',
 'https://docs.flutter.dev/add-to-app',
 'https://docs.flutter.dev/add-to-app/android',
 'https://docs.flutter.dev/add-to-app/android/add-flutter-fragment',
 'https://docs.flutter.dev/add-to-app/android/add-flutter-screen',
 'https://docs.flutter.dev/add-to-app/android/add-flutter-view',
 'https://docs.flutter.dev/add-to-app/android/plugin-setup',
 'https://docs.flutter.dev/add-to-app/android/project-setup',
 'https://docs.flutter.dev/add-to-app/debugging',
 'https://docs.flutter.dev/add-to-app/ios',
 'https://docs.flutter.dev/add-to-app/ios/add-flutter-screen',
 'https://docs.flutter.dev/add-to-app/ios/project-setup',
 'https://docs.flutter.dev/add-to-app/multiple-flutters',
 'https://docs.flutter.dev/add-to-app/performance',
 'https://docs.flutter.dev/brand',
 'https://docs.flutter.dev/codelabs',
 'https://docs.flutter.dev/codelabs/implicit-animations',
 'https://docs.flutter.dev/community/china',
 'https://docs.flutter.de

In [10]:
len(sites_filtered)

503

In [11]:
sites_filtered[100]

'https://docs.flutter.dev/cookbook/testing/unit/introduction'

## Load page content using LangChains UnstructuredURLLoader

In [12]:
from langchain.document_loaders import UnstructuredURLLoader
loader = UnstructuredURLLoader(urls=sites_filtered)
documents = loader.load()
len(documents)

In [13]:
documents[100]

## Chunking

In [20]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 2000,
    chunk_overlap  = 200)

document_chunks = text_splitter.split_documents(documents)

print(f"Number documents {len(documents)}")
print(f"Number chunks {len(document_chunks)}")

Created a chunk of size 2041, which is longer than the specified 2000
Created a chunk of size 5157, which is longer than the specified 2000
Created a chunk of size 2192, which is longer than the specified 2000


Number documents 502
Number chunks 5431


In [21]:
document_chunks[100]

Document(page_content='The Theme.of(context) method looks up the widget tree and retrieves\nthe nearest Theme in the tree.\nIf you have a standalone Theme, that’s applied.\nIf not, Flutter applies the app’s theme.\nIn the following example, the Container constructor uses this technique to set its color.\nOverride a theme\nTo override the overall theme in part of an app,\nwrap that section of the app in a Theme widget.\nYou can override a theme in two ways:\nCreate a unique ThemeData instance.\nExtend the parent theme.\nSet a unique ThemeData instance\nIf you want a component of your app to ignore the overall theme,\ncreate a ThemeData instance.\nPass that instance to the Theme widget.\nExtend the parent theme\nInstead of overriding everything, consider extending the parent theme.\nTo extend a theme, use the copyWith() method.\nWatch a video on Theme\nTo learn more, watch this short Widget of the Week video on the Theme widget:\nTry an interactive example', metadata={'source': 'https://

# Embeddings for documents



## Create embedding for all document chunks

In [22]:
store.add_documents(documents=document_chunks, embedding=embedding)

['6a19df18ae734887b66a42221e107535',
 '8c5df770dedd43629d92a09392019080',
 'd20f2502bcba4d01b28de4b0e665eb57',
 '6280ff59b1b842a89b07fd2995ed4266',
 '0fb6b07ea6024dcaa0101fd7d46f3edd',
 'e8f9ed01a488443087968d74cc06ee1d',
 'a8c164f2a478457d8f33f930b83603f7',
 '3c1aac884354473091d9497675e76498',
 'f44202ec78e34ea18fe4a078f3e48e34',
 '276509762bc643f393f8a26a32dcf6b3',
 '355e24a14f784d008f0040459ba5b14a',
 '2d281fb729454624aa4179c23c28bf62',
 '6379171554f241d2901d04659e0f2197',
 'b5c86370393549769a781f1156412c3d',
 '7994826de00345f9a3842f67413dc440',
 '23cf5d43eeca4ddcb99bf690589ca5c2',
 '825c925d5e44414a9ebcfe0ad4e26be3',
 '72671336854d4b9396cce2e6faa09b78',
 '7692a64f265e4cbd90479aa84b097c03',
 'c7e06d8a08cc44cab791aaa13ca2b887',
 'c3b20938a6a6423b8bde9581b106a7cc',
 '1e6e5b1cce5844d88bf0d29ad2067f81',
 '7556d9c1a82d44368064b6c2fa86ba4e',
 '3b818fdb84a247a98ae31fb446cf1477',
 '8fa69ee0209d4662b50db768bce57a16',
 '2f298501c4c9462d9f6dd2d9783533cc',
 '0e5979f6898d4a09b11267ba14eba6ed',
 

In [14]:
question = "What is Flutter?"

In [15]:
store.similarity_search(question, k=8)

[Document(page_content='It speeds app development and reduces the cost and complexity\nof app production across platforms.\nFor designers, Flutter provides a canvas for\nhigh-end user experiences. Fast Company described\nFlutter as one of the top design ideas of the decade for\nits ability to turn concepts into production code\nwithout the compromises imposed by typical frameworks.\nIt also acts as a productive prototyping tool\nwith drag-and-drop tools like FlutterFlow\nand web-based IDEs like Zapp!.\nFor engineering managers and businesses,\nFlutter allows the unification of app\ndevelopers into a single mobile, web,\nand desktop app team, building branded\napps for multiple platforms out of a single codebase.\nFlutter speeds feature development and synchronizes\nrelease schedules across the entire customer base.\nHow much development experience do I need to use Flutter?\nFlutter is approachable to programmers familiar with\nobject-oriented concepts (classes, methods, variables,\netc

In [22]:
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA
from langchain_google_vertexai import VertexAI


template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.

Follow exactly those 4 steps:
1. Read the context below and aggregrate this data
Context : {context}
2. Answer the question using only this context, have detailed explanation
3. Show all the source URL for your answers
4. The answer should be in following format. Keep an eye on the changeline and don't truncate the link:

**Question**: {question}
\n**Answer**:
\n**Source**:
"""

prompt = ChatPromptTemplate.from_template(template)

llm = VertexAI(model_name="gemini-pro", temperature=0)

def generate_response(question):
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=store.as_retriever(),
        chain_type_kwargs={"prompt": prompt}
    )
    result = qa_chain({"query": question})
    print(result["result"])

question = "How to remote debug Flutter app?"
generate_response(question)

**Question**: How to remote debug Flutter app?

**Answer**: There are multiple ways to remote debug Flutter apps.
1. **Terminal**: Run 'flutter attach' or 'flutter attach -d deviceId' to attach from the terminal.
2. **VS Code**: Build the iOS version of the Flutter app in the Terminal.
 - Run the 'flutter build ios --config-only --no-codesign --debug' command.
 - Open the Flutter app directory in VS Code and select the device to debug.
 - Click the debug icon to launch the app and wait for it to connect.
3. **Xcode**: Attach to the Flutter process in Xcode by going to 'Debug' > 'Attach to Process' > 'Runner'.
 - Copy the Dart VM service URI from the Xcode debug area.
 - In VS Code, open the command palette and type 'debug'.
 - Click the 'Debug: Attach to Flutter on Device' command and paste the URI in the 'Paste an VM Service URI' box.

**Source**:
https://docs.flutter.dev/development/tools/debugging
