In [None]:
!pip install --upgrade --user google-cloud-aiplatform>=1.29.0 google-cloud-storage langchain pypdf ratelimit backoff langchain-google-vertexai google-cloud-bigquery

In [None]:
# Restart the kernel after the installs so the environment can see the new packages.
import IPython

# Ask the running application's kernel to shut down; passing True requests a restart.
IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# Read the active project ID from the local gcloud configuration.
# The `!` capture returns the command output as a list of lines; the first line is used.
PROJECT_ID = ! gcloud config get-value project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "us-central1" # @param {type:"string"}
DOCUMENT_URL = "https://www.nyc.gov/assets/doh/downloads/pdf/rii/fpc-manual.pdf" # @param {type:"string"}

# Fall back to a manually entered project ID when the value read above is the
# literal "(unset)", i.e. the lookup did not produce a usable project.
if PROJECT_ID == "(unset)":
  PROJECT_ID = "[your-project-id]" # @param {type:"string"}

# Echo the project that the rest of the notebook will use.
print(PROJECT_ID)

## Initialize Vertex AI

In [None]:
# Initialize the aiplatform package with the project and location chosen in
# the configuration cell, so later calls do not have to pass them explicitly.
from google.cloud import aiplatform
aiplatform.init(project=PROJECT_ID, location=LOCATION)

## Just test the Embeddings model

In [None]:
from vertexai.language_models import TextEmbeddingModel

def text_embedding(text_to_embed) -> list:
    """Return the embedding vector for a single piece of text.

    Args:
        text_to_embed: The text to embed.

    Returns:
        The embedding values the model produced for the input text.

    Raises:
        ValueError: If the model returns no embeddings for the input.
    """
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@002")
    embeddings = model.get_embeddings([text_to_embed])
    if not embeddings:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("The embedding model returned no embeddings for the input text.")
    # Exactly one text was sent, so the single result is the vector we want.
    return embeddings[0].values

In [None]:
emb1 = text_embedding("Hello World")
# Show the vector size and a short preview instead of dumping every value.
print(len(emb1))
print(emb1[:5])

## Read the PDF and Split it into pages

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF found at DOCUMENT_URL and let the loader split it into documents.
pdf = PyPDFLoader(DOCUMENT_URL)
pdf_documents = pdf.load_and_split()

# Keep only the text of each document; the following cells work on plain strings.
pages = [document.page_content for document in pdf_documents]

In [None]:
print(len(pages))
if pages:
    # Preview page 125 when the document is long enough; otherwise fall back to
    # the last page so a shorter document does not raise IndexError.
    sample_index = min(125, len(pages) - 1)
    print(pages[sample_index][:200])
    print(len(pages[0]))

## Create an Embedding from 1 page

In [None]:
emb1 = text_embedding(pages[0])
# Show the vector size and a short preview instead of dumping every value.
print(len(emb1))
print(emb1[:5])

## Use the model to clean up all the pages

In [None]:
# Prompt template used to clean up each page. The "{0}" placeholder is filled
# with the page text through cleanup_prompt.format(page) in the cleaning loop.
# The template asks the model to fix spelling/grammar, drop content unrelated
# to restaurant and food safety, and return the edited text.
cleanup_prompt = """
context: Edit the following data surrounded by triple back ticks.

1. Correct spelling and grammar mistakes.
2. Remove data not related to restaurant and food safety.
3. Return the edited data.

```
Data: {0}
```
cleaned data:

"""

In [None]:
from google.cloud.aiplatform_v1beta1.types.content import SafetySetting, HarmCategory

# Collects the model output for every page, in page order.
cleaned_pages = []

# Use the BLOCK_NONE threshold for each of the four harm categories below.
safety_settings = [
    SafetySetting(
        category=harm_category,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE,
    )
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
]

def generate(prompt):
  """Send `prompt` to the "gemini-pro" model and return the generated text.

  Args:
    prompt: The complete prompt text to send to the model.

  Returns:
    The text of the model response. When the text cannot be read from the
    response, the placeholder string "An Error Ocuured Creaning the Data" is
    returned instead so that the caller still gets one entry per page.
  """
  # Imported here so the function works on a fresh kernel: no other cell in
  # this notebook brings GenerativeModel into scope.
  from vertexai.preview.generative_models import GenerativeModel

  model = GenerativeModel("gemini-pro")
  response = model.generate_content(
    prompt,
    generation_config={
        "max_output_tokens": 8192,
        "temperature": 0.5,
        "top_p": 0.5,
        "top_k": 10,
    },
    stream=False,
    safety_settings=safety_settings,
  )
  try:
    return response.text
  except Exception as error:
    # NOTE(review): presumably raised when the response holds no usable
    # candidate (for example a blocked or empty answer) - confirm against the
    # SDK docs. Catching Exception instead of a bare except keeps
    # KeyboardInterrupt working and lets us report the actual reason.
    print("An Error Ocuured Creaning the Data")
    print(f"  reason: {error!r}")
    return "An Error Ocuured Creaning the Data"

# Clean every page with the model, in order, reporting progress after each one
for page in pages:
    # Fill the prompt template with the page text and keep whatever the model returns
    cleaned_pages.append(generate(prompt=cleanup_prompt.format(page)))

    # Running count of the pages processed so far
    print(len(cleaned_pages))

## Generate the embeddings in batches

Below are helper functions that rate-limit and batch the generation of the embeddings.

In [None]:
from typing import Generator, List, Optional, Tuple
import functools
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from tqdm import tqdm
import math

from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# Embedding helper: one API request per batch of chunks
def encode_texts_to_embeddings(chunks: List[str]) -> List[Optional[List[float]]]:
    """Embed a batch of text chunks with the text embedding model.

    Args:
        chunks: The texts to embed in a single request.

    Returns:
        One entry per chunk, in order: the embedding values on success, or
        None for every chunk of the batch when the request fails. Failures
        are reported on stdout rather than raised (best effort).
    """
    if not chunks:
        # Nothing to embed; skip the API round trip.
        return []

    try:
        model = TextEmbeddingModel.from_pretrained("textembedding-gecko@002")

        # Wrap each chunk in a TextEmbeddingInput tagged as RETRIEVAL_DOCUMENT.
        inputs = [
            TextEmbeddingInput(text=chunk, task_type="RETRIEVAL_DOCUMENT")
            for chunk in chunks
        ]
        embeddings = model.get_embeddings(inputs)

        # You could also generate the embeddings without the task_type by
        # passing the plain strings: model.get_embeddings(chunks).
        # In a real app, test it both ways.

        return [embedding.values for embedding in embeddings]
    except Exception as error:
        # Keep the best-effort contract (None per chunk) but say why the batch
        # failed, so quota or authentication problems are not silently hidden.
        print(f"Embedding request failed for {len(chunks)} chunk(s): {error!r}")
        return [None] * len(chunks)


# Lazily slice the chunk list into consecutive batches
def generate_batches(
    chunks: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    """Yield consecutive slices of `chunks`, each holding up to `batch_size` items.

    The final slice may be shorter when len(chunks) is not a multiple of
    `batch_size`.
    """
    batch_starts = range(0, len(chunks), batch_size)
    yield from (chunks[start : start + batch_size] for start in batch_starts)


def encode_text_to_embedding_batched(
    chunks: List[str], api_calls_per_minute: int = 20, batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:
    """Embed `chunks` in rate-limited batches.

    Batches are submitted to a thread pool one at a time, pausing
    60 / api_calls_per_minute seconds after each submission.

    Args:
        chunks: The texts to embed.
        api_calls_per_minute: Maximum number of batch requests submitted per
            minute. Must be positive.
        batch_size: Number of chunks sent in one request. Must be positive.

    Returns:
        A tuple (is_successful, embeddings). `is_successful` holds one flag
        per chunk, in order. `embeddings` is a 2-D array that stacks only the
        embeddings that succeeded, so row i belongs to the i-th True flag.
        When nothing was embedded the array has shape (0, 0).

    Raises:
        ValueError: If `api_calls_per_minute` or `batch_size` is not positive.
    """
    if api_calls_per_minute <= 0:
        raise ValueError("api_calls_per_minute must be a positive number")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive number")
    if not chunks:
        return [], np.empty((0, 0))

    seconds_per_job = 60 / api_calls_per_minute
    embeddings_list: List[Optional[List[float]]] = []

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            generate_batches(chunks, batch_size),
            total=math.ceil(len(chunks) / batch_size),
            position=0,
        ):
            futures.append(executor.submit(encode_texts_to_embeddings, batch))
            # Pace the submissions to stay under the requested request rate.
            time.sleep(seconds_per_job)

        # Collect in submission order so results line up with `chunks`.
        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [embedding is not None for embedding in embeddings_list]
    successful_embeddings = [
        embedding for embedding in embeddings_list if embedding is not None
    ]
    if not successful_embeddings:
        # np.stack rejects an empty list; return an empty array instead.
        return is_successful, np.empty((0, 0))

    # No np.squeeze here: with a single successful embedding it would collapse
    # the result to 1-D and callers indexing rows would get scalars.
    return is_successful, np.stack(successful_embeddings)


In [None]:
# Embed every cleaned page at up to 100 batch requests per minute.
# The call returns a tuple; later cells read the embedding array from index 1.
embeddings = encode_text_to_embedding_batched(cleaned_pages, api_calls_per_minute=100)

In [None]:
# Index 1 of the batched result holds the embedding array.
embeddings_array = embeddings[1]
# Sequential ids, one per entry in pages, starting at 0.
ids = list(range(len(pages)))

In [None]:
# The four collections are expected to have the same length.
print(len(cleaned_pages))
print(len(pages))
print(len(ids))
print(len(embeddings_array))
print("--------------------------")
# Spot-check one row across all collections. Use row 50 when it exists,
# otherwise the last row shared by all of them, so short inputs do not raise
# IndexError.
available_rows = min(len(ids), len(pages), len(cleaned_pages), len(embeddings_array))
if available_rows:
    sample_index = min(50, available_rows - 1)
    print(ids[sample_index])
    print(pages[sample_index][:50])
    print(cleaned_pages[sample_index][:50])
    print(embeddings_array[sample_index][:5])

## Create the JSON file to Import into BigQuery



In [None]:
import json

FILE_NAME = "embeddings_for_bq.jsonl"

# embeddings_array only contains rows for pages that were embedded
# successfully, while embeddings[0] flags every page. Drop the failed pages
# first so each remaining page is paired with its own embedding instead of
# shifting onto a neighbour's.
is_successful = embeddings[0]
embedded_rows = [
    (page_id, page, cleaned_page)
    for page_id, page, cleaned_page, succeeded in zip(ids, pages, cleaned_pages, is_successful)
    if succeeded
]

# np.atleast_2d keeps one row per page even when the array arrives as a single
# 1-D vector (exactly one successful embedding).
objects = [
    {"id": page_id, "page": page, "cleaned_page": cleaned_page, "embedding": embedding.tolist()}
    for (page_id, page, cleaned_page), embedding in zip(embedded_rows, np.atleast_2d(embeddings_array))
]

# Write one JSON object per line (JSON-L), the format loaded into BigQuery below
with open(FILE_NAME, "w") as f:
    for obj in objects:
        f.write(json.dumps(obj) + "\n")


### Write the JSON file to a Cloud Storage Bucket

In [None]:
# Create a Cloud Storage bucket named vertex-assessment-dar if it does not already exist
from google.cloud import storage

BUCKET_NAME = "vertex-assessment-dar"
BUCKET_URI = f"gs://{BUCKET_NAME}"

storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

# Create the bucket only when it is not there yet
bucket_already_exists = bucket.exists()
if not bucket_already_exists:
    bucket.create(location="us-central1")

# Store the JSON-L file in the bucket under its local file name
blob = bucket.blob(FILE_NAME)
blob.upload_from_filename(FILE_NAME)

# Show where the file was uploaded
print(BUCKET_URI)

## Create a BigQuery dataset and load the data into it

In [None]:
%%bash

# --force ignores the "already exists" error, so the cell can be re-run after
# the dataset has been created.
bq mk --force --dataset embeddings_ds

# --replace overwrites the table instead of appending to it, so re-running the
# cell does not load duplicate rows.
bq load --replace --source_format=NEWLINE_DELIMITED_JSON --autodetect embeddings_ds.embeddings_data  gs://vertex-assessment-dar/embeddings_for_bq.jsonl


In [None]:
# Run a BigQuery Query with Python
from google.cloud import bigquery

# Client created with default settings (no explicit project or credentials passed).
client = bigquery.Client()

# Fetch the first 100 characters of the cleaned text for the row whose id is 1.
query = """
SELECT left(cleaned_page, 100) FROM `embeddings_ds.embeddings_data` WHERE ID = 1;
"""

# Submit the query.
query_job = client.query(query)

# Iterate over the job to read the result rows and print each one.
for row in query_job:
    print(row)