In [50]:
#pip install google-cloud-storage
#!pip install PyPDF2
#!pip install vertexai


In [51]:
from google.cloud import storage
from vertexai.language_models import TextEmbeddingModel
from google.cloud import aiplatform

import PyPDF2

import re
import os
import random
import json
import uuid

In [None]:
# --- Google Cloud project settings -------------------------------------------
project = "llmdemo-466101"
location = "us-east1"

# --- Local input / output files ----------------------------------------------
pdf_path = "ACL_cricket_rule.pdf"
embed_file_path = "cricket_embeddings.json"
# NOTE(review): "criket" looks like a typo for "cricket"; left untouched because
# renaming it would change the name of the file that gets written and uploaded.
sentence_file_path = "criket_sentences.json"

# --- Cloud Storage bucket and Vector Search index ----------------------------
bucket_name = "rag-vector-search-ai"
# NOTE(review): "Confulence" may be a typo for "Confluence" — confirm before renaming.
index_name = "Confulence-Embeddings"

In [53]:
def extract_sentences_from_pdf(pdf_path):
    """Read a PDF and return its text split into rough sentences.

    Each page's text is extracted once, pages are joined with a single space,
    and the combined text is split on the literal ``'. '`` separator.

    Args:
        pdf_path: Path of the PDF file to open in binary mode.

    Returns:
        list[str]: Whitespace-stripped, non-empty fragments in document order.
        Pages for which ``extract_text()`` returns ``None`` are skipped.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Extract once per page: the call re-parses the page every time.
            page_text = page.extract_text()
            if page_text is not None:
                page_texts.append(page_text + " ")

    # Single join instead of repeated string concatenation.
    text = "".join(page_texts)
    return [fragment.strip() for fragment in text.split('. ') if fragment.strip()]

In [54]:
# def generate_text_embeddings(sentences) -> list: 
#   aiplatform.init(project=project,location=location)
#   model = TextEmbeddingModel.from_pretrained("gemini-embedding-001")
#   embeddings = model.get_embeddings(sentences)
#   vectors = [embedding.values for embedding in embeddings]
#   return vectors


def generate_text_embeddings(sentences, batch_size=250) -> list:
    """Embed ``sentences`` with the "gemini-embedding-001" model, in batches.

    Args:
        sentences: Sequence of strings (must support ``len`` and slicing).
        batch_size: Maximum number of sentences sent per ``get_embeddings``
            call. Must be a positive integer.

    Returns:
        list: One vector (the ``.values`` of each returned embedding) per
        input sentence, in input order. Empty input yields ``[]`` without
        initialising the SDK or loading the model.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make the batching range empty and silently return
    # no embeddings at all; reject bad sizes up front instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if len(sentences) == 0:
        return []

    aiplatform.init(project=project, location=location)
    model = TextEmbeddingModel.from_pretrained("gemini-embedding-001")

    # NOTE(review): the number of texts accepted per request depends on the
    # model/region — confirm that 250 is accepted for this model.
    all_vectors = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        embeddings = model.get_embeddings(batch)
        all_vectors.extend(embedding.values for embedding in embeddings)

    return all_vectors

In [55]:
def generate_and_save_embeddings(pdf_path, sentence_file_path, embed_file_path):
    """Extract sentences from a PDF, embed them, and write two JSON-lines files.

    Sentences are cleaned (bullet characters removed, whitespace collapsed)
    *before* embedding so that the stored sentence is exactly the text that was
    embedded; fragments that are empty after cleaning are dropped.

    Each output line is one JSON object:
      * ``sentence_file_path``: ``{"id": ..., "sentence": ...}``
      * ``embed_file_path``:    ``{"id": ..., "embedding": ...}``
    The two files share the same freshly generated UUID4 ``id`` per sentence.

    Args:
        pdf_path: PDF to read, passed to ``extract_sentences_from_pdf``.
        sentence_file_path: Destination for the id/sentence records.
        embed_file_path: Destination for the id/embedding records.

    Raises:
        ValueError: If the number of embeddings returned does not match the
            number of sentences sent.

    If no non-empty sentence is found, nothing is written.
    """
    def clean_text(text):
        without_bullets = re.sub(r'\u2022', '', text)  # Remove bullet points
        return re.sub(r'\s+', ' ', without_bullets).strip()  # Collapse whitespace

    raw_sentences = extract_sentences_from_pdf(pdf_path)
    sentences = [cleaned for cleaned in map(clean_text, raw_sentences) if cleaned]
    if not sentences:
        return

    embeddings = generate_text_embeddings(sentences)
    # zip() would silently drop records on a mismatch; fail loudly instead.
    if len(embeddings) != len(sentences):
        raise ValueError(
            f"expected {len(sentences)} embeddings, got {len(embeddings)}"
        )

    with open(embed_file_path, 'w', encoding='utf-8') as embed_file, \
            open(sentence_file_path, 'w', encoding='utf-8') as sentence_file:
        for sentence, embedding in zip(sentences, embeddings):
            record_id = str(uuid.uuid4())  # shared key linking the two files

            sentence_file.write(json.dumps({"id": record_id, "sentence": sentence}) + '\n')
            embed_file.write(json.dumps({"id": record_id, "embedding": embedding}) + '\n')

In [56]:
def upload_file(bucket_name, file_path):
    """Upload a local file to a Cloud Storage bucket, creating the bucket if needed.

    Args:
        bucket_name: Name of the destination bucket.
        file_path: Local path of the file to upload; the same string is used
            as the object (blob) name inside the bucket.

    The bucket is looked up first and only created (in the module-level
    ``location``) when it does not exist, so repeated uploads to the same
    bucket do not fail on bucket creation.
    """
    storage_client = storage.Client()

    # lookup_bucket returns None for a missing bucket instead of raising.
    bucket = storage_client.lookup_bucket(bucket_name)
    if bucket is None:
        bucket = storage_client.create_bucket(bucket_name, location=location)

    blob = bucket.blob(file_path)
    blob.upload_from_filename(file_path)

In [57]:
def create_vector_index(bucket_name, index_name, dimensions=768, approximate_neighbors_count=10):
    """Create a Tree-AH index, create a public endpoint, and deploy the index to it.

    Args:
        bucket_name: Bucket whose root (``gs://<bucket_name>``) is passed as
            ``contents_delta_uri`` for the index.
        index_name: Display name used for both the index and the endpoint; a
            sanitised form of it is used as the deployed index ID.
        dimensions: Vector length the index is created with (default 768).
        approximate_neighbors_count: Passed through to index creation
            (default 10).

    Returns:
        The endpoint object returned by ``MatchingEngineIndexEndpoint.create``,
        after ``deploy_index`` has been called on it.
    """
    # NOTE(review): `dimensions` must equal the length of the stored embedding
    # vectors; the embedding model used in this notebook may emit vectors longer
    # than 768 by default — confirm before building the index.
    index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=index_name,
        contents_delta_uri="gs://" + bucket_name,
        dimensions=dimensions,
        approximate_neighbors_count=approximate_neighbors_count,
    )

    # Keep the endpoint in its own variable: the original reused one name for
    # both objects and therefore passed the endpoint itself as `index` below.
    endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=index_name,
        public_endpoint_enabled=True,
    )

    # Deployed index IDs must start with a letter and contain only letters,
    # digits and underscores, so a display name such as "My-Index" is not valid.
    deployed_index_id = re.sub(r'[^A-Za-z0-9_]', '_', index_name)
    if not deployed_index_id[:1].isalpha():
        deployed_index_id = "index_" + deployed_index_id

    endpoint.deploy_index(index=index, deployed_index_id=deployed_index_id)
    return endpoint

In [58]:
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials
import vertexai
from vertexai.generative_models import GenerativeModel

# Project and region for the Vertex AI session initialised below.
PROJECT_ID = "llmdemo-466101"
REGION = "us-east1"

# NOTE(review): absolute, machine-specific path to a service-account key file;
# consider supplying it through configuration or the environment instead.
api_key_path = "/Users/bhavanakajal/Documents/GitHub/GCPAI-Projects/keys/llmdemo-466101-3acdef328b4a.json"
credentials = Credentials.from_service_account_file(api_key_path)

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    credentials=credentials,
)

# Generative model handle for "gemini-2.5-flash".
model = GenerativeModel("gemini-2.5-flash")
# res=model.generate_content("What is LLM ?")
# print(res.text)

# ## generation config 
# from vertexai.generative_models import GenerationConfig

# generation_config=GenerationConfig(
#      temperature=0.9,
#      top_p=1.0,
#      top_k=32,
#      candidate_count=1,
#      #max_output_token=8192,
# )

# res=model.generate_content("Why do sunsets appear red and orange?",generation_config=generation_config)
# print(res.text)

In [64]:
#generate_and_save_embeddings(pdf_path,sentence_file_path,embed_file_path)


In [66]:
# `!export ...` runs in a short-lived subshell, so the variable never reaches the
# kernel process. Setting it on os.environ makes it visible to code (and client
# libraries) running in this kernel.
# NOTE(review): absolute, machine-specific key path — consider making it configurable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/bhavanakajal/Documents/GitHub/GCPAI-Projects/keys/llmdemo-466101-6d1afc48f1d3.json"


In [1]:
#upload_file(bucket_name,sentence_file_path)