In [None]:
from dotenv import load_dotenv
# Load key/value pairs from a local .env file (if one exists) into the process
# environment so that later os.environ lookups (DB_USER, DB_PASSWORD, DB_HOST)
# can resolve.
# NOTE(review): this cell carries no execution count (In [None]); it must be run
# before the database configuration cell on a fresh kernel.
load_dotenv()

In [1]:
from vertexai.language_models import TextEmbeddingModel


# Model handles keyed by model name, so repeated calls (one per text chunk)
# reuse a single handle instead of calling from_pretrained() every time.
_EMBEDDING_MODEL_CACHE = {}


def text_embedding(text: str = "What is life?",
                   model_name: str = "textembedding-gecko@003") -> list:
    """Embed a single piece of text with a Vertex AI text-embedding model.

    :param text: The text to embed.
    :param model_name: Identifier passed to ``TextEmbeddingModel.from_pretrained``.
        Defaults to the model this notebook was originally written against.
    :return: The embedding vector (``.values`` of the first returned embedding).
    """
    model = _EMBEDDING_MODEL_CACHE.get(model_name)
    if model is None:
        model = TextEmbeddingModel.from_pretrained(model_name)
        _EMBEDDING_MODEL_CACHE[model_name] = model

    # get_embeddings() takes a batch; we send one text and unwrap the one result.
    embeddings = model.get_embeddings([text])
    vector = embeddings[0].values
    print(f"Length of Embedding Vector: {len(vector)}")
    return vector


In [2]:
import psycopg2

import os  # needed for the os.environ lookups below; no other cell imports it

# Database connection settings.
# Credentials and host come from the environment; a missing variable raises
# KeyError here, so misconfiguration surfaces before any connection attempt.
db_user = os.environ['DB_USER']
db_pass = os.environ['DB_PASSWORD']
db_host = os.environ['DB_HOST']
db_name = 'postgres'
db_port = '5432'  # Default PostgreSQL port
table_name = 'embeddings_sample'  # Table where you want to store the embeddings

def save_to_alloydb(texts, embeddings):
    """Insert one (content, embedding) row into the embeddings table.

    Despite the plural parameter names, each call stores a single row: both
    arguments are passed straight through as the two bound values of one INSERT.

    :param texts: Value bound to the ``content`` column.
    :param embeddings: Value bound to the ``embedding`` column.
    :raises Exception: Any error raised by psycopg2 while connecting, executing
        or committing propagates to the caller; the connection is still closed.
    """
    conn = psycopg2.connect(dbname=db_name, user=db_user, password=db_pass, host=db_host, port=db_port)
    try:
        with conn.cursor() as cur:
            # table_name is a module-level constant, not user input; the row
            # values themselves go through driver-side parameter binding.
            cur.execute(f"INSERT INTO {table_name} (content, embedding) VALUES (%s, %s)", (texts, embeddings))
        conn.commit()
    finally:
        # This function is called once per chunk, so an unclosed connection
        # here would leak one server connection per stored row.
        conn.close()

    print("Embeddings stored in AlloyDB.")
    

In [3]:
from google.cloud import storage

def _split_into_chunks(text, chunk_size=1000, chunk_overlap=100):
    """Split ``text`` into windows of at most ``chunk_size`` characters.

    Each window starts ``chunk_size - chunk_overlap`` characters after the
    previous one, so neighbouring chunks share ``chunk_overlap`` characters.
    Empty text yields an empty list.

    :raises ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
        is not in ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    step = chunk_size - chunk_overlap
    # Slicing clamps at len(text), so the last window is simply shorter.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]


def read_files_from_gcs_bucket(bucket_name, prefix="posts", chunk_size=1000, chunk_overlap=100):
    """
    Embed and store the text of every object under a prefix in a GCS bucket.

    Each matching object is downloaded, decoded as UTF-8, split into
    overlapping chunks, and every chunk is embedded with ``text_embedding``
    and persisted with ``save_to_alloydb``.

    :param bucket_name: The name of the GCS bucket.
    :param prefix: Only objects whose name starts with this prefix are read.
    :param chunk_size: Maximum number of characters per chunk.
    :param chunk_overlap: Number of characters shared by consecutive chunks.
    :raises ValueError: If the chunking parameters are inconsistent.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    for blob in bucket.list_blobs(prefix=prefix):
        print(f"Reading file: {blob.name}")
        # download_as_bytes() replaces the deprecated download_as_string();
        # both return the raw object bytes.
        text = blob.download_as_bytes().decode('utf-8')

        for chunk in _split_into_chunks(text, chunk_size, chunk_overlap):
            embedding = text_embedding(chunk)
            save_to_alloydb(chunk, embedding)
        
        
# Run the ingestion against the demo bucket.
# Substitute the name of your own Google Cloud Storage bucket below.
bucket_name = 'rag-demo-bucket-oiajc'
read_files_from_gcs_bucket(bucket_name=bucket_name)


Reading file: posts/3-tips-to-improve-your-aws-cost-optimization-strategies.txt
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Reading file: posts/a-guide-to-cloud-cost-analytics-tools.txt
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in AlloyDB.
Length of Embedding Vector: 768
Embeddings stored in