In [1]:
import pandas as pd
from llama_index import download_loader
import re
from llama_index import download_loader, VectorStoreIndex, ServiceContext, StorageContext
from llama_index.vector_stores import PGVectorStore
from os import environ
from dotenv import load_dotenv
from llama_index.llms.vertex import Vertex
import google.auth
from palm_multi import PaLMMultiEmbeddings
import psycopg2
from typing import List
from time import sleep

In [2]:
# as per recommendation from @freylis, compile once only
CLEANR = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy, so each hit ends at the first ``>`` that follows
    a ``<``. ``.`` does not match a newline with this pattern, so a tag that
    is split across several lines is left in the text unchanged.
    """
    return CLEANR.sub('', raw_html)

In [3]:
# Load settings from a local .env file (python-dotenv). Later cells read
# DB_PASSWORD and DB_HOST from the process environment, so this has to run
# before them.
load_dotenv()

True

In [4]:
# Fetch the MarkdownReader loader class from the llama_index hub, then create
# the single reader instance that the document-building loop reuses.
MarkdownReader = download_loader("MarkdownReader")
markdownreader = MarkdownReader()

In [5]:
# Build the document list: one CSV row -> one or more reader documents.
docs = []
# dropna() discards every row that has a missing value in any column.
df_pre = pd.read_csv("./uscis.csv").dropna()
for record in df_pre.itertuples(index=False):
    docs.extend(
        markdownreader.load_data(
            file=None,
            # Strip HTML tags before the text is handed to the markdown reader.
            content=cleanhtml(record.text),
            # Carry the page heading and source link along as document metadata.
            extra_info={
                "page": record.heading,
                "link": record.link,
            },
        )
    )

In [6]:
# Resolve Google default credentials and the associated project id.
# NOTE(review): neither name is referenced again in the visible cells;
# presumably the Vertex / PaLM clients pick up the same default credentials
# on their own -- confirm before removing this cell.
credentials, project_id = google.auth.default()

In [7]:
class DelayedPaLMEmbedding(PaLMMultiEmbeddings):
    """PaLM embedding model that pauses before every embedding request.

    Each overridden hook sleeps for ``_delay`` seconds and then defers to the
    ``PaLMMultiEmbeddings`` implementation. The pause is presumably there to
    stay under an API rate limit -- TODO confirm the actual quota.

    NOTE(review): the async hooks also use the blocking ``time.sleep``, so the
    event loop is stalled for the duration of each pause. That serialises
    concurrent calls; switching to a non-blocking sleep would change the
    throttling behaviour and should be a deliberate decision.
    """

    # Seconds to wait before each call into the parent implementation.
    _delay = 1

    def _get_query_embedding(self, query: str) -> List[float]:
        """Wait ``_delay`` seconds, then return the parent's embedding of ``query``."""
        sleep(self._delay)
        return super()._get_query_embedding(query)

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Wait ``_delay`` seconds, then await the parent's embedding of ``query``."""
        sleep(self._delay)
        return await super()._aget_query_embedding(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        """Wait ``_delay`` seconds, then return the parent's embedding of ``text``."""
        sleep(self._delay)
        return super()._get_text_embedding(text)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Wait ``_delay`` seconds, then await the parent's embedding of ``text``."""
        sleep(self._delay)
        # Await the parent coroutine so the caller receives the embedding
        # itself rather than an un-awaited coroutine object.
        return await super()._aget_text_embedding(text)

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Wait ``_delay`` seconds, then return the parent's embeddings of ``texts``."""
        sleep(self._delay)
        return super()._get_text_embeddings(texts)

    async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Wait ``_delay`` seconds, then await the parent's embeddings of ``texts``."""
        sleep(self._delay)
        return await super()._aget_text_embeddings(texts)


In [8]:
# Throttled embedding model used for indexing. embed_batch_size=5 is passed
# through to the base embedding class -- presumably the number of texts sent
# per embedding request (TODO confirm against PaLMMultiEmbeddings).
embed_model = DelayedPaLMEmbedding(embed_batch_size=5)

In [9]:
# Cloud SQL instance identifier and the Unix-socket path derived from it.
# NOTE(review): neither value is used by the connection below, which goes
# through DB_HOST / db_port instead. The project segment reads "ask-prita"
# while the instance reads "ask-priya" -- confirm this is intentional.
CLOUD_SQL_CONNECTION_NAME = "ask-prita:us-central1:ask-priya-store"
unix_socket = '/cloudsql/{}'.format(CLOUD_SQL_CONNECTION_NAME)

db_user = "postgres"
db_password = environ["DB_PASSWORD"]  # raises KeyError if the variable is unset
db_name = "askpriyadb4"
db_host = environ["DB_HOST"]  # raises KeyError if the variable is unset
db_port = 5432

# Build the DSN with make_dsn instead of raw string interpolation so that a
# password or host containing spaces, quotes or backslashes is escaped
# correctly rather than producing a malformed connection string.
# No database name is given here: this connection is only used to (re)create
# db_name, so it targets the server's default database for the user.
connection_string = psycopg2.extensions.make_dsn(
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port,
)
conn = psycopg2.connect(connection_string)

In [10]:
# Autocommit is switched on first, presumably because PostgreSQL refuses to
# run DROP/CREATE DATABASE inside a transaction block.
conn.autocommit = True

# WARNING: destructive. Any existing database named db_name is dropped and an
# empty one is created in its place every time this cell runs.
with conn.cursor() as cursor:
    for statement in (
        f"DROP DATABASE IF EXISTS {db_name}",
        f"CREATE DATABASE {db_name}",
    ):
        cursor.execute(statement)

In [11]:
# Vector store backed by the db_name Postgres database.
vector_store = PGVectorStore.from_params(
    # -- connection --
    host=db_host,
    port=db_port,
    user=db_user,
    password=db_password,
    database=db_name,
    # -- storage layout --
    table_name="uscis",
    # Presumably must equal the length of the vectors produced by the
    # embedding model -- TODO confirm.
    embed_dim=768,
)
# Storage context that routes index writes into the vector store above.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [12]:
# Vertex AI "chat-bison" model used as the LLM; temperature=0 requests the
# least random output.
llm = Vertex(model="chat-bison", temperature=0, additional_kwargs={})

In [13]:
# Bundle the throttled embedding model and the Vertex LLM so the index
# builder uses both.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

In [14]:
# Parse the documents into nodes, embed them, and write the result through
# storage_context. Long-running: in the recorded run the embedding passes
# took roughly 17 minutes in total. show_progress=True renders the progress
# bars.
index = VectorStoreIndex.from_documents(
    documents=docs,
    service_context=service_context,
    storage_context=storage_context,
    show_progress=True
)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 3538/3538 [00:05<00:00, 590.32it/s] 
Generating embeddings: 100%|██████████| 2048/2048 [08:07<00:00,  4.20it/s]
Generating embeddings: 100%|██████████| 2048/2048 [08:13<00:00,  4.15it/s]
Generating embeddings: 100%|██████████| 92/92 [00:22<00:00,  4.01it/s]
