# RAG with Postgres flexible server

#### IMPORTANT!! Embeddings Creation - Run this only once !!!
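Run the cells in this notebook only once: they create the embeddings and save them to the Postgres flexible server. Re-running the final insert cell appends the same rows to the `embeddings` table again.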

In [1]:
from dotenv import load_dotenv
import pandas as pd
from IPython.display import display, HTML, JSON, Markdown
import os

# Populate the process environment (python-dotenv), then read every setting
# from it. Each constant is None when the corresponding variable is not set.
load_dotenv()

# Azure OpenAI: deployment names, one per model
OPENAI_GPT35_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT35_DEPLOYMENT_NAME")
OPENAI_GPT4_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT4_DEPLOYMENT_NAME")
OPENAI_GPT4V_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT4V_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_DALLE_DEPLOYMENT_NAME = os.environ.get("OPENAI_DALLE_DEPLOYMENT_NAME")

# Azure OpenAI: endpoint and key
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Postgres connection settings
POSTGRES_USER = os.environ.get("POSTGRES_USER")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
POSTGRES_HOST = os.environ.get("POSTGRES_HOST")
POSTGRES_DB = os.environ.get("POSTGRES_DB")


In [2]:
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter

from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [3]:
# Embedding client used by the rest of the notebook.
# NOTE(review): the deployment name is passed as both `deployment` and
# `model` — confirm that is what the Azure resource expects.
embeddingmodel = AzureOpenAIEmbeddings(
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    chunk_size=1000,
)

In [4]:
# We use the tenacity library to add delays and retries around the OpenAI
# embeddings call so that throttling limits do not abort the run: randomized
# exponential back-off (bounds min=1, max=20) and at most 6 attempts.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text: str):
    """Return the embedding of ``text`` computed by ``embeddingmodel``.

    Args:
        text: The text to embed. Newlines are replaced with spaces before the
            request is sent.

    Returns:
        Whatever ``embeddingmodel.embed_query`` returns for the cleaned text
        (presumably a list of floats — confirm against the langchain docs).

    Raises:
        The error tenacity raises once all 6 attempts have failed (``reraise``
        is not set on the decorator, so by default this is a ``RetryError``
        wrapping the last failure).
    """
    # replace newlines, which can negatively affect performance.
    return embeddingmodel.embed_query(text.replace("\n", " "))

In [5]:
# Source document: a label for the document and the path of the PDF to load
documentName = "moby dick book"
fileName = "../data/moby dick.pdf"

# Chunks of up to 1000 characters with a 30-character overlap. No separators
# are passed, so the splitter's defaults apply (["\n\n", "\n", " ", ""]
# according to the original author's note).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=30, chunk_size=1000)

# Load the PDF and split it in one call; `pages` holds the resulting pieces
loader = PyPDFLoader(fileName)
pages = loader.load_and_split(text_splitter=splitter)
print("Number of pages: ", len(pages))

Number of pages:  1475


In [6]:
# Save all the pages into a pandas DataFrame.
# The frame is built with a single constructor call instead of appending one
# row at a time through df.loc[len(df.index)], which gets slower as the frame
# grows. The 'embedding' column starts as an empty-string placeholder.
df = pd.DataFrame(
    [(documentName, page.page_content, "") for page in pages],
    columns=['document_name', 'content', 'embedding'],
)
df.head()

Unnamed: 0,document_name,content,embedding
0,moby dick book,The Project Gutenberg eBook of Moby -Dick; or ...,
1,moby dick book,CONTENTS \n \nETYMOLOGY. \n \nEXTRACTS (Supp...,
2,moby dick book,CHAPTER 11. Nightgown. \n \nCHAPTER 12. Biogr...,
3,moby dick book,CHAPTER 41. Moby Dick. \n \nCHAPTER 42. The W...,
4,moby dick book,CHAPTER 67. Cutting In. \n \nCHAPTER 68. The ...,


In [None]:
# Embed each row's content with calc_embeddings (one call per row), store the
# result in the 'embedding' column, then write the whole frame to CSV
df["embedding"] = df["content"].apply(calc_embeddings)
df.to_csv('../data/pg_embeddings.csv', index=False)
print(df.head(2))

In [7]:
import psycopg2

sslmode = "require"

# Build the connection string with make_dsn rather than str.format: make_dsn
# quotes/escapes each value, so a password containing a space, quote or
# backslash no longer produces a malformed DSN. Settings that are None
# (environment variable not set) are left out instead of being sent as the
# literal text "None".
conn_string = psycopg2.extensions.make_dsn(
    host=POSTGRES_HOST,
    user=POSTGRES_USER,
    dbname=POSTGRES_DB,
    password=POSTGRES_PASSWORD,
    sslmode=sslmode,
)
conn = psycopg2.connect(conn_string)
print("Connection established")

# Shared cursor reused by the following cells
cursor = conn.cursor()

Connection established


In [8]:
# Install the vector extension in the DB (nothing is created when it is
# already present).
# psycopg2 opens a transaction implicitly on the first execute(), so the
# transaction is finished through conn.commit() / conn.rollback() instead of
# sending raw BEGIN / COMMIT / ROLLBACK statements on the cursor.
try:
    # Check if the extension already exists
    extension_query = "SELECT * FROM pg_extension WHERE extname = 'vector';"
    cursor.execute(extension_query)
    extension_exists = cursor.fetchone()

    if not extension_exists:
        # Extension does not exist, create it
        create_extension_query = "CREATE EXTENSION vector;"
        cursor.execute(create_extension_query)

    # Commit once, whether or not the extension had to be created
    conn.commit()
except Exception:
    # Roll back so the connection stays usable for later cells, then re-raise
    # with the original traceback
    conn.rollback()
    raise

In [9]:
# Print the value of the server's `azure.extensions` setting, one row per line.
# NOTE(review): this presumably lists the extensions allow-listed on the Azure
# server rather than those installed in this database — confirm. The
# pg_extension catalog is what records installed extensions.
show_extensions_query = "SHOW azure.extensions;"
cursor.execute(show_extensions_query)
# End the transaction opened by the execute above; the rows are read afterwards
conn.commit()
results = cursor.fetchall()
for row in results:
    print(row)

('VECTOR',)


In [None]:
from pgvector.psycopg2 import register_vector
from psycopg2 import Error
from psycopg2 import sql
from psycopg2.extras import execute_batch

# Register 'pgvector' type for the 'embedding' column
register_vector(conn)

# Convert the DataFrame to a list of tuples for bulk insertion
records = df.to_records(index=False)
records_list = records.tolist()

table_name = 'embeddings'
# Number of INSERT statements sent to the server per round trip
batch_size = 10

create_table_query = f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    document_name TEXT,
    content TEXT,
    embedding VECTOR
);
"""

insert_query = f"INSERT INTO {table_name} (document_name, content, embedding) " \
            f"VALUES (%s, %s, %s)"

try:
    cursor.execute(create_table_query)
    conn.commit()

    # execute_batch groups the statements into pages of `batch_size`, which
    # cuts the number of round trips compared with executemany (one per row)
    execute_batch(cursor, insert_query, records_list, page_size=batch_size)
    conn.commit()
except Error:
    # Undo the partial insert and leave the connection usable, then re-raise
    conn.rollback()
    raise
        