Load the document

In [84]:
from docx import Document as DocxDocument
# Open the Word report with python-docx. The file name is a relative path,
# so it is resolved against the kernel's current working directory.
doc = DocxDocument("acme_bank_consolidate_performance_report.docx")

Extract and chunk paragraphs

In [85]:
# One chunk per paragraph: keep the text of every paragraph that is not
# empty or whitespace-only, in document order.
doc_chunks = [
    paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()
]

Connect to the database 
(The database has already been created for you; see create_lame_db.py.)

In [86]:
import psycopg2
def get_connection():
    """Open a new psycopg2 connection to the local ``lame_db`` database.

    Returns:
        A new, open psycopg2 connection. The caller is responsible for
        committing/rolling back and for closing it.

    Raises:
        Exception: whatever ``psycopg2.connect`` raised. The error is printed
        first and then re-raised unchanged, so callers see the real cause
        instead of an ``UnboundLocalError`` from returning an unassigned
        variable.
    """
    # NOTE(review): credentials are hard-coded for the local course database;
    # do not reuse this pattern against a shared or production server.
    try:
        return psycopg2.connect(
            dbname="lame_db",
            user="postgres",
            password="admin",
            host="localhost",
            port="5432",
        )
    except Exception as error:
        print(f"Error: {error}")
        raise



Store the chunks in the database

In [87]:
from psycopg2 import sql

# Insert every chunk into text_chunks as a single transaction: either all
# chunks are stored or none are, so a failed run can simply be repeated
# without leaving a partial (and later duplicated) load behind.
conn = None
try:
    conn = get_connection()
    with conn.cursor() as cursor:
        for chunk in doc_chunks:
            cursor.execute(
                sql.SQL("INSERT INTO text_chunks (text) VALUES (%s) RETURNING id"),
                [chunk],
            )
            pk = cursor.fetchone()[0]  # primary key of the row just inserted
    conn.commit()
except Exception as e:
    # Undo any inserts from this run before reporting the problem.
    if conn is not None:
        conn.rollback()
    print(f"Error inserting file with chunks: {e}")
finally:
    # Always release the connection; later cells open their own.
    if conn is not None:
        conn.close()

QUESTIONS re jupyter
1. Exceptions?
2. Connection?
3. output = pprint?
4. For codio - section limit?
5. When we move it does Theme color matter?

Retrieve Current Chunks from Database
Create embedding of each chunk
Store vector of embedding in pgvector (vector store)

In [88]:
from langchain_openai import OpenAIEmbeddings


# Get the embedding model first, so a configuration problem (for example a
# missing API key) surfaces before anything in the database is touched.
openai_embedding = OpenAIEmbeddings(model="text-embedding-3-small")

conn = get_connection()
try:
    # Retrieve the chunks that have not been embedded yet.
    with conn.cursor() as cursor:
        cursor.execute(
            sql.SQL("SELECT id, text FROM text_chunks WHERE is_vectorized = FALSE"),
        )
        current_chunks = cursor.fetchall()  # list of (id, text) tuples
    # End the read transaction so none stays open during the API calls below.
    conn.commit()

    # Generate the embedding for each chunk, keyed by the chunk's id.
    vector_dict = {}
    for chunk_id, chunk_text in current_chunks:
        embedding = openai_embedding.embed_query(chunk_text)
        # Convert the embedding values to plain floats (ensures compatibility
        # with storage formats).
        vector_dict[chunk_id] = [float(x) for x in embedding]

    # Add the vectors to the vector store. A chunk is flagged as vectorized
    # in the same transaction that stores its vector, so a failure can never
    # leave a chunk marked as done without a vector behind it.
    with conn.cursor() as cursor:
        for cid, vec in vector_dict.items():
            cursor.execute(
                sql.SQL("INSERT INTO mmr_vector (vector, text_chunk_id) VALUES (%s, %s)"),
                [vec, cid],
            )
            cursor.execute(
                "UPDATE text_chunks SET is_vectorized = TRUE WHERE id = %s",
                (cid,),
            )
    conn.commit()
except Exception:
    conn.rollback()
    raise
finally:
    conn.close()

Vectorize Incoming Query 

In [67]:

# The question to answer from the stored report chunks.
query = "What was Net Interest Margin (NIM)?"
# Alternative sample question; swap it in to try a different query.
#query = "How was fee income?"
# Embed the question with the same embedding model object that was used for
# the chunks, so the query vector and the stored vectors are comparable.
vectorized_query = openai_embedding.embed_query(query)

Find Similar Vectors
pgvector similarity search operators: 
<->:
Represents the Euclidean distance between two vectors, which is the "straight-line" distance between them in multi-dimensional space. 
<=>:
Calculates the cosine distance between two vectors, i.e. 1 minus their cosine similarity (so cosine similarity is `1 - (a <=> b)`). It is often preferred for high-dimensional data because it depends on the angle between vectors rather than their magnitude. 
<#>:
Computes the negative inner product of two vectors: each pair of corresponding elements is multiplied and summed, and the sign of the result is flipped, so smaller values mean a larger inner product. 

In [89]:
# Debug aid: uncomment to peek at the first few query-vector components.
#print(f"vectorized query: {vectorized_query[:5]}")
top_k = 3  # number of closest chunks to retrieve

# pgvector's <=> operator is cosine distance, so 1 - distance is the cosine
# similarity. (The <#> operator returns the *negative* inner product, which
# would make "1 - (vector <#> q)" equal to 1 + inner product, not a
# similarity score.)
conn = get_connection()
try:
    with conn.cursor() as cur:
        cur.execute(
            sql.SQL(
                """SELECT id, text_chunk_id, 1 - (vector <=> %s::VECTOR) AS similarity
                   FROM mmr_vector
                   ORDER BY similarity DESC
                   LIMIT %s"""
            ),
            [vectorized_query, top_k],
        )
        rows = cur.fetchall()
finally:
    # This cell only reads, so just release the connection.
    conn.close()

# Ids of the matching text chunks, best match first.
similar_chunk_ids = [row[1] for row in rows]
if not similar_chunk_ids:
    print("No results found.")


      

Get the text chunks of the closest matches

In [90]:
# Look up the text of each matching chunk, one id at a time so the
# best-match-first order of similar_chunk_ids is preserved.
similar_context = []
conn = get_connection()
try:
    with conn.cursor() as cur:
        for chunk_id in similar_chunk_ids:
            cur.execute(
                sql.SQL("""SELECT text FROM text_chunks where id = %s"""),
                [chunk_id],
            )
            row = cur.fetchone()  # at most one row is read for this chunk_id
            if row is None:
                # The vector points at a chunk that no longer exists; skip it
                # instead of failing on row[0].
                print(f"No text chunk found for id {chunk_id}; skipping.")
                continue
            similar_context.append(row[0])
finally:
    conn.close()



Submit the retrieved text chunks to the LLM together with the query to generate an answer

In [91]:
from openai import OpenAI
from pprint import pprint

# Uncomment to inspect the retrieved context before it is sent to the model:
# for sc in similar_context:
#     pprint(f"CONTEXT ITEM:{sc}")

# Instruction text for the model. The question and the retrieved chunks are
# interpolated directly into it.
prompt = f"""You are an assistant for question-answering tasks. Use only 
the following pieces of retrieved context to answer the 
question. Use 3 sentences maximum to keep your answer concise. Here's a query: 
{query} and here are similar queries of retrieved context: {similar_context}. Again,
only base your answer on the similar queries data within the similar context."""

# Conversation to send: the instructions go in the system message and the
# bare question in the user message.
chat_messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": query},
]

# Ask the chat completions endpoint for an answer.
client = OpenAI()
response = client.chat.completions.create(model="gpt-4", messages=chat_messages)

# Pull the text of the first choice, trim surrounding whitespace, and show it.
answer = response.choices[0].message.content.strip()
pprint(answer)

('The Net Interest Margin (NIM) is a measure of the difference between the '
 'interest income generated by banks or other financial institutions and the '
 'amount of interest paid out to their lenders, relative to the amount of '
 'their interest-earning assets. It is usually expressed as a percentage of '
 'what the financial institution earns on loans in relation to the total '
 'amount of these loans. In the provided context, the NIM was slightly above '
 'target at 3.96%, which suggests it was driven by favorable economic '
 'conditions and prudent financial management.')
