In [39]:
import os
import openai

import logging
import sys
from pprint import pprint

# Route INFO-level logs to stdout so they show up in the notebook output.
# basicConfig already attaches a stdout StreamHandler to the root logger;
# adding a second StreamHandler on top of it prints every record twice
# (once formatted, once bare), so only basicConfig is used here.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    load_index_from_storage,
    StorageContext,
    ServiceContext,
    Document
)

from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceWindowNodeParser, HierarchicalNodeParser, get_leaf_nodes
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import MetadataMode
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import Settings
openai.api_key = os.environ['OPENAI_API_KEY']

from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

In [40]:
import pandas as pd

In [41]:
# List the sub-folder names in the data directory (one folder per docket).
# os.listdir returns entries in arbitrary order, so sort them to keep the
# row order of the final results reproducible across runs and machines.
data_dir = '../data/bankruptcy_dockets/documents'
folders = sorted(
    f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
)

In [42]:
from docnametorowname import mapping
from question import questions

In [46]:
def retrieve_lopucki_data(folder):
    """Return the LoPucki reference record that corresponds to one docket folder.

    The folder name is translated to a company name through ``mapping`` and
    looked up in the ``NameCorp`` column of the LoPucki CSV. Only the columns
    named by the keys of ``questions`` are returned.

    Args:
        folder: Folder name; must be a key of ``mapping``.

    Returns:
        dict mapping each key of ``questions`` to the value in the first
        matching CSV row.

    Raises:
        KeyError: If ``folder`` is not in ``mapping`` or a question key is not
            a column of the CSV.
        IndexError: If no CSV row has ``NameCorp`` equal to the mapped name.
    """
    # load lopucki data
    lopucki = pd.read_csv('../code/lopucki-db-just-what-we-have.csv')
    # map folder name to row name
    row_name = mapping[folder]
    # Filter on NameCorp *before* projecting to the question columns so the
    # lookup does not depend on 'NameCorp' itself being one of the questions.
    matches = lopucki.loc[lopucki['NameCorp'] == row_name, list(questions.keys())]
    if matches.empty:
        # Same exception type the bare [0] lookup used to raise, but with context.
        raise IndexError(
            f"no LoPucki row with NameCorp == {row_name!r} (folder {folder!r})"
        )
    # return the first matching row as a dictionary
    return matches.to_dict(orient='records')[0]

def run_rag_system(folder):
    """Ask every entry of ``questions`` against one folder's documents via RAG.

    On the first run for a folder the documents are loaded, split into
    sentence chunks, embedded, indexed, and the index is persisted under
    ``../data/bankruptcy_dockets/output/persist_{folder}``. Later runs reload
    the persisted index instead of paying for the embeddings again.
    Each question is then answered by a retriever query engine
    (top-5 similarity retrieval, "compact" response synthesis).

    Side effects: sets ``Settings.embed_model`` and ``Settings.llm`` globally
    and may create/write the persist directory.

    Args:
        folder: Name of a sub-directory of
            ``../data/bankruptcy_dockets/documents``.

    Returns:
        dict with, for every question variable ``v``:
        ``v_response`` (the object returned by ``query_engine.query``),
        ``v_RAG`` (its ``.response`` attribute) and
        ``v_source_{i}`` (``.text`` of the i-th source node).
    """
    documents_dir = f"../data/bankruptcy_dockets/documents/{folder}"
    persist_dir = f"../data/bankruptcy_dockets/output/persist_{folder}"

    # Configure the models up front so that both index construction and
    # querying (including an index reloaded from disk) use the same settings.
    Settings.embed_model = OpenAIEmbedding(
        model="text-embedding-3-large", embed_batch_size=100
    )
    Settings.llm = OpenAI(model="gpt-4o")

    # NOTE(review): "docstore.json" is assumed to be the default file name
    # written by storage_context.persist() — confirm for the installed version.
    if os.path.isfile(os.path.join(persist_dir, "docstore.json")):
        # Reuse the stored index; no embedding calls are made here.
        # The cache is not invalidated when the source documents change:
        # delete the persist directory to force a rebuild.
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        index = load_index_from_storage(storage_context)
    else:
        # load PDFs
        documents = SimpleDirectoryReader(documents_dir).load_data()
        # split into nodes
        splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
        nodes = splitter.get_nodes_from_documents(documents)
        index = VectorStoreIndex(nodes)
        # store the index and embeddings so the next run can skip this branch
        os.makedirs(persist_dir, exist_ok=True)
        index.storage_context.persist(persist_dir=persist_dir)

    # configure retriever
    retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=5,
    )

    # configure response synthesizer
    response_synthesizer = get_response_synthesizer(
        response_mode="compact",
    )

    # assemble query engine
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
    )

    def generate_question(varname, description):
        # Build the prompt sent to the model for a single variable.
        question_prompt = f"""
        You are a helpful legal assistant.
        Given the following variable: {varname}, which is described here: {description}, what is the value of the variable?
        """
        return question_prompt

    response_dict = {}
    for varname, description in questions.items():
        response = query_engine.query(generate_question(varname, description))
        response_dict[varname + '_response'] = response
        response_dict[varname + '_RAG'] = response.response
        for i, source_node in enumerate(response.source_nodes):
            response_dict[varname + f'_source_{i}'] = source_node.text

    return response_dict


In [47]:
def produce_row(folder):
    """Build one result row for ``folder``.

    The row starts from the LoPucki reference values and is then extended
    with the RAG outputs; on a key collision the RAG value wins.
    """
    # reference values first, so they are fetched before the RAG system runs
    row = dict(retrieve_lopucki_data(folder))
    # then layer the RAG answers and sources on top
    row.update(run_rag_system(folder))
    return row

In [48]:
# one combined row (reference data + RAG answers) per docket folder
results = list(map(produce_row, folders))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP 

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************Nwpy. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
# gather the per-folder row dicts into one table (written to CSV further below)
results_df = pd.DataFrame(data=results)

In [None]:
# alphabetical column order
columns_sorted = sorted(results_df.columns)

In [None]:
# reorder the table's columns alphabetically
results_df = results_df.loc[:, columns_sorted]

In [None]:
# write the column-sorted table to disk, leaving out the row index
results_df.to_csv(
    '../data/bankruptcy_dockets/output/results_compact.csv',
    index=False,
)

# Question-answering prompt template with {context_str} and {query_str}
# placeholders. The template deliberately ends with "Answer: " (trailing
# space, no newline).
DEFAULT_TEXT_QA_PROMPT_TMPL = """\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: """

# Refinement prompt template with {query_str}, {existing_answer} and
# {context_msg} placeholders. The template deliberately ends with
# "Refined Answer: " (trailing space, no newline).
DEFAULT_REFINE_PROMPT_TMPL = """\
The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: """