In [1]:
from define_questions import questions

In [2]:
import os
import openai

import logging
import sys
from pprint import pprint

logging.basicConfig(stream=sys.stdout, level=logging.INFO)

from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    load_index_from_storage,
    StorageContext,
    ServiceContext,
    Document
)

from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceWindowNodeParser, HierarchicalNodeParser, get_leaf_nodes
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import MetadataMode
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import Settings
openai.api_key = os.environ['OPENAI_API_KEY']

from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

In [3]:
# List the docket folder names in the data directory.
# os.listdir returns entries in arbitrary order, so the result is sorted to keep
# positional lookups such as folders[0] stable between runs and machines.
data_dir = '../data/bankruptcy_dockets/documents'
folders = sorted(f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f)))

In [5]:
# Re-open the index that was persisted for the first docket folder.
# NOTE(review): the embedding model is only configured later, inside ask_question —
# confirm the loaded index ends up using the intended model.
persist_dir = f"../data/bankruptcy_dockets/output/persist_{folders[0]}"
stored_index = StorageContext.from_defaults(persist_dir=persist_dir)
retrieved_index = load_index_from_storage(stored_index)

INFO:llama_index.core.indices.loading:Loading all indices.


In [16]:
def ask_question(question_name, retrieved_index):
    """Answer one predefined question against an index and return the query response.

    Looks up ``question_name`` in the module-level ``questions`` mapping, wires a
    retriever, a ``tree_summarize`` response synthesizer and a query engine around
    ``retrieved_index``, and runs a single query whose prompt embeds the
    question's name and description.

    Side effect: ``Settings.embed_model`` and ``Settings.llm`` are reassigned on
    every call.

    Args:
        question_name: Key into ``questions``; the entry must provide
            ``'output_class'`` and ``'description'``.
        retrieved_index: Index handed to ``VectorIndexRetriever``.

    Returns:
        Whatever ``RetrieverQueryEngine.query`` returns for the generated prompt.
    """
    def build_prompt(varname, description):
        # The wording and whitespace of this prompt are sent to the model verbatim.
        return f"""
        You are a helpful legal assistant.
        Given the following variable: {varname}, which is described here: {description}, what is the value of the variable?
        """

    # Global llama_index settings: embedding model and a deterministic (temperature 0) LLM.
    Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large", embed_batch_size=100)
    Settings.llm = OpenAI(model="gpt-4o", temperature=0)

    engine = RetrieverQueryEngine(
        # NOTE(review): reranker_top_n / with_reranker do not look like documented
        # VectorIndexRetriever parameters — presumably ignored; confirm.
        retriever=VectorIndexRetriever(
            index=retrieved_index,
            similarity_top_k=10,
            reranker_top_n=5,
            with_reranker=True,
        ),
        # The synthesizer is asked to emit the question's structured output class.
        response_synthesizer=get_response_synthesizer(
            response_mode="tree_summarize",
            output_cls=questions[question_name]['output_class'],
        ),
        # Swap each retrieved node's text for the content stored under its "window" metadata key.
        node_postprocessors=[MetadataReplacementPostProcessor(target_metadata_key="window")],
    )

    prompt = build_prompt(question_name, questions[question_name]['description'])
    return engine.query(prompt)


In [17]:
# Ask the first six questions in definition order, keeping every response by
# question name and echoing it as it arrives.
response_dict = {}
for question in list(questions)[:6]:
    response = ask_question(question, retrieved_index)
    response_dict[question] = response
    print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
{"val": "Armstrong Flooring Inc (2022)", "confidence": "high"}
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
{"val": 517.0, "confidence": "high"}
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
{"val": "Michel S. Vermette", "confidence": "high"}
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
{"val": "Wilmington", "confidence": "high"}
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP

In [None]:
# Re-run a single question through ask_question: `query_engine` and
# `generate_question` only exist as locals inside that function, so referencing
# them at cell level fails on a fresh kernel.
response = ask_question('AssetsPetition', retrieved_index)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [9]:
# Inspect the type of the structured payload carried by the query response.
type(response.response)

define_questions.AssetsPetition

In [10]:
# Read the extracted value from the structured payload.
response.response.val

517.0

In [15]:
from define_rag_function import ask_question, run_rag, questions

In [2]:
import os

In [3]:
# List the docket folder names in the data directory.
# os.listdir returns entries in arbitrary order, so the result is sorted to keep
# the processing order reproducible between runs and machines.
data_dir = '../data/bankruptcy_dockets/documents'
folders = sorted(f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f)))

In [4]:
import pandas as pd
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

In [5]:
# Set the global llama_index embedding model.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large", embed_batch_size=100)

In [17]:
def retrieve_lopucki_data(folder):
    """Return the LoPucki values for one docket folder as a dictionary.

    The LoPucki CSV is read, the first row whose ``NameCorp`` equals
    ``mapping[folder]`` is located, and the columns named by the keys of
    ``questions`` are returned as ``{column: value}``.

    NOTE(review): ``mapping`` is not defined in the visible cells — presumably a
    folder-name -> ``NameCorp`` dict created elsewhere; confirm.

    Args:
        folder: Name of a docket folder; must be a key of ``mapping``.

    Returns:
        dict: One entry per key of ``questions`` for the first matching row.

    Raises:
        KeyError: If ``folder`` is not in ``mapping``, or a required column is
            missing from the CSV.
        IndexError: If no CSV row has the mapped ``NameCorp``.
    """
    # Resolve the corporate name first so an unknown folder fails before any file I/O.
    row_name = mapping[folder]

    # Load the LoPucki data.
    lopucki = pd.read_csv('../code/lopucki-db-just-what-we-have.csv')

    # Filter on the full frame, then keep only the question columns, so the
    # lookup still works when 'NameCorp' is not itself one of the question keys.
    matches = lopucki.loc[lopucki['NameCorp'] == row_name, list(questions.keys())]
    if matches.empty:
        # Same exception type the bare `[0]` lookup raised, with an actionable message.
        raise IndexError(
            f"no LoPucki row with NameCorp == {row_name!r} (folder {folder!r})"
        )

    # Return the first matching row as a dictionary.
    return matches.to_dict(orient='records')[0]

In [18]:
def produce_row(folder):
    """Build one combined record for ``folder``.

    Fetches the LoPucki values via ``retrieve_lopucki_data`` and the RAG output
    via ``run_rag`` (in that order) and merges them into a single dictionary.

    Args:
        folder: Docket folder name passed unchanged to both helpers.

    Returns:
        dict: LoPucki entries overlaid with the RAG entries.
    """
    ground_truth = retrieve_lopucki_data(folder)
    extracted = run_rag(folder)

    # Later unpacking wins on duplicate keys, so RAG values replace LoPucki
    # values that share a name.
    # NOTE(review): if both dicts are keyed by question name this hides the
    # ground truth — confirm the key sets are meant to overlap.
    return {**ground_truth, **extracted}

In [None]:
# save results to a csv
# NOTE(review): `results` is not defined in the visible cells (presumably a list of
# produce_row(...) dicts), and no CSV is written here — confirm against the rest of the notebook.
results_df = pd.DataFrame(results)