In [1]:
import os
import openai

import logging
import sys
from pprint import pprint

In [2]:
# Send INFO-level logs to stdout so the HTTP requests issued while embedding and querying
# show up in the notebook output.
# basicConfig already installs a stdout handler on the root logger; attaching a second
# StreamHandler to the same stream made every record print twice (once formatted, once bare),
# so only the basicConfig call is kept.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    load_index_from_storage,
    StorageContext,
    ServiceContext,
    Document
)

from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceWindowNodeParser, HierarchicalNodeParser, get_leaf_nodes
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import MetadataMode
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import Settings


In [3]:
# Take the OpenAI API key from the environment rather than hardcoding it in the notebook;
# the subscript lookup raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
# load PDFs
# Reads the Sequential Brands Group docket directory into `documents`. The path is relative,
# so it resolves against the working directory the notebook is run from.
documents = SimpleDirectoryReader("../data/bankruptcy_dockets/documents/Sequential Brands Group").load_data()

In [5]:
# Splitter that turns the loaded documents into nodes (chunk_size 512, chunk_overlap 20).
sentence_node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)

In [6]:
# Run the splitter over the loaded documents; the resulting nodes are what gets indexed later.
nodes = sentence_node_parser.get_nodes_from_documents(documents)

In [7]:
# Register OpenAI's text-embedding-3-small as the global embedding model (embed_batch_size 100).
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", embed_batch_size=100)

In [8]:
# create index and embed nodes

In [9]:
# Build the vector index over the nodes. The log output below shows this issues
# requests to the OpenAI embeddings endpoint.
index = VectorStoreIndex(nodes)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.open

In [10]:
# store the index and embeddings
# Written to ../persist (relative to the working directory) so the next cell can reload it.
index.storage_context.persist(persist_dir="../persist")

In [11]:
# Load the stored index back from ../persist.
# Note: despite its name, `stored_index` is the StorageContext built from the persist
# directory; the index itself is `retrieved_index`, obtained from that context.
stored_index = StorageContext.from_defaults(persist_dir="../persist")
retrieved_index = load_index_from_storage(stored_index)

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


In [12]:
# NOTE(review): MetadataReplacementPostProcessor is already imported in the import cell near
# the top of the notebook and is not used in this cell; this re-import looks redundant.
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
# Default query engine over the reloaded index, retrieving the top 5 matches per query.
# NOTE(review): this cell runs before Settings.llm is assigned gpt-4o further down, so this
# engine presumably does not use that model — confirm, or build it after the LLM is set.
sentence_query_engine = retrieved_index.as_query_engine(
    similarity_top_k=5,
    verbose=True
)

In [13]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

In [14]:
# LLM used to generate the answers: OpenAI gpt-4o.
llm = OpenAI(model="gpt-4o")

In [15]:
# Make gpt-4o the global LLM on Settings, so components created after this point can use it
# without being passed an llm explicitly.
Settings.llm = llm

In [16]:
# Retrieval stage: fetch the 5 most similar nodes from the reloaded index.
retriever = VectorIndexRetriever(index=retrieved_index, similarity_top_k=5)

# Synthesis stage: answers are produced in "tree_summarize" response mode.
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# Combine both stages into the engine that the question cells query.
query_engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)

In [17]:
from question import return_question

In [18]:
# Fetch the variables to extract from the local `question` module. Later cells call
# .items() on the result and get (variable name, description) pairs, so this is
# presumably a dict of name -> description — confirm against question.py.
questions = return_question()

In [63]:
def generate_question(item):
    """Build the prompt asking the model for the value of one variable.

    Args:
        item: A two-element ``(variable name, description)`` pair, such as one
            entry of ``questions.items()``.

    Returns:
        The prompt text with the variable name and description interpolated.

    Raises:
        ValueError: If ``item`` does not unpack into exactly two elements.
    """
    name, details = item
    return f"""
    You are a helpful legal assistant.
    Given the following variable: {name}, which is described here: {details}, what is the value of the variable?
    """


In [64]:
# Display every (variable name, description) pair to see what can be asked.
list(questions.items())

[('NameCorp',
  'The name of the 10-K filing company. If the 10-K filing company did not file bankruptcy, add the name of the principal filing\nsubsidiary in parentheses, followed by “only.” If more than one 10-K filing\ncompany is administratively consolidated in the same bankruptcy case, add\nthe names of the additional 10-K filers in parentheses, not followed by\n“only.” If the company has filed bankruptcy more than once, add the year of\nthis bankruptcy filing in parentheses. Remove apostrophes from names.\nAccess will tolerate apostrophes, but the WebBRD will not. If a company\nname contains an apostrophe, the company’s date will not appear in View\ndata by company.'),
 ('AssetsPetition',
  'Total assets of the debtors, as indicated on “Exhibit A” to\nthe petition, in millions of dollars. The SEC requires that companies file\nExhibit A if they remain public as of filing. If the Exhibit A amount listed\ndiffers substantially from AssetsBefore, we investigate whether the amount\nlis

In [73]:
# Ask about the question at index 9; per the answer printed below this is the DateFiled variable.
# The log output shows one embeddings request followed by one chat-completions request.
response = query_engine.query(generate_question(list(questions.items())[9]))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [74]:
# Show the model's answer as plain text.
print(response)

The value of the variable DateFiled is August 31, 2021.


In [85]:
# The answer is also available as a string on the `response` attribute.
response.response

'The value of the variable DateFiled is August 31, 2021.'

In [75]:
# Introspection: list what the response object exposes. The output below shows
# `response`, `source_nodes`, `metadata` and `get_formatted_sources`.
dir(response)

['__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get_validators__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__pydantic_initialised__',
 '__pydantic_model__',
 '__pydantic_run_validation__',
 '__pydantic_validate_values__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__validate__',
 '__weakref__',
 'get_formatted_sources',
 'metadata',
 'response',
 'source_nodes']

In [88]:
# Inspect the text of the first source node attached to the answer.
response.source_nodes[0].text

'American LegalNet, Inc.  \nwww.FormsWorkFlow.com\nOfficial Form 201 Voluntary Petition for Non- Individuals Filing for Bankruptcy  page 3 Debtor Sequential Brands Group, Inc. Case number (if known )\nName \n9.Were prior bankruptcy cases\nfiled by or against the debtorwithin the last 8 years?\nIf more than 2 cases, attach a\ns\neparate list. No \n Yes. District  When  Case number  \nMM / DD / YYYY  \nDistrict  When  Case number  \nMM / DD / YYYY  \n10. Are any bankruptcy cases\npending or being filed by a\nbusiness partner or an\naffiliate of the debtor?\nList all cases. If more than 1,\nattach a separate list. No \nYes. Debtor  See Schedule 1  Relationship  Affiliates  \nDistrict  Delaware  When  08/31/2021 \nMM / DD / YYYY  \nCase number, if known  \n11. Why is the case filed in this\ndistrict ?Check all that apply:\n Debtor has had its domicile, principal place of business, or principal assets in this district for 180 days  \nimmediately preceding the date of this petition or for a 

In [81]:
# Print the text of the source node at index 4 — with similarity_top_k=5 this is
# presumably the last retrieved node.
print(response.source_nodes[4].text)

American LegalNet, Inc.  
www.FormsWorkFlow.com
Official Form 201 Voluntary Petition for Non- Individuals Filing for Bankruptcy  page 1 Fill in this information to identify the case:  
 Check if this is an 
amended filing  United States Bankruptcy Court for the:  
District of  Delaware  
(State) 
Case number (If known ): Chapter  11 
Official Form 201
Voluntary Petition for Non -Individuals Filing for Bankruptcy  04/20  
If more space  is needed, attach a separate sheet to this form. On the top of any additional pages, write the debtor ’s name and the case 
number (if known). For more information, a separate document, Instructions for Bankruptcy Forms for Non -Individuals, is available.  
1.Debtor ’s name Sequential Brands Group, Inc.  
2.All other names debtor used
in the last 8 years
Include any assumed names,
trade names, and doing business
as namesSinger  Madeline Holdings, Inc.  
3.Debtor ’s federal Employer
Identification Number (EIN)4 7 – 4 4 5 2 7 8 9 
4.Debtor ’s address Princ