In [5]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from tqdm.autonotebook import tqdm
from tqdm.auto import tqdm  # this is our progress bar
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment before reading the keys below.
load_dotenv()
# OpenAI key and embeddings client. os.getenv returns None when the variable is not set.
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
model_name = "text-embedding-ada-002"
# NOTE(review): `embed` is not referenced again in the visible notebook; later cells build their own `embeddings` client.
embed = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)
# Set up the Pinecone vector database
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
PINECONE_ENV = "us-east-1-aws"
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV  # next to api key in console
)
# docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name) #this uploads embeddings and other data to Pinecone
# Name of the index used by every upload and query cell below, and a client handle for direct queries.
index_name = "doc-start"
index = pinecone.Index(index_name)

In [6]:
def meta_data_extractor(file_name):
    """
    Derive case metadata from the location of an opinion file.

    The path is split on '/' and read from the right: the last component is the
    file, the one before it is the year directory, and the one before that is the
    district directory (for example '.../District_12/2005/2005-Ohio-5048.pdf').

    Parameters:
    file_name (str): '/'-separated path to the opinion file.

    Returns:
    tuple: (case_no, district, year, url). case_no is the file name up to its first
    '.', district is the text after the last '_' of the district directory name,
    year is an int, and url is built from the other three values.
    """
    path_parts = file_name.split('/')
    # Text before the first dot of the file name (the whole name when it has no dot).
    case_no = path_parts[-1].partition('.')[0]
    # Text after the last underscore of the district directory (the whole name when it has none).
    district = path_parts[-3].rpartition('_')[2]
    year = int(path_parts[-2])
    url = f'https://www.supremecourt.ohio.gov/rod/docs/pdf/{district}/{year}/{case_no}.pdf'
    return case_no, district, year, url

SELECTING RANDOM OPINION FILES FOR TESTING


In [8]:
import os
import random

def select_random_files_from_year_subdirs(root_directory, num_files=100):
    """
    Selects up to a specified number of random files from the year subdirectories of each
    district directory within the given root directory. District directories are expected
    to be named 'District_1' through 'District_12'; year subdirectories must have
    all-digit names. Anything else (missing districts, non-directory entries, non-numeric
    subdirectory names, nested directories inside a year) is ignored.

    Parameters:
    root_directory (str): Path to the root directory.
    num_files (int): Maximum number of files to select. Defaults to 100.

    Returns:
    list: Full paths of the selected files, in random order. Contains fewer than
    num_files entries when fewer files exist, and is empty when none are found.

    Raises:
    ValueError: If num_files is negative (raised by random.sample).
    """
    all_files = []

    for district in (f'District_{i}' for i in range(1, 13)):
        district_path = os.path.join(root_directory, district)
        # isdir (not exists): a stray regular file named like a district must not reach os.listdir.
        if not os.path.isdir(district_path):
            continue
        # os.listdir order is arbitrary; sorting keeps the candidate list stable so that
        # seeding `random` makes the selection reproducible.
        for year in sorted(os.listdir(district_path)):
            year_path = os.path.join(district_path, year)
            if not (year.isdigit() and os.path.isdir(year_path)):
                continue
            for file in sorted(os.listdir(year_path)):
                file_path = os.path.join(year_path, file)
                if os.path.isfile(file_path):
                    all_files.append(file_path)

    # Never ask random.sample for more items than are available.
    return random.sample(all_files, min(len(all_files), num_files))

# Example usage
# NOTE: absolute path on the author's machine; point this at your own 'ohio_case_scrape' directory.
root_directory = '/Users/deantaylor/ohio_case_scrape'
# No num_files argument is passed, so the function's default sample size applies.
file_lst = select_random_files_from_year_subdirs(root_directory)
# print(file_lst)

In [9]:
# Bulk upload: for every file in `file_lst`, load the PDF, split it into chunks, tag each chunk
# with the case metadata and push the chunks (text + metadata + vectors) to the Pinecone index.

# The splitter and the embeddings client do not depend on the file, so build them once.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=2000,
    chunk_overlap=200,
)  # chunking is required for loading into the embeddings model because of its limited context window
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)  # client object only; does not hold any data itself

for file_name in tqdm(file_lst, desc="Uploading opinions"):
    case_no, district, year, url = meta_data_extractor(file_name)
    # file_name = '/Users/deantaylor/ohio_case_scrape/District_12/2005/2005-Ohio-5048.pdf'
    loader = PyPDFLoader(file_name)
    data = loader.load()
    texts = text_splitter.split_documents(data)
    for t in texts:
        # Clean up the text: replace the \n characters with spaces.
        t.page_content = t.page_content.replace('\n', ' ')
        t.metadata["case_no"] = case_no
        t.metadata["district"] = district
        t.metadata["year"] = year  # already an int as returned by meta_data_extractor
        t.metadata["url"] = url
        # The reference to the source file on the local system is not needed in the DB;
        # the default avoids a KeyError if a chunk has no 'source' key.
        t.metadata.pop('source', None)
    # `test_cone` ends up bound to the store returned for the last file; the QA cell below uses it.
    test_cone = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [None]:
# file_name = '/Users/deantaylor/ohio_case_scrape/District_12/2005/2005-Ohio-5048.pdf'
# loader = PyPDFLoader(file_name)
# data = loader.load()
# text_splitter = CharacterTextSplitter(
# separator = "\n",
# chunk_size = 2000,
# chunk_overlap  = 200,
# )
# texts = text_splitter.split_documents(data)
# print(f'total length of texts is {len(texts)}')
# texts

In [None]:
# case_no, district, year, url = meta_data_extractor(file_name)
# print(f'case no: {case_no}')
# print(F'appellate district: {district}')
# print(f'year: {year}')
# print(f'url: {url}')

In [None]:
# for t in texts:
#     t.metadata["case_no"] = case_no
#     t.metadata["district"] = district
#     t.metadata["year"] = year
#     t.metadata["url"] = url
#     t.metadata.pop('source')

In [None]:
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY) #creates the object to make the embeddings, does not hold any data itself
# test_cone = Pinecone.from_documents(texts, embeddings, index_name=index_name)

THE CODE ABOVE IS WORKING: IT WRITES THE VECTORS, ALONG WITH THE MODIFIED METADATA, TO THE PINECONE DB


CODE BELOW WORKING TO ADD DOCUMENTS IN BULK WITH META DATA


In [None]:
# for file_name in file_lst:
#     loader = PyPDFLoader(file_name)
#     data = loader.load()
#     text_splitter = CharacterTextSplitter(
#     separator = "\n",
#     chunk_size = 2000,
#     chunk_overlap  = 200,
#     )
#     texts = text_splitter.split_documents(data)
#     print(f'file_name: {file_name}, texts length: {len(texts)}')
    

SEARCHING THE DB FOR INFORMATION


In [None]:
# Imports for the question-answering step.
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
# Chat model created with temperature=0; the QA chain is requested with chain_type "stuff".
llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")
query = "What is the name of the defendant in case number 2005-Ohio-287?"
# `test_cone` is the vector store assigned in the bulk-upload loop, so that cell must have run first.
docs = test_cone.similarity_search(query)
# Run the chain with the retrieved chunks as input documents; as the last expression, the result is the cell output.
chain.run(input_documents=docs, question=query)

QUERY WITHOUT FILTERING ON META DATA


In [None]:
import random
# A batch holding one random 1536-dimensional vector (the dimension of text-embedding-ada-002
# vectors), used only to pull arbitrary records back from the index.
queries = [[random.random() for i in range(1536)]]
# Pass `queries` by keyword, like the filtered query cells below; passed positionally it
# would bind to the client's first parameter instead of `queries`.
index.query(queries=queries, top_k=10, include_metadata=True)

NOW WE WILL QUERY THE DB WITH REFERENCE TO META DATA


THIS FILTERS FOR CASES NOT IN DISTRICT 12


In [None]:
# Same random `queries` batch as the unfiltered query cell (which must have run first), restricted by
# a metadata filter: "$ne" keeps records whose "district" differs from "12". The value is a string
# because the district is stored as the text taken from the directory name.
index.query(
    queries=queries, 
    top_k=10, 
    include_metadata=True,
    filter={"district": {"$ne":"12"}})


GETTING ALL ITEMS IN DB FOR A SPECIFIC CASE


In [None]:
# Restrict the random-vector query to records whose "case_no" metadata equals the value below.
# NOTE: top_k=10 caps the result at ten matches, so a case with more than ten chunks is not returned in full.
# NOTE: this rebinds the global `case_no` that other cells also use.
case_no = "2009-Ohio-6689"
index.query(
    queries=queries, 
    top_k=10, 
    include_metadata=True,
    filter={"case_no": {"$eq":case_no}})

EXTRA CODE


In [None]:
# Inspect the first chunk. Only the last expression of a cell is displayed, so this shows the
# metadata; the page_content line on its own produces no output here.
texts[0].page_content
texts[0].metadata

In [None]:
# Display the text of the first chunk.
texts[0].page_content

In [None]:
# Generate embeddings and prepare metadata for a direct (non-LangChain) upsert.
import uuid

# OpenAIEmbeddings exposes embed_documents (batch) and embed_query (single string); there is no
# `embed` method. One batched call returns one vector per chunk, in the same order as `texts`.
chunk_vectors = embeddings.embed_documents([t.page_content for t in texts])

data_to_upload = []
for t, embedding in zip(texts, chunk_vectors):
    # Start from the chunk's own metadata (url, case_no, ... when the chunk has been tagged).
    metadata = dict(t.metadata)
    # LangChain's Pinecone wrapper looks for the chunk text under the "text" metadata key by
    # default, so store it there to keep these records searchable through that wrapper.
    metadata['text'] = t.page_content
    # Each record needs a unique string id; a random UUID is generated per chunk.
    data_to_upload.append((str(uuid.uuid4()), embedding, metadata))

In [None]:
import uuid

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)  # creates the object to make the embeddings, does not hold any data itself
# index.upsert expects records of (id, vector, metadata), not raw text, so embed the chunk text
# first and pair every vector with a unique id and the chunk's metadata. The chunk text is
# stored under "text", the key LangChain's Pinecone wrapper reads by default.
chunk_vectors = embeddings.embed_documents([t.page_content for t in texts])
upsert_response = index.upsert(
    vectors=[
        (str(uuid.uuid4()), vector, {**t.metadata, "text": t.page_content})
        for t, vector in zip(texts, chunk_vectors)
    ]
)





# docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name) #this uploads embeddings and other data to Pinecone 

In [None]:
# WARNING: destructive. Deletes the Pinecone index named by `index_name`. Cells below that upload to or
# query that index are expected to fail afterwards unless it is recreated - confirm before running.
pinecone.delete_index(index_name)

GRAB TEN RANDOM FILES FROM A DIRECTORY


In [None]:
import os
import random

def select_random_files(directory, num_files=10):
    """
    Pick random files from a single directory and return their full paths.

    Only regular files directly inside the directory are candidates; subdirectories
    are not descended into.

    Parameters:
    directory (str): Path to the directory.
    num_files (int): Number of files to select. Defaults to 10.

    Returns:
    list: Full paths of the selected files; all of them when the directory holds
    fewer than num_files files.
    """
    candidates = []
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        if os.path.isfile(full_path):
            candidates.append(full_path)

    # Cap the sample size at the number of files that actually exist.
    sample_size = min(len(candidates), num_files)
    return random.sample(candidates, sample_size)

# NOTE: relative path, resolved against the current working directory. This also replaces any
# `file_lst` built by an earlier cell.
file_lst = select_random_files("District_8/2010")
# file_lst

In [None]:

    # print(url)

In [None]:
def embedder(file_name):
    """
    Load one opinion PDF, split it into chunks, tag every chunk with the case metadata
    and upload the chunks to the Pinecone index named by the global `index_name`.

    The metadata (case number, district, year, url) comes from `meta_data_extractor`
    and is stored with every chunk, matching what the bulk-upload loop stores. The
    loader's 'source' entry (the local file path) is dropped before upload.

    Parameters:
    file_name (str): Path to the PDF, laid out as meta_data_extractor expects.

    Returns:
    None. When no chunks are produced for the PDF, a message is printed and nothing is uploaded.
    """
    case_no, district, year, url = meta_data_extractor(file_name)
    print(f'case no: {case_no}')
    print(F'appellate district: {district}')
    print(f'year: {year}')
    print(f'url: {url}')
    loader = PyPDFLoader(file_name)
    data = loader.load()
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2000,
        chunk_overlap=200,
    )
    texts = text_splitter.split_documents(data)
    if not texts:
        # Nothing to show or upload; avoids an IndexError on texts[0] below.
        print(f'no text chunks produced for {file_name}; nothing uploaded')
        return
    for t in texts:
        t.page_content = t.page_content.replace("\n", " ")
        t.metadata["url"] = url
        t.metadata["case_no"] = case_no
        t.metadata["district"] = district
        t.metadata["year"] = year
        # Remove the source key from t.metadata; the default avoids a KeyError if it is absent.
        t.metadata.pop('source', None)
    print(texts[0])
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)  # creates the object to make the embeddings, does not hold any data itself
    # from_documents (rather than from_texts on page_content only) uploads the metadata with each chunk.
    Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [None]:
# Upload one specific opinion. NOTE: absolute path on the author's machine.
embedder('/Users/deantaylor/ohio_case_scrape/District_8/2005/2005-Ohio-24.pdf')

In [None]:
# Attach to the index that already exists under `index_name` instead of uploading anything.
# Relies on a global `embeddings` created by another cell; the one inside `embedder` is local to it.
docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [None]:
# Run `embedder` on every path in `file_lst`, one at a time, printing a line after each call returns.
for file_name in file_lst:
    embedder(file_name)
    print(f'uploaded {file_name} to pinecone')

In [None]:
# Display the current list of selected file paths.
file_lst

In [None]:
# loader = PyPDFLoader(file_name)
# data = loader.load()
# Note: If you're using PyPDFLoader then it will split by page for you already
# print (f'You have {len(data)} document(s) in your data')
# print (f'There are {len(data[0].page_content)} characters in your sample document')
# print (f'Here is a sample: {data[0].page_content[:200]}')

In [None]:
file_name = "/Users/deantaylor/ohio_case_scrape/District_2/2010/2010-Ohio-3652.pdf"
# Reuse the shared parser instead of repeating its string splitting here. This also makes `year`
# an int, the same type the bulk upload stores, so metadata filters on "year" behave consistently.
# The district is the number at the end of the 'District_N' directory name.
case_no, district, year, url = meta_data_extractor(file_name)
print(f'case no: {case_no}')
print(F'appellate district: {district}')
print(f'year: {year}')
# URL format is https://www.supremecourt.ohio.gov/rod/docs/pdf/1/2023/2023-Ohio-4551.pdf
print(url)

In [None]:
# Load the PDF named by `file_name` in this cell, so the chunks always come from the same file
# as the case metadata computed for it, rather than from whatever `data` an earlier cell left behind.
loader = PyPDFLoader(file_name)
data = loader.load()

# Split on the "{¶" marker (presumably the numbered-paragraph marker used in the opinions - confirm),
# targeting chunks of 1000 characters with a 200 character overlap.
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
text_splitter = CharacterTextSplitter(
    separator = "{¶",
    chunk_size = 1000,
    chunk_overlap  = 200,
)

texts = text_splitter.split_documents(data)

In [None]:
# Go through every chunk in `texts` and replace each "\n" character in t.page_content with a space.
# The chunks are modified in place.
for t in texts:
    t.page_content = t.page_content.replace("\n", " ")
# Display the first chunk to check the result.
texts[:1]

In [None]:
# Trial run on the first chunk only: attach the case metadata computed for `file_name`.
test_text = texts[0]
# print(test_text)
# print(test_text.page_content)
# print(test_text.metadata)
test_text.metadata["url"] = url
test_text.metadata["case_no"] = case_no
test_text.metadata["district"] = district
test_text.metadata["year"] = year
# Remove the source key from test_text.metadata. The default makes the cell safe to re-run:
# on a second run the key is already gone and a bare pop would raise KeyError.
test_text.metadata.pop('source', None)
test_text

In [None]:


# # print(test_text.metadata)
# # # Assuming 'doc' is an instance of langchain_core.metadatas.base.Document
# page_content = test_text.page_content
# metadata = test_text.metadata
# print(f'metadata from the document is {metadata}')
# metadata
# # print(test_text["metadata"])
# url = "https://wwww.courtlistener.com"
# district = "District 12"
# case_no = "2005-Ohio-327"

# for test_text.metadata in test_text:
#     print(metadata)

    # metadata["district"] = district
    # metadata["case_no"] = case_no

In [None]:
# Display the first chunk again to check its metadata.
test_text

In [None]:
# New key-value pairs to add
url = "http://example.com/case/2005-Ohio-327"
district = "Twelfth Appellate District"
case_no = "2005-Ohio-327"


# Replace newlines with spaces in every chunk. The loop variable is the one being modified,
# so each chunk is cleaned rather than a leftover `t` from an earlier cell.
for text in texts:
    text.page_content = text.page_content.replace("\n", " ")


# Adding the key-value pairs to the metadata. Documents expose their metadata dict as the
# `.metadata` attribute, as the other cells use it; they are not subscriptable.
# NOTE(review): this tags the page-level documents in `data`; chunks already split into `texts`
# presumably do not pick these values up - confirm, or tag `texts` instead.
for document in data:
    document.metadata["url"] = url
    document.metadata["district"] = district
    document.metadata["case_no"] = case_no

In [None]:
# Report how many chunks `texts` currently holds.
print (f'Now you have {len(texts)} documents')

In [None]:
# Display the first chunk (as a one-element list).
texts[:1]

In [None]:
# Global embeddings client used by the upload and search cells below.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY) #creates the object to make the embeddings, does not hold any data itself

In [None]:
# metademata = {"url": "opinionurl.com", "case_name": "Case Name", "year": "2005", "court": "Ohio Supreme Court"}

In [None]:
# Display the first chunk (as a one-element list).
texts[:1]

In [None]:
# Plain-string content of the first chunk only, as a small sample; displayed as the cell output.
test_lst_texts = [t.page_content for t in texts[:1]]
test_lst_texts

In [None]:

# initialize pinecone (same call and arguments as in the setup cell at the top of the notebook)
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV  # next to api key in console
)

# Embed the text of every chunk and upload it to `index_name`. Only page_content is passed here,
# so the chunks' metadata is not sent with this call.
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name) #this uploads embeddings and other data to Pinecone
# docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name) #this uploads embeddings and other data to Pinecone

QUERY THE DOCUMENT THAT WAS JUST VECTORIZED IN PINECONE


In [None]:
# Same imports and chain setup as the earlier question-answering cell; here the search runs
# against `docsearch` instead of `test_cone`.
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")
query = "What is the name of the defendant in this case?"
# Retrieve chunks similar to the question, then run the chain with them as input documents.
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
# Second question against the same store; reuses `chain` and `docsearch` from the cells above.
query1= "What are the rules or statutes involved in this case?"
docs = docsearch.similarity_search(query1)
chain.run(input_documents=docs, question=query1)

In [None]:
# Third question; rebinds `query1` and `docs` from the previous cell and reuses `chain` and `docsearch`.
query1= "Summarize all of the facts of this case like a lawyer analyzing the case and do it in a list of sentences."
docs = docsearch.similarity_search(query1)
chain.run(input_documents=docs, question=query1)