## Install Libraries

In [2]:
#load the pdfs from the folder and convert them to text and do eda for llm dataset
import os
from pypdf import PdfReader
import fitz
import pandas as pd


## Load PDF's directories in

In [3]:
# Load the PDFs from the folder
def load_pdfs(folder_path):
    pdfs = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdfs.append(filename)
    return pdfs

## Translate PDF to text format

In [3]:
# Convert the PDFs to text
def convert_pdf_to_text(folder_path, pdf):
    pdf_file_path = os.path.join(folder_path, pdf)
    reader = PdfReader(pdf_file_path)
    text = ''
    for page in reader.pages:
        text += page.extract_text()
    return text

In [4]:
# Folder containing PDFs
folder_path = 'raw_data'

In [5]:
# Load the PDFs
pdfs = load_pdfs(folder_path)

In [6]:
# Convert the PDFs to text
texts = []
for pdf in pdfs:
    text = convert_pdf_to_text(folder_path, pdf)
    texts.append({'file_name': pdf, 'text': text})


In [7]:
# Convert the texts to a DataFrame
df = pd.DataFrame(texts)
print(df.head())

                                       file_name  \
0      - CERTIFICATE OF DISSOLUTION OF A LLC.pdf   
1  AMENDED CERTIFICATE OF FORMATION OF A LLC.pdf   
2                 AMENDED CERTIFICATE OF LLP.pdf   
3       ARTICLES OF AMENDMENT NONPROFIT Corp.pdf   
4             Articles of Incorporation (WA).pdf   

                                                text  
0  Certificate  of Dissolution -LLC & PLLC Washin...  
1  Amended Cert of Formation -LLC Washington Secr...  
2  Amended Certificate of Limited Liability Partn...  
3   \n \nArticles of Amendment – Nonprofit 24.03 ...  
4   ARTICLES OF INCORPORATION  \nOF \n[CORPORATIO...  


In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")

def tokenize_sentences_spacy(text):
    doc = nlp(text)
    return [sent.text for sent in doc.sents]

# Example usage for your DataFrame
chunks = []
for i, row in df.iterrows():
    for sent in tokenize_sentences_spacy(row['text']):
        chunks.append({'file_name': row['file_name'], 'chunk': sent})
chunk_df = pd.DataFrame(chunks)
print(chunk_df.head())


                                   file_name  \
0  - CERTIFICATE OF DISSOLUTION OF A LLC.pdf   
1  - CERTIFICATE OF DISSOLUTION OF A LLC.pdf   
2  - CERTIFICATE OF DISSOLUTION OF A LLC.pdf   
3  - CERTIFICATE OF DISSOLUTION OF A LLC.pdf   
4  - CERTIFICATE OF DISSOLUTION OF A LLC.pdf   

                                               chunk  
0  Certificate  of Dissolution -LLC & PLLC Washin...  
1  After 120 days of this submission being filed ...  
2          General Instruction s: Use dark ink only.  
3  Complete the entire form and enter all request...  
4  At our \nwebsite www.sos.wa.gov/corporations  ...  


In [11]:
# Save as CSV
chunk_df.to_csv('chunked_text_data.csv', index=False)

# Save as JSONL
chunk_df.to_json('chunked_text_data.jsonl', orient='records', lines=True)


In [14]:
# Verify the JSONL content
with open('chunked_text_data.jsonl', 'r') as f:
    for i, line in enumerate(f):
        if i < 5:  # Print the first 5 lines
            print(line.strip())


{"file_name":"- CERTIFICATE OF DISSOLUTION OF A LLC.pdf","chunk":"Certificate  of Dissolution -LLC & PLLC Washington Secretary of State  Revised 10.2023   \n \n \n \n \n \n \n \n \n \nINSTRUCTIONS : CERTIFICATE  OF DISSOLUTION OF A  LIMITED LIABILITY CO MPANY  OR \nPROFES SIONAL LIMITED LIABILITY COMPANY RCW 25.15  \n \nPurpose : Certificate of Dissolution i s used  to voluntarily dissolve the business entity."}
{"file_name":"- CERTIFICATE OF DISSOLUTION OF A LLC.pdf","chunk":"After 120 days of this submission being filed the \nbusiness entity is no longer eligible for reinstatement or revocation and is considered permanently dissolved . \n \n"}
{"file_name":"- CERTIFICATE OF DISSOLUTION OF A LLC.pdf","chunk":"General Instruction s: Use dark ink only."}
{"file_name":"- CERTIFICATE OF DISSOLUTION OF A LLC.pdf","chunk":"Complete the entire form and enter all requested information in the fields provided."}
{"file_name":"- CERTIFICATE OF DISSOLUTION OF A LLC.pdf","chunk":"At our \nwebsite 

In [18]:
import absl
print("absl-py version:", absl.__version__)

AttributeError: module 'absl' has no attribute '__version__'