In [None]:
import ast
import json
import os
import pickle
import re
import zlib
from pathlib import Path

import fitz  # PyMuPDF
from langchain.chat_models import groq
from langchain.schema import HumanMessage, SystemMessage
from langchain.schema.document import Document

In [2]:
def create_documents_from_structure(structure, pdf_path):
    """Flatten a nested heading structure into a list of ``Document`` chunks.

    One ``Document`` is produced per article. Articles that sit under a
    chapter get ``"<chapter> <article heading> <text>"`` as page content;
    top-level articles get ``"<article heading> <text>"``.

    Accepted shapes (all optional, all may coexist in one ``structure``):

    * ``structure['Books'][book]['Parts'][part]['extract_headis'][chapter]``
      -- the wrapper layout this function originally read.
    * ``structure[book]['Parts'][part]['Chapters'][chapter]`` and
      ``structure[part]['Chapters'][chapter]`` -- the layout that
      ``extract_headings_and_content`` actually writes.
    * ``structure['Chapters'][chapter]`` and ``structure['Articles']``.

    Each chapter maps to a list of ``{article_heading: {'text': ..., 'page': ...}}``
    dicts; ``structure['Articles']`` is such a list directly.

    Args:
        structure: Nested dict as described above.
        pdf_path: Path of the source PDF; only its base name is stored.

    Returns:
        list of ``Document`` with ``file_name`` and ``page`` metadata
        (``page`` falls back to ``'N/A'`` when the article has none).

    Raises:
        KeyError: If an article entry has no ``'text'`` key.
    """
    file_name = os.path.basename(pdf_path)
    documents = []

    def add_articles(articles, chapter=None):
        # Shared by every level so the metadata/content format stays identical.
        for article in articles:
            for article_name, content in article.items():
                metadata = {
                    'file_name': file_name,
                    'page': content.get('page', 'N/A'),
                }
                heading = article_name if chapter is None else f"{chapter} {article_name}"
                documents.append(
                    Document(page_content=f"{heading} {content['text']}", metadata=metadata)
                )

    def add_part(part_body):
        # 'extract_headis' is the key the wrapper layout used; 'Chapters' is
        # what the extractor in this notebook writes.
        chapters = part_body.get('extract_headis') or part_body.get('Chapters', {})
        for chapter, articles in chapters.items():
            add_articles(articles, chapter)

    def add_book(book_body):
        for part_body in book_body.get('Parts', {}).values():
            add_part(part_body)

    # Wrapper layout: everything nested under a single 'Books' key.
    for book_body in structure.get('Books', {}).values():
        add_book(book_body)

    # Extractor layout: book / part headings are themselves top-level keys.
    for key, value in structure.items():
        if key in ('Books', 'Chapters', 'Articles') or not isinstance(value, dict):
            continue
        if 'Parts' in value:
            add_book(value)
        elif 'Chapters' in value:
            add_part(value)

    for chapter, articles in structure.get('Chapters', {}).items():
        add_articles(articles, chapter)

    add_articles(structure.get('Articles', []))
    return documents

# Outer function
def extract_headings_and_content(file_id, pdf_path, display_name, is_proprietary):
    """Split a PDF into per-article ``Document`` chunks using heading regexes.

    Every page is read line by line. A line that matches the book, part,
    chapter or article pattern (checked in that order) updates the current
    position in the hierarchy; any other line is buffered. The buffer is
    attached to the open article when the next heading, or the end of the
    file, is reached. Lines buffered while no article is open (for example
    text before the first article heading) are carried into the next article.

    Args:
        file_id: Stored unchanged in each chunk's ``file_id`` metadata.
        pdf_path: Path handed to ``fitz.open``.
        display_name: Stored unchanged in each chunk's ``display_name`` metadata.
        is_proprietary: Stored unchanged in each chunk's ``is_proprietary`` metadata.

    Returns:
        The list returned by ``create_documents_from_structure`` with
        ``display_name``, ``is_proprietary``, ``image_link`` and ``file_id``
        added to every chunk's metadata.
    """
    # Define regex patterns for book, part, chapter, and article headings
    book_pattern = re.compile(r'Book [A-Z]+:.*', re.IGNORECASE)
    part_pattern = re.compile(r'Part [A-Z]+:.*', re.IGNORECASE)
    chapter_pattern = re.compile(r'Chapter [\d]+ -.*', re.IGNORECASE)
    article_pattern = re.compile(r'Article \(?\d+\)?.*', re.IGNORECASE)

    structure = {}
    current_book = None
    current_part = None
    current_chapter = None
    current_article = None
    # Page on which the open article's heading was found. Recorded at the
    # heading rather than at flush time, because flushing happens only when
    # the *next* heading is seen, which may already be on a later page.
    article_page = None
    content_buffer = []

    def add_content_to_structure():
        """Move the buffered lines into ``structure`` under the open article."""
        nonlocal current_article
        if not (current_article and content_buffer):
            return
        entry = {
            current_article: {
                'text': "\n".join(content_buffer).strip(),
                'page': article_page,
            }
        }
        if not current_chapter:
            # No chapter seen yet: flat list of articles.
            structure.setdefault('Articles', []).append(entry)
        elif not current_part:
            structure.setdefault('Chapters', {}).setdefault(current_chapter, []).append(entry)
        elif not current_book:
            # Part without an enclosing book: the part heading is the top-level key.
            part_node = structure.setdefault(current_part, {'Chapters': {}})
            part_node['Chapters'].setdefault(current_chapter, []).append(entry)
        else:
            book_node = structure.setdefault(current_book, {'Parts': {}})
            part_node = book_node['Parts'].setdefault(current_part, {'Chapters': {}})
            part_node['Chapters'].setdefault(current_chapter, []).append(entry)
        # The article is closed; following lines wait for the next heading.
        content_buffer.clear()
        current_article = None

    pdf = fitz.open(pdf_path)
    try:
        for page_number, page in enumerate(pdf, start=1):
            for line in page.get_text().split('\n'):
                if book_match := book_pattern.match(line):
                    add_content_to_structure()
                    current_book = book_match.group()
                    # A new book resets everything nested below it.
                    current_part = None
                    current_chapter = None
                elif part_match := part_pattern.match(line):
                    add_content_to_structure()
                    current_part = part_match.group()
                    current_chapter = None
                elif chapter_match := chapter_pattern.match(line):
                    add_content_to_structure()
                    current_chapter = chapter_match.group()
                elif article_match := article_pattern.match(line):
                    add_content_to_structure()
                    current_article = article_match.group()
                    article_page = page_number
                else:
                    content_buffer.append(line)

        # Add the last buffered content to the structure
        add_content_to_structure()
    finally:
        # Release the file handle even when a page fails to parse.
        pdf.close()

    new_docs = create_documents_from_structure(structure, pdf_path)

    # upload pdf file to s3

    # s3_file_link, error = upload_file_to_s3(country_name, pdf_path)

    # if error:
    #     print(f"Error uploading pdf to S3: {error}")
    #     # stops the loop if there is an error
    #     return None

    for new_doc in new_docs:
        # new_doc.metadata['link'] = s3_file_link
        new_doc.metadata['display_name'] = display_name
        new_doc.metadata['is_proprietary'] = is_proprietary
        new_doc.metadata['image_link'] = " "
        new_doc.metadata['file_id'] = file_id

    return new_docs

In [3]:
# Run the extractor on the sample regulation and dump what it produced.
a = extract_headings_and_content(1, "ex reg.pdf", "file", False)
print(a)
for i in a:
    # Each chunk is shown twice: its full repr, then only its text.
    print(i, i.page_content, sep="\n")

[Document(metadata={'file_name': 'ex reg.pdf', 'page': 2, 'display_name': 'file', 'is_proprietary': False, 'image_link': ' ', 'file_id': 1}, page_content='Article 1 1  1 \nCabinet Decision No. 52 of 2017 and its amendments – Unofficial translation \n \nThis is not an official Translation: \nThe Executive Regulation of the Federal Decree-Law No. 8 \nof 2017 on Value Added Tax \nCabinet Decision No. 52 of 2017 – Issued 26 Nov 2017 \nCabinet Decision No. 46 of 2020 – Issued 4 Jun 2020 (Effective from 4 Jun 2020) \nCabinet Decision No. 24 of 2021 – Issued 11 Mar 2021 (Effective from 1 Jan 2018) \nCabinet Decision No. 88 of 2021 – Issued 28 Sep 2021 (Effective from 30 Oct 2021) \nCabinet Decision No. 99 of 2022 – Issued 21 Oct 2022 (Effective from 1 Jan 2023) \nCabinet Decision No. 100 of 2024 – Issued 6 Sept 2024 (Effective from 15 Nov 2024) \n \nThe Cabinet has decided:  \n \n- Having reviewed the Constitution, \n- Federal Law No. 1 of 1972 on the Competencies of the Ministries and Powers

In [4]:
from neo4j import GraphDatabase
import re

# Neo4j connection settings.
# Values are taken from the environment when set, so real credentials never
# have to be committed with the notebook; the literals are the previous
# local-development values, kept as fallbacks so the cell still runs as before.
uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
user = os.getenv("NEO4J_USER", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "12345678")

# Initialize the Neo4j driver
driver = GraphDatabase.driver(uri, auth=(user, password))

def parse_article_content(page_content):
    """Pull the article number, title and amendment notes out of a chunk.

    The first ``Article <n> <dash> <title>`` heading found anywhere in
    ``page_content`` is used. The number may be wrapped in parentheses and may
    be followed by ``(bis)``; the dash may be a hyphen, en dash or em dash.
    The title runs to the end of its line, or up to the first numbered clause
    (``1.``) if that comes sooner.

    Args:
        page_content: Text of one chunk.

    Returns:
        dict with keys

        * ``article_number`` -- e.g. ``"3"`` or ``"3 bis"``; ``None`` if no
          heading was found.
        * ``title`` -- stripped heading title, or ``None``.
        * ``content`` -- ``page_content`` with standalone amendment footnote
          lines removed and outer whitespace stripped.
        * ``amendments`` -- every ``"Cabinet Decision No. X of YYYY"`` cited in
          an "Article amended as per ..." note, in order of appearance.
    """
    # re.MULTILINE lets `$` end the title at the end of its own line; without
    # it a heading followed by ordinary prose on the next line never matched.
    article_match = re.search(
        r'Article\s+\(?(\d+)\)?(\s+\(bis\))?\s*[–—-]\s*(.*?)(?=\s*\d+\.|\s*$)',
        page_content,
        re.MULTILINE,
    )

    article_number = None
    title = None
    if article_match:
        article_number = article_match.group(1).strip()
        if article_match.group(2):  # Check for 'bis' (supplementary article)
            article_number += " bis"
        title = article_match.group(3).strip()

    # Extract amendments (e.g., "Article amended as per Cabinet Decision No. 100 of 2024.")
    amendments = re.findall(
        r'Article amended as per (Cabinet Decision No\. \d+ of \d{4})',
        page_content
    )

    # Clean content by removing amendment footnotes that occupy a line of their own
    cleaned_content = re.sub(
        r'\n\s*\d+ Article amended as per Cabinet Decision No\. \d+ of \d{4}\.?\s*\n',
        '\n',
        page_content
    ).strip()

    return {
        "article_number": article_number,
        "title": title,
        "content": cleaned_content,
        "amendments": amendments
    }

def create_article_node(tx, article_data, metadata):
    """Upsert one ``Article`` node from parsed article data.

    Args:
        tx: Object exposing ``run(query, **params)``; it is called once.
        article_data: dict with ``article_number``, ``title``, ``content`` and
            ``amendments`` keys.
        metadata: dict with ``page`` and ``file_name`` keys.

    Returns:
        None. When ``article_number`` is falsy a "Skipping ..." line is printed
        and nothing is written.
    """
    article_number = article_data["article_number"]
    if not article_number:
        print(f"Skipping article with null article_number: {article_data}")
        return

    # The node is matched on article_number only; every other property is overwritten.
    assignments = ", ".join([
        "a.title = $title",
        "a.content = $content",
        "a.amendments = $amendments",
        "a.page = $page",
        "a.file_name = $file_name",
    ])
    cypher = "MERGE (a:Article {article_number: $article_number}) SET " + assignments

    params = {
        "article_number": article_number,
        "title": article_data["title"],
        "content": article_data["content"],
        "amendments": article_data["amendments"],
        "page": metadata["page"],
        "file_name": metadata["file_name"],
    }
    tx.run(cypher, **params)

# Process each extracted chunk (`a`, from the cell above) and create its node.
# One session is reused for the whole batch instead of opening a new one per
# chunk; every execute_write call is still its own transaction. The driver is
# closed in `finally` so a failed write does not leave the connection open.
try:
    with driver.session() as session:
        for doc in a:
            # Parse article details from content
            article_data = parse_article_content(doc.page_content)

            # Create node in Neo4j
            session.execute_write(create_article_node, article_data, doc.metadata)
finally:
    # Close the driver connection
    driver.close()

Skipping article with null article_number: {'article_number': None, 'title': None, 'content': 'Article 1 1  1 \nCabinet Decision No. 52 of 2017 and its amendments – Unofficial translation \n \nThis is not an official Translation: \nThe Executive Regulation of the Federal Decree-Law No. 8 \nof 2017 on Value Added Tax \nCabinet Decision No. 52 of 2017 – Issued 26 Nov 2017 \nCabinet Decision No. 46 of 2020 – Issued 4 Jun 2020 (Effective from 4 Jun 2020) \nCabinet Decision No. 24 of 2021 – Issued 11 Mar 2021 (Effective from 1 Jan 2018) \nCabinet Decision No. 88 of 2021 – Issued 28 Sep 2021 (Effective from 30 Oct 2021) \nCabinet Decision No. 99 of 2022 – Issued 21 Oct 2022 (Effective from 1 Jan 2023) \nCabinet Decision No. 100 of 2024 – Issued 6 Sept 2024 (Effective from 15 Nov 2024) \n \nThe Cabinet has decided:  \n \n- Having reviewed the Constitution, \n- Federal Law No. 1 of 1972 on the Competencies of the Ministries and Powers \nof the Ministers and its amendments, \n- Federal Decree-L

In [None]:
# Handle skipped articles

In [5]:
from neo4j import GraphDatabase
import re

def connect_to_neo4j(uri, user, password):
    """Build a Neo4j driver for ``uri`` using ``(user, password)`` as auth.

    Returns:
        Whatever ``GraphDatabase.driver`` returns; the caller closes it.
    """
    credentials = (user, password)
    neo4j_driver = GraphDatabase.driver(uri, auth=credentials)
    return neo4j_driver

def process_document_content(content, driver):
    """Re-ingest articles that ``create_article_node`` previously skipped.

    ``content`` is log text containing lines of the form
    ``Skipping article with null article_number: {...}`` where ``{...}`` is the
    printed dict. Each dict is recovered and written to Neo4j:

    * if its ``content`` contains ``Article <n>`` (optionally ``(bis)``), an
      ``Article`` node is merged on that number and one ``AMENDED_BY``
      relationship is merged per amendment;
    * otherwise an ``UnparsedContent`` node is merged on an id derived from a
      CRC-32 of the text.

    Entries that are not valid Python literals, or are not dicts, are ignored.

    Args:
        content: Log text to scan.
        driver: Object exposing ``session()`` that returns a context manager
            whose value has ``run(query, **params)``.

    Returns:
        None.
    """
    # Extract all article entries from the document
    article_entries = re.findall(r"Skipping article with null article_number: ({.*?})", content, re.DOTALL)

    # Convert string representations back to dictionaries.
    # literal_eval only accepts Python literals, so text from the log file can
    # never execute code (the previous eval() could).
    articles = []
    for entry in article_entries:
        try:
            article_data = ast.literal_eval(entry)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Truncated or otherwise malformed entry: nothing usable to ingest.
            continue
        if isinstance(article_data, dict):
            articles.append(article_data)

    with driver.session() as session:
        for article in articles:
            article_text = article.get('content', '')
            amendments = article.get('amendments', [])

            # Extract article number if present
            article_match = re.search(r'Article (\d+(?:\s*\(bis\))?)', article_text)

            if article_match:
                article_num = article_match.group(1)

                # Create Article node
                query = """
                MERGE (a:Article {article_number: $article_num})
                SET a.content = $content,
                    a.amendments = $amendments
                """

                session.run(query,
                    article_num=article_num,
                    content=article_text,
                    amendments=amendments
                )

                # Create Amendment relationships if present
                for amendment in amendments:
                    amendment_query = """
                    MERGE (am:Amendment {name: $amendment})
                    MERGE (a:Article {article_number: $article_num})
                    MERGE (a)-[:AMENDED_BY]->(am)
                    """

                    session.run(amendment_query,
                        amendment=amendment,
                        article_num=article_num
                    )
            else:
                # Handle content without article number.
                # CRC-32 is stable across interpreter runs; the built-in hash()
                # of a str is salted per process, so ids built from it changed
                # on every run and MERGE kept creating duplicate nodes.
                content_hash = zlib.crc32(article_text.encode('utf-8')) & 0xffffffff
                node_id = f"unnamed_content_{content_hash}"

                query = """
                MERGE (u:UnparsedContent {id: $id})
                SET u.content = $content,
                    u.amendments = $amendments
                """

                session.run(query,
                    id=node_id,
                    content=article_text,
                    amendments=amendments
                )

# Usage example
def main():
    """Load ``paste.txt`` and push its skipped-article entries into Neo4j.

    Connection settings are read from ``NEO4J_URI``, ``NEO4J_USER`` and
    ``NEO4J_PASSWORD`` when set, so credentials need not live in the notebook;
    the previous local-development values remain as fallbacks. The driver is
    always closed, including when reading or processing fails.
    """
    uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
    user = os.getenv("NEO4J_USER", "neo4j")
    password = os.getenv("NEO4J_PASSWORD", "12345678")

    driver = connect_to_neo4j(uri, user, password)

    try:
        # Get content from your document
        with open('paste.txt', 'r') as file:
            content = file.read()

        process_document_content(content, driver)
        print("Successfully processed document content into Neo4j")

    finally:
        driver.close()

if __name__ == "__main__":
    main()

Successfully processed document content into Neo4j


In [None]:
# --------------------------------- End ------------------------------------------

In [None]:
# Chat model handle passed to generate_summary / create_chunks below.
# The API key is read from the OPENAI_API_KEY_PRODUCTION environment variable.
# NOTE(review): `groq` is the name imported from `langchain.chat_models` in the
# first cell, yet the keyword arguments (`openai_api_key`, model 'gpt-4o-mini')
# are OpenAI-style -- confirm which chat-model class/provider is intended here
# before relying on this cell (it has no execution count in the saved notebook).
llm = groq(
    openai_api_key=os.getenv("OPENAI_API_KEY_PRODUCTION"),
    model_name='gpt-4o-mini',
    temperature=0)

In [29]:
def generate_summary(content, llm):
    """Ask the chat model to summarise legal-term definitions.

    Args:
        content: Text holding the definitions; interpolated into the human prompt.
        llm: Callable invoked once with a two-element list: the system message
            followed by the human message.

    Returns:
        Whatever ``llm`` returns for that message list, unmodified.
    """
    system_message = SystemMessage(content="You are a legal assistant that summarizes the content of legal documents. Please summarize the content of the document below provided summaries of different terms.")
    human_message = HumanMessage(content= f"""You are provided with the content related to the definitions of different legal terms. You have to summarize the provided content such that no information is lost. Directly start writing the content instead of writing starting lines like the document includes this and this. 
                     The content is as follows: {content}""")
    return llm([system_message, human_message])

In [30]:
def add_summary(docs, summary):
    """Prefix every chunk except the first with the definitions summary.

    The first chunk is skipped because it is the text the summary was
    generated from. Chunks are modified in place.

    Args:
        docs: Sequence of objects with a writable ``page_content`` attribute.
        summary: Either the summary text, or a chat-model response object whose
            text lives in ``.content`` (which is what ``generate_summary``
            returns). The object form is unwrapped so the chunk receives the
            text rather than the object's repr.

    Returns:
        The same ``docs`` object that was passed in.
    """
    summary_text = getattr(summary, "content", summary)
    for doc in docs[1:]:
        doc.page_content = f"Summary of definitions of legal terms: {summary_text} \n\n Chunk content: \n {doc.page_content}"
    return docs

In [31]:
def remove_arabic(text):
    """Return ``text`` with every run of Arabic-script characters deleted.

    Covered ranges: U+0600-06FF, U+0750-077F, U+08A0-08FF, U+FB50-FDFF and
    U+FE70-FEFF. Surrounding whitespace is left as it is.
    """
    return re.sub(
        "[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+",
        '',
        text,
    )

In [32]:
def file_names_directory(directory_path) -> list:
    """List the entries of ``directory_path`` whose name ends in ``.pdf``.

    Only bare names are returned (no directory prefix), in the order
    ``os.listdir`` yields them. The suffix check is case-sensitive.
    """
    return [entry for entry in os.listdir(directory_path) if entry.endswith(".pdf")]

In [33]:
def create_chunks(file, llm):
    """Extract article chunks from one PDF and prefix them with a summary.

    The first chunk's text is summarised with ``generate_summary`` and that
    summary is prepended to every later chunk via ``add_summary``.

    Args:
        file: Path of the PDF; also used as the chunks' ``display_name``.
        llm: Chat model handed to ``generate_summary``.

    Returns:
        The chunk list. When the extractor finds no articles the empty list is
        returned as-is and the model is not called.
    """
    docs1 = extract_headings_and_content(1, file, file, False)
    if not docs1:
        # Nothing to summarise; indexing docs1[0] would raise IndexError and
        # abort a whole directory run because of one heading-less PDF.
        return docs1
    summary = generate_summary(docs1[0].page_content, llm)
    return add_summary(docs1, summary)

In [54]:
def store_chunks(chunks, file, directory_path):
    """Pickle ``chunks`` to ``<directory_path>/<file without extension>.pkl``.

    Only the final extension of ``file`` is removed, so ``ex.reg.pdf`` is
    stored as ``ex.reg.pkl`` (splitting on the first dot would truncate it to
    ``ex`` and let differently named PDFs overwrite each other).
    ``directory_path`` is created if it does not exist yet.

    Args:
        chunks: Any picklable object.
        file: Source file name.
        directory_path: Directory that receives the pickle file.
    """
    file_name = os.path.splitext(file)[0]
    if directory_path:
        os.makedirs(directory_path, exist_ok=True)
    with open(f"{directory_path}/{file_name}.pkl", "wb") as f:
        pickle.dump(chunks, f)

In [34]:
# file_names_directory("Test01")

In [56]:
def create_chunks_from_directory(read_directory, store_directory, llm):
    """Chunk every PDF in ``read_directory`` and pickle each result.

    Args:
        read_directory: Directory scanned with ``file_names_directory``.
        store_directory: Directory handed to ``store_chunks``.
        llm: Chat model forwarded to ``create_chunks``.

    Returns:
        list with one entry per PDF, each entry being that PDF's chunk list.
    """
    chunks_per_file = []
    for pdf_name in file_names_directory(read_directory):
        pdf_chunks = create_chunks(f"{read_directory}/{pdf_name}", llm)
        # Persist under the bare file name, not the joined path.
        store_chunks(pdf_chunks, pdf_name, store_directory)
        chunks_per_file.append(pdf_chunks)
    return chunks_per_file

In [59]:
# Chunk every PDF under Test01/, pickle each result into Store01/, and keep
# the per-file chunk lists in `docs` (one inner list per PDF).
docs = create_chunks_from_directory("Test01", "Store01", llm)

In [60]:
# Display the full nested list of chunks (this prints every chunk's text).
docs

[[Document(metadata={'file_name': 'test01.pdf', 'page': 3, 'display_name': 'Test01/test01.pdf', 'is_proprietary': False, 'image_link': ' ', 'file_id': 1}, page_content='Article 1 - Definitions  1 \nCabinet Decision No. 36 of 2017 and Cabinet Decision No. 51 of 2021 - Unofficial Translation \n \n \n \nThis is not an official Translation: \nThe Executive Regulation of Federal Law No. 7 of 2017 \non Tax Procedures \nCabinet Decision No. 36 of 2017 – Issue Date: 24th of September 2017 \nCabinet Decision No. 51 of 2021 – Issue Date: 28th of April 2021 \n \nThe Cabinet has decided:  \n \n- Having reviewed the Constitution; \n- Federal Law No. 1 of 1972 on the Competencies of the Ministries and Powers of \nthe Ministers and its amendments; \n- Federal Decree-Law No. 13 of 2016 on the Establishment of the Federal Tax \nAuthority; \n- Federal Law No. 7 of 2017 on Tax Procedures; \n- Based on what was presented by the Minister of Finance and approved by the \nCabinet, \n \n \nTitle One  \nIn the

In [61]:
# Number of chunks produced for the first PDF.
len(docs[0])

29

In [62]:
# Print each chunk of the first PDF followed by a separator line.
for i in docs[0]:
    print(i, ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", sep="\n")

page_content='Article 1 - Definitions  1 
Cabinet Decision No. 36 of 2017 and Cabinet Decision No. 51 of 2021 - Unofficial Translation 
 
 
 
This is not an official Translation: 
The Executive Regulation of Federal Law No. 7 of 2017 
on Tax Procedures 
Cabinet Decision No. 36 of 2017 – Issue Date: 24th of September 2017 
Cabinet Decision No. 51 of 2021 – Issue Date: 28th of April 2021 
 
The Cabinet has decided:  
 
- Having reviewed the Constitution; 
- Federal Law No. 1 of 1972 on the Competencies of the Ministries and Powers of 
the Ministers and its amendments; 
- Federal Decree-Law No. 13 of 2016 on the Establishment of the Federal Tax 
Authority; 
- Federal Law No. 7 of 2017 on Tax Procedures; 
- Based on what was presented by the Minister of Finance and approved by the 
Cabinet, 
 
 
Title One  
In the application of the provisions of this Decision, the following words and 
expressions shall have the meanings assigned against each, unless the context 
otherwise requires: 
State

In [63]:
# Read the chunks for test01.pdf back from the pickle written by store_chunks.
# NOTE(review): pickle.load executes code embedded in the file; only load
# pickles this notebook produced itself, never files from an untrusted source.
with open("Store01/test01.pkl", "rb") as f:
    data = pickle.load(f)

In [65]:
# Print the text of every reloaded chunk followed by a separator line.
for i in data:
    print(i.page_content, ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", sep="\n")

Article 1 - Definitions  1 
Cabinet Decision No. 36 of 2017 and Cabinet Decision No. 51 of 2021 - Unofficial Translation 
 
 
 
This is not an official Translation: 
The Executive Regulation of Federal Law No. 7 of 2017 
on Tax Procedures 
Cabinet Decision No. 36 of 2017 – Issue Date: 24th of September 2017 
Cabinet Decision No. 51 of 2021 – Issue Date: 28th of April 2021 
 
The Cabinet has decided:  
 
- Having reviewed the Constitution; 
- Federal Law No. 1 of 1972 on the Competencies of the Ministries and Powers of 
the Ministers and its amendments; 
- Federal Decree-Law No. 13 of 2016 on the Establishment of the Federal Tax 
Authority; 
- Federal Law No. 7 of 2017 on Tax Procedures; 
- Based on what was presented by the Minister of Finance and approved by the 
Cabinet, 
 
 
Title One  
In the application of the provisions of this Decision, the following words and 
expressions shall have the meanings assigned against each, unless the context 
otherwise requires: 
State 
: United Ara