In [1]:
import PyPDF2
import re
from natsort import natsorted

# Function to extract text from PDF starting from a specific page
def extract_text_from_pdf(pdf_path, start_page=11):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        num_pages = len(pdf_reader.pages)
        text = ''
        for page_num in range(start_page, num_pages):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
        return text


# Path to your PDF file
pdf_path = 'data/ConstitutionOfKenya.pdf'

constitution_text = extract_text_from_pdf(pdf_path)

# Print a snippet to verify
print(constitution_text[:1000])  # Print first 1000 characters as a snippet

Const2010
Constitution of Kenya, 2010
12CHAPTER ONE – SOVEREIGNTY OF THE PEOPLE
AND SUPREMACY OF THIS CONSTITUTION
1.  Sovereignty of the people
(1)  All sovereign power belongs to the people of Kenya and shall be exercised
only in accordance with this Constitution.
(2)  The people may exercise their sovereign power either directly or through
their democratically elected representatives.
(3)  Sovereign power under this Constitution is delegated to the following State
organs, which shall perform their functions in accordance with this Constitution—
(a)Parliament and the legislative assemblies in the county governments;
(b)the national executive and the executive structures in the county
governments; and
(c)the Judiciary and independent tribunals.
(4)  The sovereign power of the people is exercised at—
(a)the national level; and
(b)the county level.
2.  Supremacy of this Constitution
(1)  This Constitution is the supreme law of the Republic and binds all persons
and all State organs at b

In [2]:
# Define a function to find and clean chapter headings in the extracted text
def find_and_clean_chapters(text):
    # Use a regular expression to match chapter headings
    chapter_pattern = r"CHAPTER\s+\w+"
    chapters = re.findall(chapter_pattern, text)

    # Use a set to remove duplicates and then convert back to a list
    unique_chapters = list(dict.fromkeys(chapters))

    # Sort chapters naturally
    sorted_chapters = natsorted(unique_chapters, key=lambda x: (x.split()[1], x))

    return sorted_chapters

# Find and clean chapters
unique_chapters = find_and_clean_chapters(constitution_text)

# Print the unique chapters found
for chapter in unique_chapters:
    print(chapter)


CHAPTER EIGHT
CHAPTER EIGHTEEN
CHAPTER ELEVEN
CHAPTER FIFTEEN
CHAPTER FIVE
CHAPTER FOUR
CHAPTER FOURTEEN
CHAPTER NINE
CHAPTER ONE
CHAPTER SEVEN
CHAPTER SEVENTEEN
CHAPTER SIX
CHAPTER SIXTEEN
CHAPTER TEN
CHAPTER THIRTEEN
CHAPTER THREE
CHAPTER TWELVE
CHAPTER TWO


In [3]:
import re
from natsort import natsorted

unique_chapters = find_and_clean_chapters(constitution_text)

# Count the number of unique chapters
unique_chapter_count = len(unique_chapters)

# Print the count of unique chapters
print("\nNumber of unique chapters:", unique_chapter_count)



Number of unique chapters: 18


In [4]:
def get_chapter_title(text, chapter_number):
    # Construct the chapter heading to search for, e.g., "CHAPTER ONE"
    chapter_heading = f"CHAPTER {chapter_number.upper()}"

    # Regular expression to match the chapter heading and capture the following title
    pattern = re.compile(
        rf"{chapter_heading}\s*[-–—:]*\s*(.*?)(?=\n|PART|CHAPTER|\Z)",
        re.DOTALL
    )

    # Search for the pattern in the text
    match = re.search(pattern, text)
    
    if match:
        title = match.group(1).strip()
        return title
    else:
        return f"Title for {chapter_heading} not found"


In [5]:
#for random chapters
# Get the title for "CHAPTER ONE"
chapter_one_title = get_chapter_title(constitution_text, "ONE")
print(f"CHAPTER ONE\n{chapter_one_title}\n")

# Get the title for "CHAPTER TWO"
chapter_two_title = get_chapter_title(constitution_text, "TWO")
print(f"CHAPTER TWO\n{chapter_two_title}\n")

# Get the title for "CHAPTER THREE"
chapter_3_title = get_chapter_title(constitution_text, "THREE")
print(f"CHAPTER THREE\n{chapter_3_title}\n")

# Get the title for "CHAPTER THREE"
chapter_4_title = get_chapter_title(constitution_text, "FOUR")
print(f"CHAPTER FOUR\n{chapter_4_title}\n")

# Get the title for "CHAPTER THREE"
chapter_5_title = get_chapter_title(constitution_text, "FIVE")
print(f"CHAPTER FIVE\n{chapter_5_title}\n")

# Get the title for "CHAPTER THREE"
chapter_title = get_chapter_title(constitution_text, "SIX")
print(f"CHAPTER SIX\n{chapter_title}\n")


CHAPTER ONE
SOVEREIGNTY OF THE PEOPLE

CHAPTER TWO
THE REPUBLIC

CHAPTER THREE
CITIZENSHIP

CHAPTER FOUR
THE BILL OF RIGHTS

CHAPTER FIVE
LAND AND ENVIRONMENT

CHAPTER SIX
LEADERSHIP AND INTEGRITY



In [6]:
import re

def get_all_chapter_titles(text):
    # Regular expression to find all chapter headings and titles
    pattern = re.compile(
        r"CHAPTER\s+(?:[A-Z]+|\d+)\s*[-–—:]*\s*(.*?)(?=\n|PART|CHAPTER|\Z)",
        re.DOTALL
    )

    # Find all matches for the pattern in the text
    matches = re.finditer(pattern, text)

    # Dictionary to store chapter titles
    chapter_titles = {}

    # Iterate over all matches and extract the chapter number and title
    for match in matches:
        # Extract the whole match string (e.g., "CHAPTER ONE SOVEREIGNTY OF THE PEOPLE")
        full_match = match.group(0).strip()
        
        # Extract the chapter title
        title = match.group(1).strip()

        # Extract chapter number (e.g., "ONE")
        chapter_number = re.search(r"CHAPTER\s+([A-Z]+|\d+)", full_match).group(1)

        # Add the chapter number and title to the dictionary
        chapter_titles[chapter_number] = title

    return chapter_titles


# Get all chapter titles
chapter_titles = get_all_chapter_titles(constitution_text)

# Print all chapter titles
for chapter, title in chapter_titles.items():
    print(f"CHAPTER {chapter}\n{title}\n")


CHAPTER ONE
SOVEREIGNTY OF THE PEOPLE

CHAPTER TWO
THE REPUBLIC

CHAPTER THREE
CITIZENSHIP

CHAPTER FOUR
THE BILL OF RIGHTS

CHAPTER FIVE
LAND AND ENVIRONMENT

CHAPTER SIX
LEADERSHIP AND INTEGRITY

CHAPTER SEVEN
REPRESENTATION OF THE PEOPLE

CHAPTER EIGHT
THE LEGISLATURE

CHAPTER NINE
THE EXECUTIVE

CHAPTER TEN
JUDICIARY

CHAPTER ELEVEN
DEVOLVED GOVERNMENT

CHAPTER TWELVE
PUBLIC FINANCE

CHAPTER THIRTEEN
THE PUBLIC SERVICE

CHAPTER FOURTEEN
NATIONAL SECURITY

CHAPTER FIFTEEN
COMMISSIONS AND INDEPENDENT OFFICES

CHAPTER SIXTEEN
AMENDMENT OF THIS CONSTITUTION

CHAPTER SEVENTEEN
GENERAL PROVISIONS

CHAPTER EIGHTEEN
TRANSITIONAL



In [7]:
import re

def get_chapter_content(text, chapter_number):
    # Construct the chapter heading to search for, e.g., "CHAPTER ONE"
    chapter_heading = f"CHAPTER {chapter_number.upper()}"

    # Regular expression to match the chapter heading and its following content
    # Match everything from "CHAPTER ONE" up to the next chapter or end of the text
    pattern = re.compile(
        rf"{chapter_heading}\b.*?(?=\nCHAPTER\s+\w+|\Z)", 
        re.DOTALL | re.IGNORECASE
    )

    # Search for the pattern in the text
    match = re.search(pattern, text)
    
    if match:
        chapter_content = match.group(0).strip()
        return chapter_content
    else:
        return f"Content for {chapter_heading} not found"

# Get the content for "CHAPTER ONE"
chapter_one_content = get_chapter_content(constitution_text, "ONE")

# Print the content for "CHAPTER ONE"
print(f"CHAPTER ONE CONTENT:\n{chapter_one_content}\n")


CHAPTER ONE CONTENT:
CHAPTER ONE – SOVEREIGNTY OF THE PEOPLE
AND SUPREMACY OF THIS CONSTITUTION
1.  Sovereignty of the people
(1)  All sovereign power belongs to the people of Kenya and shall be exercised
only in accordance with this Constitution.
(2)  The people may exercise their sovereign power either directly or through
their democratically elected representatives.
(3)  Sovereign power under this Constitution is delegated to the following State
organs, which shall perform their functions in accordance with this Constitution—
(a)Parliament and the legislative assemblies in the county governments;
(b)the national executive and the executive structures in the county
governments; and
(c)the Judiciary and independent tribunals.
(4)  The sovereign power of the people is exercised at—
(a)the national level; and
(b)the county level.
2.  Supremacy of this Constitution
(1)  This Constitution is the supreme law of the Republic and binds all persons
and all State organs at both levels of gover

In [8]:
import re
from transformers import pipeline

def extract_chapter_content(text, chapter_number):
    chapter_heading = f"CHAPTER {chapter_number.upper()}"
    pattern = re.compile(rf"{chapter_heading}\b.*?(?=\nCHAPTER\s+\w+|\Z)", re.DOTALL | re.IGNORECASE)
    match = re.search(pattern, text)
    
    if match:
        chapter_content = match.group(0).strip()
        return chapter_content
    else:
        return f"Content for {chapter_heading} not found"

# Get the content for "CHAPTER ONE"
chapter_one_content = extract_chapter_content(constitution_text, "ONE")

# Summarize the text using transformers
def summarize_text(text):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(text, max_length=200, min_length=30, do_sample=False)[0]['summary_text']
    return summary

# Summarize "CHAPTER ONE"
summary = summarize_text(chapter_one_content)
summary







'All sovereign power belongs to the people of Kenya and shall be exercisedonly in accordance with this Constitution. The Constitution is the supreme law of the Republic and binds all persons and all State organs at both levels of government.'

In [9]:
# Function to extract all chapters and their content
def extract_all_chapters(constitution_text):
    chapters = []
    # Regex to match chapters in uppercase and their titles
    chapter_pattern = re.compile(r"CHAPTER\s+(\w+)\s*–\s*([^\n]+)?\b(.*?)(?=\nCHAPTER\s+\w+\b|\Z)", re.DOTALL)

    for match in chapter_pattern.finditer(constitution_text):
        chapter_number = match.group(1).strip()
        chapter_title = match.group(2).strip() if match.group(2) else "Unknown Title"
        chapter_content = match.group(3).strip()
        chapters.append({
            "chapter": chapter_number,
            "title": chapter_title,
            "content": chapter_content
        })

    return chapters

# Function to normalize text
def normalize_text(text):
    # Remove excessive whitespace, convert to lowercase
    return ' '.join(text.lower().split())

# Function to extract articles from chapter content
def extract_articles(chapter_content):
    # Regex to match articles with optional sub-articles
    article_pattern = re.compile(r"(\d+)\.\s*(.*?)(?=\n\d+\.\s|\n\Z)", re.DOTALL)

    articles = []
    for match in article_pattern.finditer(chapter_content):
        article_number = match.group(1).strip()
        article_content = match.group(2).strip()
        articles.append({
            "article": article_number,
            "content": article_content
        })

    return articles

# Function to find a specific provision in the entire Constitution
def find_provision_in_constitution(constitution_text, provision_text):
    # Normalize the provision text
    normalized_provision = normalize_text(provision_text)
    
    # Extract all chapters
    chapters = extract_all_chapters(constitution_text)

    # List to store results
    results = []

    # Search within each chapter
    for chapter in chapters:
        articles = extract_articles(chapter["content"])
        for article in articles:
            normalized_article_content = normalize_text(article["content"])
            if normalized_provision in normalized_article_content:
                results.append({
                    "chapter": chapter["chapter"],
                    "title": chapter["title"],
                    "article": article["article"],
                    "content": article["content"]
                })

    return results

# Example provision to search for
provision_text = r"freedom from torture and cruel, inhuman or degrading treatment orpunishment"

# Assuming constitution_text is already defined from previous steps
results = find_provision_in_constitution(constitution_text, provision_text)

# Print results
if results:
    for result in results:
        print(f"Found in CHAPTER {result['chapter']} – {result['title']}\nArticle {result['article']}\n")
else:
    print("Provision not found in the Constitution.")

Found in CHAPTER FOUR – THE BILL OF RIGHTS
Article 25

