### Land Acquisitions - Building Systems (learning)

Import Libraries

In [2]:
import re #Regular expressions
from collections import defaultdict, Counter
from typing import Dict, List, Tuple, Any #Supports typing hints for code documentation etc
import pandas as pd
from pathlib import Path #provides an object-oriented interface for working with filesystem paths.
import logging

import PyPDF2 #Used for reading, splitting, merging Pdf's etc
import fitz # from pymupdf, peforms better than PyPDF2
from fuzzywuzzy import process, fuzz
from watermark import watermark

%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [3]:
%matplotlib inline
%watermark --iversion

pandas    : 2.2.3
fuzzywuzzy: 0.18.0
PyPDF2    : 3.0.1
pymupdf   : 1.24.11
fitz      : 1.24.11
logging   : 0.5.1.2
re        : 2.2.1
watermark : 2.5.0



#### Convert PDF to text 

In [8]:
def clean_text(text):
    # Replace common OCR errors
    replacements = {
        '_.r': '',  # Remove this artifact
        'fllO,OOO.OO': '₵10,000.00',  # Correct currency symbol
        'Ju#\'': 'July',  # Correct month
        'L.I. 14S~.': 'L.I. 1452.',  # Correct regulation number
        'W1LDUFE': 'WILDLIFE',  # Correct spelling
        ';;r~din~': 'Trading',  # Correct word
        'wi:ho~t': 'without',  # Correct word
        '1icenc:e.': 'licence.',  # Correct word
        'prohibnod.': 'prohibited.',  # Correct word
        'A· licati I! li d': 'Application for licence under',  # Correct phrase
        '0.•': 'or',  # Correct word
        'whended': 'amended',  # Correct word
    }
    
    for old, new in replacements.items():
        text = text.replace(old, new)
    
    # Correct spacing issues
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space

    
    return text.strip()


def extract_pdf_text(pdf_path):
    extracted_data = defaultdict(str)
    
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc[page_num]
            text = page.get_text("text")
            cleaned_text = clean_text(text)
            extracted_data[page_num + 1] = cleaned_text
    
    return extracted_data

def display_text_on_page(extracted_data, page_number):
    if page_number in extracted_data:
        print(f"Text on Page {page_number}:\n{extracted_data[page_number]}")
    else:
        num_pages = len(extracted_data)
        if num_pages > 0:
            print(f"Page number {page_number} not found in the extracted data. Valid page numbers are 1 to {num_pages}")
        else:
            print("No pages found in the extracted data. Make sure you have executed the code to extract data.")

def total_word_count(input_str):
    words = input_str.split()  # Split the input string by spaces into a list of words
    count = len(words)         # Count the number of words in the list
    return count

def word_occurrences(input_str):
    word_count = {}
    for word in input_str.split():
        word_count[word] = word_count.get(word, 0) + 1
    return word_count

text = "this is a test this is only a test"
print(word_occurrences(text))  # Output: {'this': 2, 'is': 2, 'a': 2, 'test': 2, 'only': 1}



# Usage
pdf_path =  r"C:\Users\ellio\Desktop\Personal\Land Matters\Lands Commission\Head Office\EI . LI's\LEGISLATIVE INSTRUMENT, (1989)\scan0001.pdf"
extracted_data = extract_pdf_text(pdf_path)

# Display text from a specific page
page_number_to_display = 2
display_text_on_page(extracted_data, page_number_to_display)

Text on Page 2:
~'. . 2 WILDLIFE CONSERVATION (AMENDMENT) ( REGULATIONS, 1989 ~~l~P~~'O!l' . 6B (1) No person shall keep a wild animal WJ d &OImas . .' ., • without' ,.as a pet unless he IS the .holder .of a licence licence! issued by the Chief Game and ·Wildlife Officer prohibited. or his representative for that purpose. . (2) Application for licence to keep a :\.:' . wild animal asa pet shall be made in writing -,to the Chief Game and Wildlife Officer or ...... ,. hIS representative in the area of residence of the applicant accompanied by such fees as may be determined by the Chief Game and . Wildlife Officer. I" :.' ,(3) The Chief Game. and Wildlife " .( Officer may in granting a licence under this .' '.;'.' ,sej::tion stipulate the conditions under which the wild animal shall be kept . .i ;': '. . (4) A licence granted under this section r : may'forstatedreasons bewithdrawn and the pet confiscated to the State. ..'., (5) All fees collected forlicences issued .. '.;' ',. underthe re

In [4]:
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class PDFTextProcessor:
    def __init__(self):
        self.common_ocr_errors = {
            '_.r': '',  # Remove this artifact
            'fllO,OOO.OO': '₵10,000.00',  # Correct currency symbol
            'Ju#\'': 'July',  # Correct month
            'L.I. 14S~.': 'L.I. 1452.',  # Correct regulation number
            'W1LDUFE': 'WILDLIFE',  # Correct spelling
            ';;r~din~': 'Trading',  # Correct word
            'wi:ho~t': 'without',  # Correct word
            '1icenc:e.': 'licence.',  # Correct word
            'prohibnod.': 'prohibited.',  # Correct word
            'A· licati I! li d': 'Application for licence under',  # Correct phrase
            '0.•': 'or',  # Correct word
            'whended': 'amended',  # Correct word
        }
        
    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text from PDF.
        """
        try:
            # Apply OCR error corrections
            for old, new in self.common_ocr_errors.items():
                text = text.replace(old, new)
            
            # Advanced text cleaning
            text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces
            text = re.sub(r'([.!?])\s*([A-Z])', r'\1\n\n\2', text)  # Add line breaks after sentences
            text = re.sub(r'(\d+\.)\s*([A-Z])', r'\1\n\2', text)  # Add line breaks after numbered points
            
            return text.strip()
        except Exception as e:
            logger.error(f"Error in clean_text: {str(e)}")
            return text

    def extract_pdf_text(self, pdf_path: str) -> Dict[int, str]:
        """
        Extract and clean text from PDF file.
        """
        try:
            pdf_path = Path(pdf_path)
            if not pdf_path.exists():
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
            
            extracted_data = defaultdict(str)
            with fitz.open(pdf_path) as doc:
                for page_num in range(len(doc)):
                    page = doc[page_num]
                    text = page.get_text("text")
                    cleaned_text = self.clean_text(text)
                    extracted_data[page_num + 1] = cleaned_text
                    
            return extracted_data
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {str(e)}")
            return defaultdict(str)

    def analyze_text(self, text: str) -> Dict[str, Any]:
        """
        Perform comprehensive text analysis.
        """
        try:
            words = text.split()
            word_count = len(words)
            
            # Word frequency analysis
            word_freq = Counter(words)
            
            # Basic statistics
            analysis = {
                'total_words': word_count,
                'unique_words': len(word_freq),
                'avg_word_length': sum(len(word) for word in words) / word_count if word_count > 0 else 0,
                'most_common_words': word_freq.most_common(10),
                'sentence_count': len(re.findall(r'[.!?]+', text)),
            }
            
            return analysis
        except Exception as e:
            logger.error(f"Error in text analysis: {str(e)}")
            return {}

    def export_to_csv(self, extracted_data: Dict[int, str], output_path: str) -> None:
        """
        Export extracted text to CSV file.
        """
        try:
            df = pd.DataFrame.from_dict(extracted_data, orient='index', columns=['text'])
            df.index.name = 'page_number'
            df.to_csv(output_path)
            logger.info(f"Data exported to {output_path}")
        except Exception as e:
            logger.error(f"Error exporting to CSV: {str(e)}")

def main():
    # Initialize processor
    processor = PDFTextProcessor()
    
    # Process PDF
    pdf_path =  r"C:\Users\ellio\Desktop\Personal\Land Matters\Lands Commission\Head Office\EI . LI's\LEGISLATIVE INSTRUMENT, (1989)\scan0001.pdf"
    try:
        # Extract text
        extracted_data = processor.extract_pdf_text(pdf_path)
        
        # Analyze each page
        for page_num, text in extracted_data.items():
            print(f"\nPage {page_num} Analysis:")
            analysis = processor.analyze_text(text)
            print("Word Count:", analysis['total_words'])
            print("Unique Words:", analysis['unique_words'])
            print("Average Word Length:", f"{analysis['avg_word_length']:.2f}")
            print("\nMost Common Words:")
            for word, count in analysis['most_common_words']:
                print(f"  {word}: {count}")
        
        # Export to CSV
        output_path = Path(pdf_path).with_suffix('.csv')
        processor.export_to_csv(extracted_data, output_path)
        
    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()

2024-10-25 15:20:09,701 - ERROR - Error processing PDF C:\Users\ellio\Desktop\Personal\Land Matters\Lands Commission\Head Office\EI . LI's\LEGISLATIVE INSTRUMENT, (1989)\scan0001.pdf: PDF file not found: C:\Users\ellio\Desktop\Personal\Land Matters\Lands Commission\Head Office\EI . LI's\LEGISLATIVE INSTRUMENT, (1989)\scan0001.pdf
2024-10-25 15:20:09,705 - ERROR - Error exporting to CSV: Cannot save file into a non-existent directory: 'C:\Users\ellio\Desktop\Personal\Land Matters\Lands Commission\Head Office\EI . LI's\LEGISLATIVE INSTRUMENT, (1989)'


Word Classification