In [1]:
import os
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
import re
import xml.etree.ElementTree as ET
from pathlib import Path
import logging
import PyPDF2
import json

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class PanchayatRajProcessor:
    def __init__(self, model_dir="./trained_models"):
        """Initialize the processor"""
        self.nlp = spacy.blank("en")
        self.model_dir = Path(model_dir)
        self.model_dir.mkdir(parents=True, exist_ok=True)
        
        # Define entity labels
        self.CUSTOM_ENTITIES = [
            "BUDGET_AMOUNT",
            "DATE",
            "FILE_NUMBER",
            "ACCOUNT_HEAD",
            "DEPARTMENT",
            "OFFICIAL_DESIGNATION",
            "TECHNICAL_TERM"
        ]
        
        # Compile regex patterns
        self.patterns = {
            'BUDGET_AMOUNT': r'Rs\.?\s*([\d,]+(?:\.\d{2})?)\/?-',
            'DATE': r'\d{2}-\d{2}-\d{4}',
            'FILE_NUMBER': r'G\.O\.RT\.No\.\s*\d+',
            'ACCOUNT_HEAD': r'\d{14}',
            'DEPARTMENT': r'[A-Z&]{2,}(?:\s+[A-Z]+)*\s+DEPARTMENT',
            'OFFICIAL_DESIGNATION': r'(?:Commissioner|Director|Secretary|PRINCIPAL SECRETARY|GOVERNOR)',
            'TECHNICAL_TERM': r'(?:Finance Commission|Basic grant|Untied|RLBS)'
        }

    def read_pdf(self, pdf_path):
        """
        Read text content from PDF file
        
        Args:
            pdf_path (str): Path to PDF file
            
        Returns:
            str: Extracted text from PDF
        """
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
                logger.info(f"Successfully read PDF: {pdf_path}")
                return text
        except Exception as e:
            logger.error(f"Error reading PDF {pdf_path}: {str(e)}")
            return None

    def read_training_data(self, directory):
        """
        Read XML files from training directory
        
        Args:
            directory (str): Path to training data directory
            
        Returns:
            list: List of parsed XML documents
        """
        documents = []
        dir_path = Path(directory)
        
        try:
            for xml_file in dir_path.glob("*.xml"):
                tree = ET.parse(xml_file)
                root = tree.getroot()
                documents.extend(root.findall(".//document"))
                logger.info(f"Successfully processed {xml_file}")
        except Exception as e:
            logger.error(f"Error reading training data: {str(e)}")
            raise
            
        return documents

    def prepare_training_data(self, xml_documents):
        """Prepare training data from XML documents"""
        training_data = []
        
        for doc in xml_documents:
            try:
                content = doc.find("document_content").text
                entities = []
                
                # Find entities using patterns
                for label, pattern in self.patterns.items():
                    for match in re.finditer(pattern, content):
                        entities.append((match.start(), match.end(), label))
                
                # Sort and filter overlapping entities
                entities = sorted(entities, key=lambda x: x[0])
                filtered_entities = []
                last_end = 0
                
                for start, end, label in entities:
                    if start >= last_end:
                        filtered_entities.append((start, end, label))
                        last_end = end
                
                training_data.append((content, {"entities": filtered_entities}))
                
            except Exception as e:
                logger.error(f"Error preparing training data: {str(e)}")
                continue
                
        return training_data

    def train_model(self, training_data, iterations=30):
        """Train the NER model"""
        try:
            # Add NER pipeline
            if "ner" not in self.nlp.pipe_names:
                ner = self.nlp.create_pipe("ner")
                self.nlp.add_pipe("ner", last=True)
            
            # Add labels
            ner = self.nlp.get_pipe("ner")
            for label in self.CUSTOM_ENTITIES:
                ner.add_label(label)
            
            # Train
            other_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != "ner"]
            with self.nlp.disable_pipes(*other_pipes):
                optimizer = self.nlp.begin_training()
                for itn in range(iterations):
                    losses = {}
                    for text, annotations in training_data:
                        doc = self.nlp.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        self.nlp.update([example], drop=0.5, losses=losses)
                    logger.info(f"Iteration {itn + 1}, Losses: {losses}")
                    
        except Exception as e:
            logger.error(f"Error during training: {str(e)}")
            raise

    def extract_information(self, text):
        """Extract information from text and format output"""
        try:
            doc = self.nlp(text)
            
            extracted_info = {
                "budget_amounts": [],
                "dates": [],
                "file_numbers": [],
                "account_heads": [],
                "departments": [],
                "official_designations": [],
                "technical_terms": [],
                "summary": ""
            }
            
            # Extract entities
            for ent in doc.ents:
                category = ent.label_.lower()
                if category in extracted_info:
                    extracted_info[category].append({
                        "text": ent.text,
                        "start": ent.start_char,
                        "end": ent.end_char
                    })
            
            # Generate summary
            paragraphs = text.split('\n\n')
            if paragraphs:
                extracted_info["summary"] = paragraphs[0].strip()
            
            # Format the extracted information
            formatted_output = "\nExtracted Information:\n"
            for category, items in extracted_info.items():
                formatted_output += f"\n**{category.upper()}**:\n"
                if isinstance(items, list) and items:
                    for item in items:
                        formatted_output += f"- **{item['text']}** (Start: {item['start']}, End: {item['end']})\n"
                elif isinstance(items, str) and items:
                    formatted_output += f"**{items}**\n"
                    
            return formatted_output
            
        except Exception as e:
            logger.error(f"Error extracting information: {str(e)}")
            return None

    def save_model(self, model_name="panchayat_raj_model"):
        """Save the trained model"""
        try:
            model_path = self.model_dir / model_name
            self.nlp.to_disk(model_path)
            logger.info(f"Model saved to {model_path}")
        except Exception as e:
            logger.error(f"Error saving model: {str(e)}")

    def load_model(self, model_name="panchayat_raj_model"):
        """Load a trained model"""
        try:
            model_path = self.model_dir / model_name
            self.nlp = spacy.load(model_path)
            logger.info(f"Model loaded from {model_path}")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")

    def save_results(self, results, output_file="extracted_info.json"):
        """Save extracted information to JSON file"""
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)
            logger.info(f"Results saved to {output_file}")
        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")

def main():
    """Main execution function"""
    # Initialize processor
    processor = PanchayatRajProcessor()
    
    try:
        # Training Phase
        logger.info("Starting training phase...")
        
        # Read and process training documents
        training_docs = processor.read_training_data("/Users/srikar/Desktop/Panchayat Raj")
        logger.info(f"Found {len(training_docs)} training documents")
        
        # Prepare training data
        training_data = processor.prepare_training_data(training_docs)
        logger.info(f"Prepared {len(training_data)} training examples")
        
        # Train the model
        processor.train_model(training_data)
        
        # Save the trained model
        processor.save_model()
        
        # Testing Phase
        logger.info("Starting testing phase...")
        
        # Read test document
        test_pdf_path = "/Users/srikar/Desktop/Panchayat Raj/Test/GO MS NO. 174.pdf - Seri.ap.gov.in.pdf"
        test_text = processor.read_pdf(test_pdf_path)
        
        if test_text:
            # Extract information from test document
            results = processor.extract_information(test_text)
            
            # Save results to JSON file
            processor.save_results(results)
            
            # Print results
            print(results)
        
    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()


2024-10-26 22:55:28,932 - INFO - Starting training phase...
2024-10-26 22:55:28,933 - INFO - Found 0 training documents
2024-10-26 22:55:28,933 - INFO - Prepared 0 training examples
[2024-10-26 22:55:29,013] [INFO] Created vocabulary
2024-10-26 22:55:29,013 - INFO - Created vocabulary
[2024-10-26 22:55:29,014] [INFO] Finished initializing nlp object
2024-10-26 22:55:29,014 - INFO - Finished initializing nlp object
2024-10-26 22:55:29,128 - INFO - Iteration 1, Losses: {}
2024-10-26 22:55:29,128 - INFO - Iteration 2, Losses: {}
2024-10-26 22:55:29,128 - INFO - Iteration 3, Losses: {}
2024-10-26 22:55:29,128 - INFO - Iteration 4, Losses: {}
2024-10-26 22:55:29,128 - INFO - Iteration 5, Losses: {}
2024-10-26 22:55:29,129 - INFO - Iteration 6, Losses: {}
2024-10-26 22:55:29,129 - INFO - Iteration 7, Losses: {}
2024-10-26 22:55:29,129 - INFO - Iteration 8, Losses: {}
2024-10-26 22:55:29,129 - INFO - Iteration 9, Losses: {}
2024-10-26 22:55:29,130 - INFO - Iteration 10, Losses: {}
2024-10-26 


Extracted Information:

**BUDGET_AMOUNTS**:

**DATES**:

**FILE_NUMBERS**:

**ACCOUNT_HEADS**:

**DEPARTMENTS**:

**OFFICIAL_DESIGNATIONS**:

**TECHNICAL_TERMS**:

**SUMMARY**:
**GOVERNMENT OF ANDHRA PRADESH 
ABSTRACT 
LOANS AND ADVANCES— House Building Advance—Admissibility in Revised Pay  
Scales, 2010 – Recommendation of Ninth Pay Revision  Commission -- Accepted – Orders 
–Issued. 
                                FINANCE (A & L) DEPARTMENT 
G.O. Ms. No.174                                                                              Dated:15-05-2010.  
                                                                                                   Rea d the following:- 
1. G.O. (P) No.77, Finance (FW: A&L) Department, dt  03-04-2006. 
2. G.O.Ms.No. 438, General Administration (Spl. A) Department, dt 7-7-2008. 
3. G.O. (P) No. 52, Finance (PC.I) Department, dt 2 5-2-2010. 
                                                  * ** 
ORDER: 
 
 In the Government Order 3rd read above, ba