In [1]:
#!/usr/bin/env python3

"""
Congressional Record/Bill Data Extractor
Extract structured data from Congressional PDFs (congress.gov)
"""

import os
import re
import pandas as pd
import pdfplumber
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
import requests
import logging

# Route INFO-and-above records to both a log file and the console.
log_handlers = [
    logging.FileHandler('congressional_extraction.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=log_handlers,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)


class CongressionalExtractor:
    """Extract structured data from Congressional Record PDFs"""
    
    def __init__(self, output_file: str = "congressional_data.csv"):
        self.output_file = output_file
        self.results = []
        self.errors = []
        
    def download_pdf(self, url: str, save_path: str) -> bool:
        """Download a PDF from a URL to ``save_path``.

        The body is streamed to a sibling ``<save_path>.part`` file and only
        renamed onto ``save_path`` once the transfer has completed, so a failed
        or interrupted download never leaves a truncated PDF at ``save_path``.

        Args:
            url: Address of the PDF to fetch.
            save_path: Destination file path.

        Returns:
            True if the file was fully downloaded and written, False otherwise.
            Failures are logged, never raised.
        """
        partial_path = f"{save_path}.part"
        try:
            logger.info(f"Downloading: {url}")
            # stream=True avoids holding the whole PDF in memory at once.
            with requests.get(url, timeout=30, stream=True) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)

            # Publish the finished file in one step.
            os.replace(partial_path, save_path)
            logger.info(f"Downloaded to: {save_path}")
            return True

        except Exception as e:
            # Deliberately broad: this is a best-effort boundary and callers
            # rely on a boolean result rather than exceptions.
            logger.error(f"Failed to download {url}: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                # Nothing was written (or it is already gone); nothing to clean.
                pass
            return False
    
    def extract_date(self, text: str) -> Optional[str]:
        """Extract date from Congressional Record"""
        # Look for dates like "THURSDAY, January 14, 1897"
        patterns = [
            r'((?:MONDAY|TUESDAY|WEDNESDAY|THURSDAY|FRIDAY|SATURDAY|SUNDAY),\s+\w+\s+\d{1,2},\s+\d{4})',
            r'(\w+ \d{1,2}, \d{4})',
            r'(\d{1,2}/\d{1,2}/\d{4})',
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)
        return None
    
    def extract_congress_session(self, text: str) -> Optional[str]:
        """Extract Congress session number"""
        # Look for patterns like "54th Congress" or "Fifty-fourth Congress"
        patterns = [
            r'(\d{1,3})(?:st|nd|rd|th)\s+Congress',
            r'(Fifty-fourth|Fifty-third|etc\.)\s+Congress',
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)
        return None
    
    def extract_chamber(self, text: str) -> Optional[str]:
        """Determine if Senate or House"""
        if re.search(r'\bSENATE\b', text[:500], re.IGNORECASE):
            return "Senate"
        elif re.search(r'\bHOUSE\b', text[:500], re.IGNORECASE):
            return "House"
        return None
    
    def extract_bill_numbers(self, text: str) -> List[str]:
        """Extract all bill numbers mentioned"""
        # Patterns: H.R. 9710, S. 3526, etc.
        patterns = [
            r'\b([HS]\.?\s?R\.?\s?\d+)\b',
            r'\b([HS]\.?\s?J\.?\s?Res\.?\s?\d+)\b',
            r'\b(S\.\s?\d+)\b',
        ]
        
        bill_numbers = []
        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            bill_numbers.extend(matches)
        
        # Clean and deduplicate
        cleaned = list(set([b.strip().upper() for b in bill_numbers]))
        return cleaned
    
    def extract_speakers(self, text: str) -> List[str]:
        """Extract speaker names"""
        # Pattern: "Mr. SMITH." or "The VICE-PRESIDENT."
        pattern = r'(?:Mr\.|Mrs\.|Miss|Ms\.|The)\s+([A-Z\-]+)[\.\:]'
        matches = re.findall(pattern, text)
        
        # Deduplicate and clean
        speakers = list(set([s.strip() for s in matches if len(s) > 2]))
        return speakers[:20]  # Limit to first 20 unique speakers
    
    def extract_committees(self, text: str) -> List[str]:
        """Extract committee names"""
        pattern = r'Committee on ([\w\s,]+?)(?:\.|;|,\s+to)'
        matches = re.findall(pattern, text, re.IGNORECASE)
        
        # Clean and deduplicate
        committees = list(set([c.strip() for c in matches]))
        return committees[:10]  # Limit to first 10
    
    def extract_topics(self, text: str) -> List[str]:
        """Extract main topics/subjects discussed"""
        topics = []
        
        # Look for common topic indicators
        topic_patterns = [
            r'(?:relating to|concerning|regarding)\s+([\w\s]{10,50})',
            r'(?:bill|resolution).*?(?:to|for)\s+([\w\s]{10,50})',
        ]
        
        for pattern in topic_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            topics.extend(matches)
        
        # Clean and limit
        topics = list(set([t.strip() for t in topics if len(t) > 10]))
        return topics[:5]
    
    def extract_votes(self, text: str) -> Dict:
        """Extract voting information"""
        votes = {
            'has_votes': False,
            'passed': None,
            'agreed_to': None
        }
        
        if re.search(r'The motion was agreed to', text):
            votes['has_votes'] = True
            votes['agreed_to'] = True
        
        if re.search(r'The motion was (?:not )?agreed to', text):
            votes['has_votes'] = True
        
        if re.search(r'(?:passed|agreed to|adopted)', text, re.IGNORECASE):
            votes['passed'] = True
        
        return votes
    
    def extract_from_pdf(self, pdf_path: Path, source_url: Optional[str] = None) -> Dict:
        """Extract all relevant information from a Congressional PDF.

        Opens the PDF with pdfplumber, concatenates the text of every page and
        runs each ``extract_*`` method of this class over it.

        Args:
            pdf_path: Location of the PDF on disk (a ``str`` is accepted too).
            source_url: URL the file was downloaded from, if any; stored
                verbatim in the result.

        Returns:
            A flat dict suitable for one CSV row. List-valued fields
            (bill numbers, speakers, committees, topics) are joined with
            "; ". ``status`` is "success" or "error"; on error the exception
            text is stored under ``error`` and the row is also appended to
            ``self.errors``. Exceptions are logged, never raised.
        """
        pdf_path = Path(pdf_path)
        logger.info(f"Processing: {pdf_path.name}")

        result = {
            'filename': pdf_path.name,
            'source_url': source_url,
            'date': None,
            'congress_session': None,
            'chamber': None,
            'bill_numbers': [],
            'num_bills': 0,
            'speakers': [],
            'num_speakers': 0,
            'committees': [],
            'num_committees': 0,
            'topics': [],
            'has_votes': False,
            'page_count': 0,
            'status': 'success',
            'error': None,
            # Present up front so error rows have the same columns as
            # successful ones.
            'passed': None,
            'agreed_to': None,
        }
        list_fields = ('bill_numbers', 'speakers', 'committees', 'topics')

        try:
            with pdfplumber.open(pdf_path) as pdf:
                result['page_count'] = len(pdf.pages)

                # extract_text() can yield None for a page without a text
                # layer (e.g. a scanned image), so fall back to "".
                full_text = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )

            # Extract fields
            result['date'] = self.extract_date(full_text)
            result['congress_session'] = self.extract_congress_session(full_text)
            result['chamber'] = self.extract_chamber(full_text)
            result['bill_numbers'] = self.extract_bill_numbers(full_text)
            result['num_bills'] = len(result['bill_numbers'])
            result['speakers'] = self.extract_speakers(full_text)
            result['num_speakers'] = len(result['speakers'])
            result['committees'] = self.extract_committees(full_text)
            result['num_committees'] = len(result['committees'])
            result['topics'] = self.extract_topics(full_text)

            # Extract vote information
            result.update(self.extract_votes(full_text))

        except Exception as e:
            # Broad on purpose: one unreadable PDF must not abort a batch run.
            logger.error(f"Error processing {pdf_path.name}: {e}")
            result['status'] = 'error'
            result['error'] = str(e)
            self.errors.append(result)

        # Store lists as strings for CSV -- done for error rows as well so
        # every row has the same cell types.
        for field in list_fields:
            if isinstance(result[field], list):
                result[field] = '; '.join(result[field])

        return result
    
    def process_url_list(self, url_file: str):
        """Download and process every PDF listed in ``url_file``.

        The file holds one URL per line; blank lines and lines starting with
        "#" are skipped. Each PDF is downloaded into ``./temp_pdfs``, passed to
        ``extract_from_pdf`` (the result is appended to ``self.results``) and
        then deleted. URLs whose download fails are skipped. After every 100
        URLs a checkpoint CSV named ``checkpoint_<n>.csv`` is written.

        Args:
            url_file: Path to the text file containing the URLs.
        """
        with open(url_file, 'r') as f:
            urls = [line.strip() for line in f if line.strip() and not line.strip().startswith('#')]

        logger.info(f"Found {len(urls)} URLs to process")

        # Create temp directory for downloads
        temp_dir = Path("./temp_pdfs")
        temp_dir.mkdir(exist_ok=True, parents=True)

        for idx, url in enumerate(urls, 1):
            logger.info(f"Progress: {idx}/{len(urls)}")

            # Generate filename from URL
            filename = url.split('/')[-1]
            if not filename.endswith('.pdf'):
                filename = f"document_{idx}.pdf"

            pdf_path = temp_dir / filename

            # Download PDF
            if self.download_pdf(url, str(pdf_path)):
                try:
                    # Extract data
                    result = self.extract_from_pdf(pdf_path, url)
                    self.results.append(result)
                finally:
                    # Clean up the downloaded file even if the run is
                    # interrupted. Only filesystem errors are tolerated here,
                    # and they are logged rather than silently dropped.
                    try:
                        pdf_path.unlink()
                    except OSError as e:
                        logger.warning(f"Could not remove temporary file {pdf_path}: {e}")

            # Save checkpoint every 100 files
            if idx % 100 == 0:
                self.save_results(f"checkpoint_{idx}.csv")

        logger.info(f"Completed processing {len(urls)} URLs")
    
    def process_directory(self, pdf_directory: str):
        """Run extraction over every ``*.pdf`` file directly inside a folder.

        Each file's result is appended to ``self.results``. A checkpoint CSV
        named ``checkpoint_<n>.csv`` is written after every 100 files. If the
        folder contains no matching files a warning is logged and nothing else
        happens.

        Args:
            pdf_directory: Folder to scan (not searched recursively).
        """
        candidates = list(Path(pdf_directory).glob("*.pdf"))
        total = len(candidates)

        if total == 0:
            logger.warning(f"No PDF files found in {pdf_directory}")
            return

        logger.info(f"Found {total} PDF files to process")

        for position, candidate in enumerate(candidates, start=1):
            logger.info(f"Progress: {position}/{total}")
            self.results.append(self.extract_from_pdf(candidate))

            checkpoint_due = position % 100 == 0
            if checkpoint_due:
                self.save_results(f"checkpoint_{position}.csv")
    
    def save_results(self, filename: Optional[str] = None):
        """Save accumulated results (and errors, if any) to CSV.

        Args:
            filename: Destination CSV path. Defaults to ``self.output_file``.

        Returns:
            The DataFrame that was written, or None when there are no results
            (in which case nothing is written and a warning is logged).

        When ``self.errors`` is non-empty the error rows are written to a
        sibling file whose name is the output name with ``_errors`` inserted
        before the extension ("out.csv" -> "out_errors.csv"). An output name
        without an extension gets ``_errors.csv`` appended, so the error file
        can never be the same path as the results file.
        """
        output_file = filename or self.output_file

        if not self.results:
            logger.warning("No results to save")
            return None

        df = pd.DataFrame(self.results)
        df.to_csv(output_file, index=False)
        logger.info(f"Results saved to {output_file}")

        if self.errors:
            # Split on the real extension rather than replacing the substring
            # '.csv', which is a no-op for names that do not contain it.
            root, ext = os.path.splitext(output_file)
            error_file = f"{root}_errors{ext or '.csv'}"
            pd.DataFrame(self.errors).to_csv(error_file, index=False)
            logger.info(f"Errors saved to {error_file}")

        return df
    
    def generate_report(self):
        """Generate a summary report"""
        if not self.results:
            logger.warning("No results to report")
            return
        
        df = pd.DataFrame(self.results)
        
        print("\n" + "="*70)
        print("CONGRESSIONAL RECORD EXTRACTION REPORT")
        print("="*70)
        print(f"Total documents processed: {len(self.results)}")
        print(f"Successful extractions: {len(df[df['status'] == 'success'])}")
        print(f"Failed extractions: {len(df[df['status'] == 'error'])}")
        print(f"\nField coverage:")
        print(f"  Dates extracted: {df['date'].notna().sum()} ({df['date'].notna().sum()/len(df)*100:.1f}%)")
        print(f"  Congress sessions: {df['congress_session'].notna().sum()} ({df['congress_session'].notna().sum()/len(df)*100:.1f}%)")
        print(f"  Chambers identified: {df['chamber'].notna().sum()} ({df['chamber'].notna().sum()/len(df)*100:.1f}%)")
        print(f"  Total bills mentioned: {df['num_bills'].sum()}")
        print(f"  Total speakers identified: {df['num_speakers'].sum()}")
        print(f"  Total committees mentioned: {df['num_committees'].sum()}")
        print("="*70 + "\n")


def main():
    """Main execution function.

    Downloads one sample Congressional Record PDF, extracts its data, then
    writes the CSV and prints the summary report. The commented-out OPTION 2
    and OPTION 3 lines show how to process a URL list or a local directory
    instead.
    """

    # Initialize extractor
    extractor = CongressionalExtractor(
        output_file="./congressional_data.csv"
    )

    # OPTION 1: Process a single URL (for testing)
    single_url = "https://www.congress.gov/54/crecb/1897/01/14/GPO-CRECB-1897-pt1-v29-22-1.pdf"
    temp_dir = Path("./temp_pdfs")
    temp_dir.mkdir(exist_ok=True, parents=True)
    temp_pdf = temp_dir / "sample.pdf"

    if extractor.download_pdf(single_url, str(temp_pdf)):
        try:
            result = extractor.extract_from_pdf(temp_pdf, single_url)
            extractor.results.append(result)
        finally:
            # Best-effort cleanup: tolerate filesystem errors only, and log
            # them instead of swallowing every exception.
            try:
                temp_pdf.unlink()
            except OSError as e:
                logger.warning(f"Could not remove temporary file {temp_pdf}: {e}")

    # OPTION 2: Process from a file containing URLs (one per line)
    # Uncomment the line below and comment out OPTION 1 to use this
    # extractor.process_url_list("./urls.txt")

    # OPTION 3: Process PDFs already downloaded to a directory
    # Uncomment the line below and comment out OPTION 1 to use this
    # extractor.process_directory("./pdfs")

    # Save and report
    extractor.save_results()
    extractor.generate_report()


# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

2025-10-26 14:59:38,529 - INFO - Downloading: https://www.congress.gov/54/crecb/1897/01/14/GPO-CRECB-1897-pt1-v29-22-1.pdf
2025-10-26 14:59:39,754 - INFO - Downloaded to: temp_pdfs/sample.pdf
2025-10-26 14:59:39,756 - INFO - Processing: sample.pdf
2025-10-26 14:59:43,301 - INFO - Results saved to ./congressional_data.csv



CONGRESSIONAL RECORD EXTRACTION REPORT
Total documents processed: 1
Successful extractions: 1
Failed extractions: 0

Field coverage:
  Dates extracted: 1 (100.0%)
  Congress sessions: 0 (0.0%)
  Chambers identified: 1 (100.0%)
  Total bills mentioned: 60
  Total speakers identified: 20
  Total committees mentioned: 10

