In [1]:
#!/usr/bin/env python3

"""
Congressional Record Homelessness Keyword Extractor
Filters PDFs for homelessness-related keywords and extracts relevant data
"""

import os
import re
import pandas as pd
import pdfplumber
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import requests
import logging

# Log to both a file and the console. The log file is opened explicitly as
# UTF-8 because this script logs non-ASCII characters (e.g. a check mark),
# which cannot be encoded when the platform's default encoding is not Unicode.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('homelessness_extraction.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class HomelessnessExtractor:
    """Extract Congressional Records that mention homelessness-related keywords.

    Every URL in ``URLS`` is downloaded into a temporary directory, its text is
    extracted with pdfplumber and searched for the entries of ``KEYWORDS``.
    Documents with at least one hit become one row in ``self.results``;
    downloads or PDFs that fail are recorded in ``self.errors`` so they are not
    counted as keyword hits.
    """

    # Keywords are matched case-insensitively as whole words / phrases.
    # Overlapping entries are counted independently (a match of the full phrase
    # also counts as a match of "homelessness").
    KEYWORDS = [
        "homeless",
        "homelessness",
        "houseless",
        "Houselessness",
        "unhoused",
        "tramp",
        "vagrant",
        "people experiencing homelessness"
    ]

    # Bound Congressional Record PDFs to process.
    URLS = [
        "https://www.congress.gov/88/crecb/1963/06/03/GPO-CRECB-1963-pt8-3-1.pdf",
        "https://www.congress.gov/86/crecb/1960/05/27/GPO-CRECB-1960-pt9-2-1.pdf",
        "https://www.congress.gov/86/crecb/1959/05/14/GPO-CRECB-1959-pt6-10-2.pdf",
        "https://www.congress.gov/86/crecb/1959/01/26/GPO-CRECB-1959-pt1-13-1.pdf",
        "https://www.congress.gov/85/crecb/1958/08/22/GPO-CRECB-1958-pt15-2.pdf",
        "https://www.congress.gov/85/crecb/1957/02/04/GPO-CRECB-1957-pt2-1-2.pdf",
        "https://www.congress.gov/87/crecb/1961/04/20/GPO-CRECB-1961-pt5-9-2.pdf",
        "https://www.congress.gov/84/crecb/1955/04/27/GPO-CRECB-1955-pt4-10-1.pdf",
        "https://www.congress.gov/85/crecb/1957/05/06/GPO-CRECB-1957-pt5-9-2.pdf",
        "https://www.congress.gov/86/crecb/1959/05/21/GPO-CRECB-1959-pt7-2-1.pdf",
        "https://www.congress.gov/87/crecb/1961/08/10/GPO-CRECB-1961-pt11-9-1.pdf",
        "https://www.congress.gov/87/crecb/1961/05/24/GPO-CRECB-1961-pt7-5-2.pdf",
        "https://www.congress.gov/83/crecb/1954/05/07/GPO-CRECB-1954-pt5-6-1.pdf",
        # The congress segment previously read "8721506" (a paste error); a
        # record dated January 1961 belongs to the 87th Congress.
        "https://www.congress.gov/87/crecb/1961/01/04/GPO-CRECB-1961-pt1-2-2.pdf",
        "https://www.congress.gov/84/crecb/1956/05/10/GPO-CRECB-1956-pt6-9-2.pdf",
        "https://www.congress.gov/87/crecb/1962/10/01/GPO-CRECB-1962-pt16-3-1.pdf"
    ]

    def __init__(self, output_file: str = "homelessness_data.csv"):
        """Create an extractor that writes its final CSV to *output_file*."""
        self.output_file = output_file
        # One dict per document in which at least one keyword was found.
        self.results = []
        # One dict per document that could not be downloaded or parsed.
        self.errors = []

    def download_pdf(self, url: str, save_path: str) -> bool:
        """Download *url* to *save_path*.

        Returns True on success. Any failure (network, HTTP status, disk) is
        logged and reported as False so that one bad URL does not abort the
        whole batch.
        """
        logger.info(f"Downloading: {url}")
        try:
            response = requests.get(url, timeout=60)
            response.raise_for_status()

            with open(save_path, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            # Deliberately broad: this is the per-document failure boundary.
            logger.error(f"Failed to download {url}: {e}")
            return False

        logger.info(f"Downloaded to: {save_path}")
        return True

    @staticmethod
    def _keyword_pattern(keyword: str) -> str:
        """Return a whole-word regex for *keyword*.

        Words of a multi-word keyword are joined with ``\\s+`` so that a phrase
        broken across a line in the extracted PDF text is still matched.
        """
        escaped_words = [re.escape(word) for word in keyword.split()]
        return r'\b' + r'\s+'.join(escaped_words) + r'\b'

    def search_keywords(self, text: str) -> Dict:
        """Count case-insensitive whole-word occurrences of each keyword in *text*.

        Returns a dict with:
            contains_keywords: True if any keyword matched.
            keyword_counts:    {keyword: count} for keywords with count > 0.
            total_mentions:    sum of all counts.
            keywords_found:    matched keywords, in ``KEYWORDS`` order.
        """
        keyword_counts = {}
        for keyword in self.KEYWORDS:
            pattern = self._keyword_pattern(keyword)
            count = len(re.findall(pattern, text, re.IGNORECASE))
            if count > 0:
                keyword_counts[keyword] = count

        return {
            'contains_keywords': bool(keyword_counts),
            'keyword_counts': keyword_counts,
            'total_mentions': sum(keyword_counts.values()),
            'keywords_found': list(keyword_counts),
        }

    def extract_context(self, text: str, keyword: str, context_words: int = 50) -> List[str]:
        """Return up to three snippets of *text* surrounding *keyword*.

        Each snippet holds at most *context_words* whitespace-separated words
        before and after the match, with the match wrapped in ``**``.
        """
        max_contexts = 3
        contexts = []
        pattern = self._keyword_pattern(keyword)

        for match in re.finditer(pattern, text, re.IGNORECASE):
            if context_words > 0:
                # Bounded (r)split only tokenises the words we keep instead of
                # the whole document on either side of the match.
                words_before = text[:match.start()].rsplit(None, context_words)[-context_words:]
                words_after = text[match.end():].split(None, context_words)[:context_words]
            else:
                # A plain [-0:] slice would return every word, not none.
                words_before, words_after = [], []

            # Collapse whitespace inside a phrase that matched across a line break.
            matched = ' '.join(match.group().split())
            context = ' '.join(words_before) + ' **' + matched + '** ' + ' '.join(words_after)
            contexts.append(context.strip())

            # Only the first few contexts are returned, so stop scanning early.
            if len(contexts) == max_contexts:
                break

        return contexts

    def extract_date(self, text: str) -> Optional[str]:
        """Return the first date-like string found in *text*, or None.

        Patterns are tried in order: "WEEKDAY, Month D, YYYY", then
        "Month D, YYYY", then "M/D/YYYY".
        """
        patterns = [
            r'((?:MONDAY|TUESDAY|WEDNESDAY|THURSDAY|FRIDAY|SATURDAY|SUNDAY),\s+\w+\s+\d{1,2},\s+\d{4})',
            r'(\w+ \d{1,2}, \d{4})',
            r'(\d{1,2}/\d{1,2}/\d{4})',
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)
        return None

    def extract_congress_session(self, text: str) -> Optional[str]:
        """Return the number from the first "<N>th Congress" in *text*, or None."""
        match = re.search(r'(\d{1,3})(?:st|nd|rd|th)\s+Congress', text, re.IGNORECASE)
        return match.group(1) if match else None

    def extract_chamber(self, text: str) -> Optional[str]:
        """Return "Senate" or "House" based on the first 500 characters of *text*.

        "Senate" takes precedence when both words appear; None if neither does.
        """
        header = text[:500]
        if re.search(r'\bSENATE\b', header, re.IGNORECASE):
            return "Senate"
        if re.search(r'\bHOUSE\b', header, re.IGNORECASE):
            return "House"
        return None

    def extract_year_from_url(self, url: str) -> Optional[str]:
        """Return the first four-digit path segment of *url*, or None."""
        match = re.search(r'/(\d{4})/', url)
        return match.group(1) if match else None

    def extract_from_pdf(self, pdf_path: Path, source_url: Optional[str] = None) -> Optional[Dict]:
        """Extract keyword statistics and metadata from one PDF.

        Returns:
            None if no keyword occurs in the document;
            a result row with ``status == 'success'`` if keywords were found;
            a dict with ``status == 'error'`` if the PDF could not be processed.
        """
        logger.info(f"Processing: {pdf_path.name}")

        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]

            page_count = len(page_texts)
            # Pages without extractable text yield a falsy value and are skipped.
            full_text = ''.join(page_text + "\n" for page_text in page_texts if page_text)

            keyword_results = self.search_keywords(full_text)

            # Only build a row if keywords are found
            if not keyword_results['contains_keywords']:
                logger.info(f"No keywords found in {pdf_path.name} - skipping")
                return None

            logger.info(f"✓ Keywords found in {pdf_path.name}! Total mentions: {keyword_results['total_mentions']}")

            keyword_counts = keyword_results['keyword_counts']
            result = {
                'filename': pdf_path.name,
                'source_url': source_url,
                'year': self.extract_year_from_url(source_url) if source_url else None,
                'date': self.extract_date(full_text),
                'congress_session': self.extract_congress_session(full_text),
                'chamber': self.extract_chamber(full_text),
                'page_count': page_count,
                'total_keyword_mentions': keyword_results['total_mentions'],
                'keywords_found': '; '.join(keyword_results['keywords_found']),
                'keyword_counts': str(keyword_counts),
                'status': 'success'
            }

            # One count column per keyword so the CSV can be aggregated directly.
            for keyword in self.KEYWORDS:
                safe_key = keyword.replace(' ', '_')
                result[f'count_{safe_key}'] = keyword_counts.get(keyword, 0)

            # Sample context is taken for the most frequent keyword.
            top_keyword = max(keyword_counts.items(), key=lambda x: x[1])[0]
            contexts = self.extract_context(full_text, top_keyword)
            result['sample_context'] = ' [...] '.join(contexts) if contexts else ''

            return result

        except Exception as e:
            # Deliberately broad: a malformed PDF must not abort the batch.
            logger.error(f"Error processing {pdf_path.name}: {e}")
            return {
                'filename': pdf_path.name,
                'source_url': source_url,
                'status': 'error',
                'error': str(e)
            }

    def process_urls(self):
        """Download and analyse every URL in ``URLS``.

        Successful keyword hits are appended to ``self.results``; download and
        parsing failures are appended to ``self.errors``. A checkpoint CSV is
        written after every fifth URL once there is at least one result.
        """
        logger.info(f"Processing {len(self.URLS)} URLs")
        logger.info(f"Searching for keywords: {', '.join(self.KEYWORDS)}")

        # Create temp directory for downloads
        temp_dir = Path("./temp_pdfs")
        temp_dir.mkdir(exist_ok=True, parents=True)

        for idx, url in enumerate(self.URLS, 1):
            logger.info(f"\n{'='*70}")
            logger.info(f"Progress: {idx}/{len(self.URLS)}")
            logger.info(f"{'='*70}")

            # Generate filename from URL
            filename = url.split('/')[-1]
            if not filename.endswith('.pdf'):
                filename = f"document_{idx}.pdf"

            pdf_path = temp_dir / filename

            if self.download_pdf(url, str(pdf_path)):
                result = self.extract_from_pdf(pdf_path, url)

                if result is None:
                    pass  # no keywords in this document
                elif result.get('status') == 'error':
                    # Keep failures out of the results so they are not
                    # reported as documents containing keywords.
                    self.errors.append(result)
                else:
                    self.results.append(result)

                # Clean up downloaded file; failure here is not fatal.
                try:
                    pdf_path.unlink()
                except OSError as e:
                    logger.warning(f"Could not remove {pdf_path}: {e}")
            else:
                self.errors.append({
                    'filename': filename,
                    'source_url': url,
                    'status': 'error',
                    'error': 'download failed'
                })

            # Save checkpoint every 5 files
            if idx % 5 == 0 and self.results:
                self.save_results(f"checkpoint_{idx}.csv")

        logger.info(f"\n{'='*70}")
        logger.info(f"Completed processing {len(self.URLS)} URLs")
        logger.info(f"Found keywords in {len(self.results)} documents")
        if self.errors:
            logger.warning(f"{len(self.errors)} document(s) could not be downloaded or processed")
        logger.info(f"{'='*70}\n")

    def save_results(self, filename: Optional[str] = None):
        """Write ``self.results`` to CSV and return the DataFrame.

        Writes to *filename* if given, otherwise to ``self.output_file``.
        Returns None (and writes nothing) when there are no results.
        """
        output_file = filename or self.output_file

        if not self.results:
            logger.warning("No results to save (no keywords found in any documents)")
            return None

        df = pd.DataFrame(self.results)

        # Sort by year, then by most mentions first; skip if a column is absent.
        if 'year' in df.columns and 'total_keyword_mentions' in df.columns:
            df = df.sort_values(['year', 'total_keyword_mentions'], ascending=[True, False])

        df.to_csv(output_file, index=False)
        logger.info(f"Results saved to {output_file}")

        return df

    def generate_report(self):
        """Print a human-readable summary of ``self.results`` to stdout."""
        if not self.results:
            print("\n" + "="*70)
            print("NO KEYWORDS FOUND")
            print("="*70)
            print("None of the processed documents contained the target keywords.")
            if self.errors:
                print(f"Documents that failed to download or process: {len(self.errors)}")
            print("="*70 + "\n")
            return

        df = pd.DataFrame(self.results)

        print("\n" + "="*70)
        print("HOMELESSNESS KEYWORD EXTRACTION REPORT")
        print("="*70)
        print(f"Documents processed: {len(self.URLS)}")
        print(f"Documents with keywords: {len(self.results)}")
        if self.errors:
            print(f"Documents that failed to download or process: {len(self.errors)}")
        print(f"Total keyword mentions: {df['total_keyword_mentions'].sum()}")

        print(f"\nKeyword frequency:")
        for keyword in self.KEYWORDS:
            safe_key = keyword.replace(' ', '_')
            col_name = f'count_{safe_key}'
            if col_name in df.columns:
                total = df[col_name].sum()
                if total > 0:
                    print(f"  {keyword}: {total}")

        print(f"\nDocuments by year:")
        if 'year' in df.columns:
            year_counts = df['year'].value_counts().sort_index()
            for year, count in year_counts.items():
                print(f"  {year}: {count} document(s)")

        print(f"\nTop 5 documents by keyword mentions:")
        top_docs = df.nlargest(5, 'total_keyword_mentions')[['filename', 'year', 'total_keyword_mentions']]
        for _, row in top_docs.iterrows():
            print(f"  {row['filename']} ({row['year']}): {row['total_keyword_mentions']} mentions")

        print("="*70 + "\n")


def main():
    """Run the extraction pipeline end to end and print a summary.

    Processes every configured URL, writes the results CSV, prints the
    report, and finally tells the user where to look. The closing message
    only points at the CSV when one was actually written.
    """

    print("\n" + "="*70)
    print("CONGRESSIONAL RECORD HOMELESSNESS KEYWORD EXTRACTOR")
    print("="*70)
    print("Searching for: homeless, homelessness, houseless, unhoused,")
    print("               tramp, vagrant, people experiencing homelessness")
    print("="*70 + "\n")

    # Initialize extractor
    extractor = HomelessnessExtractor(
        output_file="./homelessness_data.csv"
    )

    # Process all URLs
    extractor.process_urls()

    # Save and report; save_results returns None when nothing was written.
    saved = extractor.save_results()
    extractor.generate_report()

    if saved is None:
        print("\nDone! No documents matched, so no results file was written.")
    else:
        print("\nDone! Check 'homelessness_data.csv' for results.")

# Run the pipeline only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    main()

2025-10-26 17:18:44,999 - INFO - Processing 16 URLs
2025-10-26 17:18:44,999 - INFO - Searching for keywords: homeless, homelessness, houseless, Houselessness, unhoused, tramp, vagrant, people experiencing homelessness
2025-10-26 17:18:45,000 - INFO - 
2025-10-26 17:18:45,000 - INFO - Progress: 1/16
2025-10-26 17:18:45,001 - INFO - Downloading: https://www.congress.gov/88/crecb/1963/06/03/GPO-CRECB-1963-pt8-3-1.pdf



CONGRESSIONAL RECORD HOMELESSNESS KEYWORD EXTRACTOR
Searching for: homeless, homelessness, houseless, unhoused,
               tramp, vagrant, people experiencing homelessness



2025-10-26 17:18:46,156 - INFO - Downloaded to: temp_pdfs/GPO-CRECB-1963-pt8-3-1.pdf
2025-10-26 17:18:46,158 - INFO - Processing: GPO-CRECB-1963-pt8-3-1.pdf
2025-10-26 17:18:49,234 - INFO - ✓ Keywords found in GPO-CRECB-1963-pt8-3-1.pdf! Total mentions: 5
2025-10-26 17:18:49,240 - INFO - 
2025-10-26 17:18:49,240 - INFO - Progress: 2/16
2025-10-26 17:18:49,241 - INFO - Downloading: https://www.congress.gov/86/crecb/1960/05/27/GPO-CRECB-1960-pt9-2-1.pdf
2025-10-26 17:18:50,885 - INFO - Downloaded to: temp_pdfs/GPO-CRECB-1960-pt9-2-1.pdf
2025-10-26 17:18:50,887 - INFO - Processing: GPO-CRECB-1960-pt9-2-1.pdf
2025-10-26 17:18:57,657 - INFO - ✓ Keywords found in GPO-CRECB-1960-pt9-2-1.pdf! Total mentions: 11
2025-10-26 17:18:57,675 - INFO - 
2025-10-26 17:18:57,676 - INFO - Progress: 3/16
2025-10-26 17:18:57,677 - INFO - Downloading: https://www.congress.gov/86/crecb/1959/05/14/GPO-CRECB-1959-pt6-10-2.pdf
2025-10-26 17:18:59,795 - INFO - Downloaded to: temp_pdfs/GPO-CRECB-1959-pt6-10-2.pdf



HOMELESSNESS KEYWORD EXTRACTION REPORT
Documents processed: 16
Documents with keywords: 15
Total keyword mentions: 77

Keyword frequency:
  homeless: 65
  homelessness: 2
  houseless: 1
  unhoused: 1
  tramp: 7
  vagrant: 1

Documents by year:
  1954: 1 document(s)
  1955: 1 document(s)
  1956: 1 document(s)
  1957: 2 document(s)
  1958: 1 document(s)
  1959: 3 document(s)
  1960: 1 document(s)
  1961: 3 document(s)
  1962: 1 document(s)
  1963: 1 document(s)

Top 5 documents by keyword mentions:
  GPO-CRECB-1960-pt9-2-1.pdf (1960): 11 mentions
  GPO-CRECB-1958-pt15-2.pdf (1958): 9 mentions
  GPO-CRECB-1956-pt6-9-2.pdf (1956): 7 mentions
  GPO-CRECB-1959-pt6-10-2.pdf (1959): 6 mentions
  GPO-CRECB-1959-pt7-2-1.pdf (1959): 6 mentions


Done! Check 'homelessness_data.csv' for results.
