# ITE Trip Generation PDF Extractor

This notebook extracts trip generation data from ITE Trip Generation Manual PDFs (Volumes 3-5) and converts them to JSON format for use in the ITE Trip Generation Calculator.

## Instructions:
1. Run Cell 1 to install dependencies
2. Run Cell 2 to upload your PDF files
3. Run Cells 3-6 to define the extractor, parse the PDFs, and generate the JSON and JavaScript output
4. Run Cell 7 to download the generated JSON and JavaScript files

In [None]:
# Cell 1: Install Dependencies
# Installs the Python packages and system packages used by this notebook, then
# imports the modules the later cells rely on. The "!" lines are notebook shell
# commands, so this cell only runs inside a notebook runtime such as Colab.
# NOTE(review): of the pip packages below, only pymupdf (fitz) and pandas are
# imported in the cells visible here; tabula-py, camelot-py and pdfplumber look
# unused in this notebook - confirm before trimming.
!pip install pymupdf pandas tabula-py camelot-py[cv] pdfplumber
!apt-get install -y ghostscript python3-tk

import fitz  # PyMuPDF
import re
import json
import pandas as pd
from google.colab import files
import os

print("Dependencies installed successfully!")

In [None]:
# Cell 2: Upload PDF Files
# Show the upload prompt, open the Colab upload widget, and keep only the
# uploaded names that end in ".pdf" (case-insensitive) for the later cells.
print("\n".join((
    "Please upload your ITE Trip Generation Manual PDF files (Volumes 3, 4, or 5)",
    "You can upload multiple files at once.",
    "",
)))

uploaded = files.upload()

pdf_files = [name for name in uploaded if name.lower().endswith('.pdf')]
print(f"\nUploaded {len(pdf_files)} PDF file(s): {pdf_files}")

In [None]:
# Cell 3: ITE Data Extraction Functions

class ITEDataExtractor:
    def __init__(self):
        self.land_use_data = {}
        
        # Regex patterns for ITE data extraction
        self.patterns = {
            # Land use code pattern: 3 digits
            'land_use_code': r'(?:Land Use|Code)?\s*(\d{3})\b',
            
            # Rate patterns
            'average_rate': r'Average\s*(?:Weekday)?\s*(?:Vehicle)?\s*Trip\s*(?:Generation)?\s*Rate[:\s]*([\d.]+)',
            'weekday_rate': r'(?:Weekday|Daily)\s*(?:Vehicle)?\s*Trip[s]?[:\s]*([\d.]+)\s*(?:per|trips)',
            'am_peak_rate': r'(?:AM|Morning)\s*Peak\s*(?:Hour)?[:\s]*([\d.]+)',
            'pm_peak_rate': r'(?:PM|Evening|Afternoon)\s*Peak\s*(?:Hour)?[:\s]*([\d.]+)',
            
            # Equation patterns
            'linear_equation': r'T\s*=\s*([\d.]+)\s*\(?X\)?\s*([+-])\s*([\d.]+)',
            'log_equation': r'(?:Ln|LN|ln)\s*\(?T\)?\s*=\s*([\d.]+)\s*(?:Ln|LN|ln)\s*\(?X\)?\s*([+-])\s*([\d.]+)',
            
            # R-squared
            'r_squared': r'R[²2]\s*=\s*([\d.]+)',
            
            # Sample size
            'sample_size': r'(?:Number of|N|n)\s*(?:Studies|Sites|Observations)[:\s]*([\d]+)',
            
            # Directional distribution
            'entering': r'(?:Entering|Enter|In)[:\s]*([\d]+)\s*%',
            'exiting': r'(?:Exiting|Exit|Out)[:\s]*([\d]+)\s*%',
            
            # Unit patterns
            'unit_dwelling': r'(?:per|Per)\s*(?:Dwelling\s*Unit|DU)',
            'unit_ksf': r'(?:per|Per)\s*(?:1,?000|1000|K)\s*(?:SF|Square\s*Feet|Sq\.?\s*Ft)',
            'unit_employee': r'(?:per|Per)\s*(?:Employee|Employees)',
            'unit_room': r'(?:per|Per)\s*(?:Room|Rooms)',
            'unit_student': r'(?:per|Per)\s*(?:Student|Students)',
            'unit_acre': r'(?:per|Per)\s*(?:Acre|Acres)',
            'unit_bed': r'(?:per|Per)\s*(?:Bed|Beds)',
            'unit_fueling': r'(?:per|Per)\s*(?:Fueling\s*Position|VFP)',
            'unit_screen': r'(?:per|Per)\s*(?:Screen|Screens)',
            'unit_lane': r'(?:per|Per)\s*(?:Lane|Lanes)',
            'unit_hole': r'(?:per|Per)\s*(?:Hole|Holes)',
        }
        
        # Known ITE land use names (11th Edition)
        self.land_use_names = {
            '110': 'General Light Industrial',
            '130': 'Industrial Park',
            '140': 'Manufacturing',
            '150': 'Warehousing',
            '151': 'Mini-Warehouse',
            '210': 'Single-Family Detached Housing',
            '215': 'Single-Family Attached Housing',
            '220': 'Multifamily Housing (Low-Rise)',
            '221': 'Multifamily Housing (Mid-Rise)',
            '222': 'Multifamily Housing (High-Rise)',
            '230': 'Residential Condominium/Townhouse',
            '240': 'Mobile Home Park',
            '251': 'Senior Adult Housing - Detached',
            '252': 'Senior Adult Housing - Attached',
            '253': 'Congregate Care Facility',
            '254': 'Assisted Living',
            '255': 'Continuing Care Retirement Community',
            '260': 'Recreational Homes',
            '270': 'Residential Planned Unit Development',
            '310': 'Hotel',
            '311': 'All Suites Hotel',
            '312': 'Business Hotel',
            '320': 'Motel',
            '330': 'Resort Hotel',
            '411': 'Public Park',
            '420': 'Marina',
            '430': 'Golf Course',
            '444': 'Movie Theater',
            '445': 'Multiplex Movie Theater',
            '480': 'Amusement Park',
            '491': 'Racquet/Tennis Club',
            '492': 'Health/Fitness Club',
            '495': 'Recreational Community Center',
            '520': 'Elementary School',
            '522': 'Middle School/Junior High School',
            '530': 'High School',
            '534': 'Private School (K-8)',
            '536': 'Private School (K-12)',
            '540': 'Junior/Community College',
            '550': 'University/College',
            '560': 'Church',
            '565': 'Day Care Center',
            '566': 'Cemetery',
            '575': 'Fire and Rescue Station',
            '590': 'Library',
            '610': 'Hospital',
            '620': 'Nursing Home',
            '630': 'Clinic',
            '640': 'Animal Hospital/Veterinary Clinic',
            '710': 'General Office Building',
            '714': 'Corporate Headquarters Building',
            '715': 'Single Tenant Office Building',
            '720': 'Medical-Dental Office Building',
            '730': 'Government Office Building',
            '750': 'Office Park',
            '760': 'Research and Development Center',
            '770': 'Business Park',
            '812': 'Building Materials and Lumber Store',
            '813': 'Free-Standing Discount Superstore',
            '814': 'Variety Store',
            '815': 'Free-Standing Discount Store',
            '816': 'Hardware/Paint Store',
            '817': 'Nursery (Garden Center)',
            '820': 'Shopping Center',
            '821': 'Shopping Plaza (40-150k)',
            '822': 'Strip Retail Plaza (<40k)',
            '840': 'Automobile Sales (New)',
            '841': 'Automobile Sales (Used)',
            '848': 'Tire Store',
            '850': 'Supermarket',
            '851': 'Convenience Market',
            '857': 'Discount Club',
            '860': 'Wholesale Market',
            '861': 'Sporting Goods Superstore',
            '862': 'Home Improvement Superstore',
            '863': 'Electronics Superstore',
            '866': 'Pet Supply Superstore',
            '867': 'Office Supply Superstore',
            '875': 'Department Store',
            '879': 'Arts and Crafts Store',
            '880': 'Pharmacy/Drugstore without Drive-Through',
            '881': 'Pharmacy/Drugstore with Drive-Through',
            '890': 'Furniture Store',
            '911': 'Walk-in Bank',
            '912': 'Drive-in Bank',
            '930': 'Fast Casual Restaurant',
            '931': 'Quality Restaurant',
            '932': 'High-Turnover (Sit-Down) Restaurant',
            '933': 'Fast Food Restaurant without Drive-Through',
            '934': 'Fast Food Restaurant with Drive-Through',
            '936': 'Coffee/Donut Shop without Drive-Through',
            '937': 'Coffee/Donut Shop with Drive-Through',
            '938': 'Coffee/Donut Shop with Drive-Through Only',
            '941': 'Quick Lubrication Vehicle Shop',
            '942': 'Automobile Care Center',
            '944': 'Gasoline/Service Station',
            '945': 'Gasoline/Service Station with Convenience Market',
            '947': 'Self-Service Car Wash',
            '948': 'Automated Car Wash',
        }

    def extract_text_from_pdf(self, pdf_path):
        """Extract all text from PDF with page markers.

        Args:
            pdf_path: Path of the PDF file, passed to ``fitz.open``.

        Returns:
            A list with one dict per page, in document order. Each dict has
            ``'page'`` (1-based page number) and ``'text'`` (the value
            returned by ``page.get_text()``).
        """
        doc = fitz.open(pdf_path)
        try:
            return [
                {'page': page_index + 1, 'text': doc[page_index].get_text()}
                for page_index in range(doc.page_count)
            ]
        finally:
            # Close the document even when a page fails to extract, so the
            # underlying file handle is not leaked.
            doc.close()

    def detect_unit(self, text):
        """Detect the unit of measurement from text."""
        text_lower = text.lower()
        if 'dwelling' in text_lower or ' du' in text_lower:
            return 'Dwelling Units'
        elif '1000' in text_lower or '1,000' in text_lower or 'ksf' in text_lower:
            if 'gla' in text_lower:
                return '1000 SF GLA'
            return '1000 SF GFA'
        elif 'employee' in text_lower:
            return 'Employees'
        elif 'room' in text_lower:
            return 'Rooms'
        elif 'student' in text_lower:
            return 'Students'
        elif 'acre' in text_lower:
            return 'Acres'
        elif 'bed' in text_lower:
            return 'Beds'
        elif 'fueling' in text_lower or 'vfp' in text_lower:
            return 'Fueling Positions'
        elif 'screen' in text_lower:
            return 'Screens'
        elif 'lane' in text_lower:
            return 'Lanes'
        elif 'hole' in text_lower:
            return 'Holes'
        elif 'seat' in text_lower:
            return 'Seats'
        elif 'service bay' in text_lower:
            return 'Service Bays'
        return '1000 SF GFA'  # Default

    def parse_equation(self, text):
        """Parse fitted curve equation from text."""
        # Try linear equation first: T = aX + b or T = aX - b
        linear_match = re.search(r'T\s*=\s*([\d.]+)\s*\(?X\)?\s*([+-])\s*([\d.]+)', text, re.IGNORECASE)
        if linear_match:
            a = float(linear_match.group(1))
            sign = 1 if linear_match.group(2) == '+' else -1
            b = float(linear_match.group(3)) * sign
            return {'type': 'linear', 'a': a, 'b': b}
        
        # Try log equation: Ln(T) = a*Ln(X) + b
        log_match = re.search(r'(?:Ln|LN|ln)\s*\(?T\)?\s*=\s*([\d.]+)\s*(?:Ln|LN|ln)\s*\(?X\)?\s*([+-])\s*([\d.]+)', text, re.IGNORECASE)
        if log_match:
            a = float(log_match.group(1))
            sign = 1 if log_match.group(2) == '+' else -1
            b = float(log_match.group(3)) * sign
            return {'type': 'log', 'a': a, 'b': b}
        
        return None

    def extract_land_use_data(self, pages_text):
        """Extract land use data from pages."""
        extracted_data = {}
        
        for page_data in pages_text:
            text = page_data['text']
            page_num = page_data['page']
            
            # Find land use codes in this page
            code_matches = re.findall(r'\b(\d{3})\b', text)
            
            for code in code_matches:
                # Skip if not a valid ITE code range
                code_int = int(code)
                if code_int < 100 or code_int > 999:
                    continue
                    
                # Check if this looks like a land use code page
                if code in self.land_use_names or any(keyword in text.lower() for keyword in ['trip generation', 'average rate', 'fitted curve']):
                    
                    if code not in extracted_data:
                        extracted_data[code] = {
                            'code': code,
                            'name': self.land_use_names.get(code, f'Land Use {code}'),
                            'pages_found': [],
                            'raw_text_samples': [],
                            'weekday': {},
                            'am_peak': {},
                            'pm_peak': {},
                        }
                    
                    extracted_data[code]['pages_found'].append(page_num)
                    
                    # Extract rates
                    rate_patterns = [
                        (r'(?:Average|Avg\.?)\s*(?:Rate|Trip)[:\s]*([\d.]+)', 'rate'),
                        (r'([\d.]+)\s*(?:trips?|vehicle)', 'rate'),
                    ]
                    
                    for pattern, field in rate_patterns:
                        match = re.search(pattern, text, re.IGNORECASE)
                        if match:
                            try:
                                rate = float(match.group(1))
                                if 0 < rate < 10000:  # Sanity check
                                    if 'weekday' in text.lower() or 'daily' in text.lower():
                                        extracted_data[code]['weekday']['rate'] = rate
                                    elif 'am' in text.lower() and 'peak' in text.lower():
                                        extracted_data[code]['am_peak']['rate'] = rate
                                    elif 'pm' in text.lower() and 'peak' in text.lower():
                                        extracted_data[code]['pm_peak']['rate'] = rate
                            except ValueError:
                                pass
                    
                    # Extract R-squared
                    r2_match = re.search(r'R[²2]\s*[=:]?\s*([\d.]+)', text)
                    if r2_match:
                        try:
                            r2 = float(r2_match.group(1))
                            if 0 <= r2 <= 1:
                                if 'weekday' in text.lower():
                                    extracted_data[code]['weekday']['r_squared'] = r2
                        except ValueError:
                            pass
                    
                    # Extract sample size
                    sample_match = re.search(r'(?:Number of|N|n)\s*(?:Studies|Sites)[:\s]*(\d+)', text, re.IGNORECASE)
                    if sample_match:
                        extracted_data[code]['weekday']['sample_size'] = int(sample_match.group(1))
                    
                    # Extract directional split
                    enter_match = re.search(r'(\d+)\s*%?\s*(?:entering|enter|in)', text, re.IGNORECASE)
                    exit_match = re.search(r'(\d+)\s*%?\s*(?:exiting|exit|out)', text, re.IGNORECASE)
                    if enter_match and exit_match:
                        entering = int(enter_match.group(1))
                        exiting = int(exit_match.group(1))
                        if entering + exiting == 100:
                            extracted_data[code]['am_peak']['entering'] = entering
                            extracted_data[code]['am_peak']['exiting'] = exiting
                            extracted_data[code]['pm_peak']['entering'] = exiting  # Typically reversed
                            extracted_data[code]['pm_peak']['exiting'] = entering
                    
                    # Extract equation
                    equation = self.parse_equation(text)
                    if equation:
                        extracted_data[code]['weekday']['equation'] = equation
                    
                    # Detect unit
                    extracted_data[code]['unit'] = self.detect_unit(text)
                    
                    # Store sample of raw text for debugging
                    if len(extracted_data[code]['raw_text_samples']) < 3:
                        extracted_data[code]['raw_text_samples'].append(text[:500])
        
        return extracted_data

    def format_for_database(self, extracted_data):
        """Format extracted data to match ite-database.js structure."""
        formatted = {}
        
        for code, data in extracted_data.items():
            # Determine category
            code_int = int(code)
            if code_int < 100:
                category = 'Port, Freight, Terminal'
            elif code_int < 200:
                category = 'Industrial'
            elif code_int < 300:
                category = 'Residential'
            elif code_int < 400:
                category = 'Lodging'
            elif code_int < 500:
                category = 'Recreational'
            elif code_int < 600:
                category = 'Institutional'
            elif code_int < 700:
                category = 'Medical'
            elif code_int < 800:
                category = 'Office'
            elif code_int < 900:
                category = 'Retail'
            else:
                category = 'Services'
            
            formatted[code] = {
                'code': code,
                'name': data.get('name', f'Land Use {code}'),
                'category': category,
                'unit': data.get('unit', '1000 SF GFA'),
                'weekday': {
                    'rate': data.get('weekday', {}).get('rate'),
                    'equation': data.get('weekday', {}).get('equation'),
                    'r_squared': data.get('weekday', {}).get('r_squared'),
                    'sample_size': data.get('weekday', {}).get('sample_size')
                },
                'am_peak': {
                    'rate': data.get('am_peak', {}).get('rate'),
                    'equation': data.get('am_peak', {}).get('equation'),
                    'r_squared': data.get('am_peak', {}).get('r_squared'),
                    'entering': data.get('am_peak', {}).get('entering'),
                    'exiting': data.get('am_peak', {}).get('exiting')
                },
                'pm_peak': {
                    'rate': data.get('pm_peak', {}).get('rate'),
                    'equation': data.get('pm_peak', {}).get('equation'),
                    'r_squared': data.get('pm_peak', {}).get('r_squared'),
                    'entering': data.get('pm_peak', {}).get('entering'),
                    'exiting': data.get('pm_peak', {}).get('exiting')
                },
                'source': 'ITE 11th Edition',
                'pages_found': data.get('pages_found', [])
            }
        
        return formatted

print("ITEDataExtractor class loaded successfully!")

In [None]:
# Cell 4: Process PDFs and Extract Data
# Runs the extractor over every uploaded PDF and folds the per-file results
# into all_extracted_data. The first file to report a code supplies its whole
# entry; later files only fill period fields that are still missing or None.

extractor = ITEDataExtractor()
all_extracted_data = {}

for pdf_file in pdf_files:
    print(f"\nProcessing: {pdf_file}")
    print("="*50)

    # Page texts for this file
    pages_text = extractor.extract_text_from_pdf(pdf_file)
    print(f"  Extracted text from {len(pages_text)} pages")

    # Candidate land use entries for this file
    extracted = extractor.extract_land_use_data(pages_text)
    print(f"  Found {len(extracted)} potential land use codes")

    for code, data in extracted.items():
        if code not in all_extracted_data:
            all_extracted_data[code] = data
            continue

        # Code already known from an earlier PDF: fill gaps only.
        merged = all_extracted_data[code]
        for period in ('weekday', 'am_peak', 'pm_peak'):
            for field, value in data.get(period, {}).items():
                if value is None:
                    continue
                if merged.get(period, {}).get(field) is not None:
                    continue
                merged.setdefault(period, {})[field] = value

print(f"\n\nTotal unique land use codes found: {len(all_extracted_data)}")
print("\nCodes found:")
for code in sorted(all_extracted_data):
    summary = all_extracted_data[code]
    name = summary.get('name', 'Unknown')
    pages = summary.get('pages_found', [])
    ellipsis = '...' if len(pages) > 5 else ''
    print(f"  {code}: {name} (pages: {pages[:5]}{ellipsis})")

In [None]:
# Cell 5: Format and Export Data
# Converts the merged extraction results to the database layout, writes them
# to a JSON file together with run metadata, and prints a short sample.

# Format for database
formatted_data = extractor.format_for_database(all_extracted_data)

# Create output structure
output = {
    'metadata': {
        'source': 'ITE Trip Generation Manual, 11th Edition',
        'extracted_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
        'total_codes': len(formatted_data),
        'files_processed': pdf_files
    },
    'land_use_codes': formatted_data
}

# Save to JSON
output_filename = 'ite_extracted_data.json'
with open(output_filename, 'w') as f:
    json.dump(output, f, indent=2)

print(f"Data saved to {output_filename}")
print(f"\nTotal codes extracted: {len(formatted_data)}")

# Show sample of extracted data
print("\n" + "="*50)
print("SAMPLE EXTRACTED DATA:")
print("="*50)

sample_codes = ['210', '710', '820', '934']  # Common codes to show
# (label, period, field) for each value shown per sample code
sample_fields = (
    ('Weekday Rate', 'weekday', 'rate'),
    ('AM Peak Rate', 'am_peak', 'rate'),
    ('PM Peak Rate', 'pm_peak', 'rate'),
    ('R²', 'weekday', 'r_squared'),
)
for code in sample_codes:
    if code not in formatted_data:
        continue
    sample = formatted_data[code]
    print(f"\n{code}: {sample['name']}")
    print(f"  Unit: {sample['unit']}")
    for label, period, field in sample_fields:
        # format_for_database always creates these keys, using None for
        # missing values, so a .get() default would never apply. Test for
        # None explicitly to show "Not found".
        value = sample[period].get(field)
        print(f"  {label}: {'Not found' if value is None else value}")

In [None]:
# Cell 6: Generate JavaScript Database Code

def generate_js_database(formatted_data):
    """Generate JavaScript code for ite-database.js

    Args:
        formatted_data: Dict keyed by land use code, in the layout produced
            by ``ITEDataExtractor.format_for_database``. Each record must
            have ``name``, ``category``, ``unit`` and the ``weekday``,
            ``am_peak`` and ``pm_peak`` dicts. ``source`` is optional and
            defaults to ``"ITE 11th Edition"``.

    Returns:
        JavaScript source text declaring ``const ITE_DATABASE = {...};`` with
        one entry per code in sorted order. Missing values are written as
        ``null``.
    """

    def literal(value):
        # json.dumps yields a valid JavaScript literal: strings are quoted
        # and escaped, None becomes null, numbers keep their repr.
        return json.dumps(value)

    def equation_part(equation):
        if not equation:
            return "equation: null"
        return (
            f'equation: {{ type: {literal(equation["type"])}, '
            f'a: {literal(equation["a"])}, b: {literal(equation["b"])} }}'
        )

    def common_parts(period):
        # rate / equation / r_squared are always emitted, as null if absent.
        return [
            f"rate: {literal(period.get('rate'))}",
            equation_part(period.get('equation')),
            f"r_squared: {literal(period.get('r_squared'))}",
        ]

    def weekday_parts(period):
        parts = common_parts(period)
        if period.get('sample_size') is not None:
            parts.append(f"sample_size: {literal(period['sample_size'])}")
        return parts

    def peak_parts(period):
        parts = common_parts(period)
        # The split is emitted as a pair, keyed on "entering" being present.
        if period.get('entering') is not None:
            parts.append(f"entering: {literal(period['entering'])}")
            parts.append(f"exiting: {literal(period.get('exiting'))}")
        return parts

    generated_on = pd.Timestamp.now().strftime('%Y-%m-%d')
    chunks = [
        "/**\n"
        " * ITE Trip Generation Database\n"
        " * Extracted from ITE Trip Generation Manual, 11th Edition\n"
        " * Auto-generated on " + generated_on + "\n"
        " */\n"
        "\n"
        "const ITE_DATABASE = {\n"
    ]

    for code in sorted(formatted_data.keys()):
        data = formatted_data[code]
        chunks.append(
            f"  {literal(code)}: {{\n"
            f"    code: {literal(code)},\n"
            f"    name: {literal(data['name'])},\n"
            f"    category: {literal(data['category'])},\n"
            f"    unit: {literal(data['unit'])},\n"
            f"    weekday: {{\n"
            f"      {', '.join(weekday_parts(data['weekday']))}\n"
            f"    }},\n"
            f"    am_peak: {{\n"
            f"      {', '.join(peak_parts(data['am_peak']))}\n"
            f"    }},\n"
            f"    pm_peak: {{\n"
            f"      {', '.join(peak_parts(data['pm_peak']))}\n"
            f"    }},\n"
            f"    source: {literal(data.get('source', 'ITE 11th Edition'))}\n"
            f"  }},\n"
        )

    chunks.append("};")
    # Join once instead of growing a string with += for every entry.
    return "".join(chunks)

# Build the JavaScript source from the formatted records
js_code = generate_js_database(formatted_data)

# Write it next to the JSON export
js_filename = 'ite_database_extracted.js'
with open(js_filename, 'w') as f:
    f.write(js_code)

print(f"JavaScript database code saved to {js_filename}")
# Preview: heading, separator rule, then the first 2000 characters
print(
    f"\nFirst 2000 characters of generated code:",
    "="*50,
    js_code[:2000],
    sep="\n",
)

In [None]:
# Cell 7: Download Generated Files
# Hands both generated files to the Colab download helper, confirming each
# one after its download call, then prints the follow-up checklist.

print("Downloading generated files...\n")

# (file to download, confirmation line printed after the download call)
generated_files = (
    ('ite_extracted_data.json',
     "1. ite_extracted_data.json - Complete extracted data in JSON format"),
    ('ite_database_extracted.js',
     "2. ite_database_extracted.js - Ready-to-use JavaScript database code"),
)
for generated_file, confirmation in generated_files:
    files.download(generated_file)
    print(confirmation)

print("\n".join((
    "\n" + "="*50,
    "NEXT STEPS:",
    "="*50,
    "1. Review the extracted data for accuracy",
    "2. Compare with official ITE values if available",
    "3. Merge with existing ite-database.js in your project",
    "4. Update the 'source' field if using 12th Edition rates",
)))

## Troubleshooting

If the extraction doesn't capture all data correctly, you can:

1. **View raw extracted text** to understand the PDF structure (after Cell 4 has run, `pages_text` holds the pages of the last PDF processed):
```python
# Show raw text from specific pages
for page in pages_text[0:10]:  # First 10 pages
    print(f"Page {page['page']}:")
    print(page['text'][:1000])
    print("\n" + "="*50 + "\n")
```

2. **Search for specific land use codes**:
```python
target_code = '210'
for page in pages_text:
    if target_code in page['text']:
        print(f"Found {target_code} on page {page['page']}")
        print(page['text'][:2000])
```

3. **Manually add missing data** to the JSON file before downloading

In [None]:
# Cell 8: Debug - View Raw Text for Specific Pages (Optional)

# Uncomment and modify to debug specific pages

# target_code = '210'  # Change this to the code you want to find
# 
# print(f"Searching for Land Use Code {target_code}...")
# print("="*50)
# 
# for pdf_file in pdf_files:
#     pages = extractor.extract_text_from_pdf(pdf_file)
#     for page in pages:
#         if target_code in page['text']:
#             print(f"\nFound in {pdf_file}, Page {page['page']}:")
#             print("-"*50)
#             print(page['text'][:3000])
#             print("\n")