<a href="https://colab.research.google.com/github/derek881107/Real-Time-Disaster-Detection-System/blob/main/google_news_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install "openai>=1.0.0" pygooglenews beautifulsoup4 pandas plotly requests lxml openpyxl numpy

In [2]:
# COMPLETE ENHANCED DISASTER NEWS ANALYSIS SYSTEM v3.3
# Integration of PyGoogleNews API with Localized Search and GPT Language Detection
# Enhanced Disaster Analysis System with PyGoogleNews and Client Country Localization
# ==========================================

import os
import sys
import time
import json
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Tuple
from bs4 import BeautifulSoup
import html
import re
from dataclasses import dataclass, field
import traceback
import subprocess
import logging
import urllib.parse
import email.utils
import calendar

# Install required packages
def install_packages(packages: Optional[List[str]] = None) -> None:
    """Install the packages required by the disaster analysis system via pip.

    Args:
        packages: Optional list of pip requirement specifiers. When omitted,
            the system's default dependency set is installed.

    Each package is installed with its own pip invocation so that one failure
    does not prevent the remaining packages from being attempted. Failures are
    reported on stdout and otherwise ignored (best effort).
    """
    if packages is None:
        packages = [
            "openai>=1.0.0", "pygooglenews", "beautifulsoup4", "pandas",
            "plotly", "requests", "lxml", "openpyxl", "numpy"
        ]

    for package in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])
        except (subprocess.CalledProcessError, OSError):
            # Only pip / process-launch failures are treated as best effort;
            # KeyboardInterrupt and SystemExit must still propagate.
            print(f"⚠️ {package} installation failed - using fallback")
        else:
            print(f"✅ {package} installed successfully")

# Handle OpenAI import
# Prefer the v1.0+ client class; when it cannot be imported, fall back to the
# module-level legacy API. Note that the bare name `openai` is only bound in
# the fallback branch - the legacy code path of CountryLanguageAnalyzer
# (`openai.ChatCompletion.create`) relies on that binding.
try:
    from openai import OpenAI
    OPENAI_V1 = True
    print("✅ Using OpenAI v1.0+ API")
except ImportError:
    import openai
    OPENAI_V1 = False
    print("✅ Using OpenAI legacy API")

# Handle PyGoogleNews import
# PyGoogleNews has no fallback: if the import fails, sys.exit(1) raises
# SystemExit while this module/cell is being executed.
try:
    from pygooglenews import GoogleNews
    print("✅ Using PyGoogleNews API")
except ImportError:
    print("❌ PyGoogleNews not available. Please install: pip install pygooglenews")
    sys.exit(1)

# Setup logging
# Root logger at INFO with "timestamp - level - message" records; `logger` is
# the module-level logger used by the classes below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ==========================================
# COUNTRY-LANGUAGE MAPPING AND GPT LOCALIZATION
# ==========================================

# Fallback locale table keyed by two-letter country code. Every entry has the
# shape {'lang': <language code>, 'country': <the same country code>}; it is
# consulted before (and instead of) asking GPT when the client country is
# already one of these codes.
DEFAULT_COUNTRY_LANG_MAP = {
    country_code: {'lang': language_code, 'country': country_code}
    for country_code, language_code in (
        ('US', 'en'), ('MX', 'es'), ('CN', 'zh'), ('HK', 'zh'),
        ('SG', 'en'), ('IN', 'en'), ('MY', 'en'), ('NL', 'nl'),
        ('RO', 'ro'), ('HU', 'hu'), ('JP', 'ja'), ('KR', 'ko'),
        ('TW', 'zh'), ('TH', 'th'), ('VN', 'vi'), ('PH', 'en'),
        ('AU', 'en'), ('CA', 'en'), ('GB', 'en'), ('DE', 'de'),
        ('FR', 'fr'), ('IT', 'it'), ('ES', 'es'), ('BR', 'pt'),
        ('AR', 'es'), ('CL', 'es'), ('CO', 'es'), ('PE', 'es'),
    )
}

class CountryLanguageAnalyzer:
    """GPT-powered country language analyzer for localized disaster news search.

    The analyzer resolves a client's country to a Google News language/country
    pair and translates English disaster terms into the local language. Known
    two-letter country codes are answered from DEFAULT_COUNTRY_LANG_MAP; other
    inputs are sent to the chat model. Every failure degrades to an English/US
    default (or to the untranslated English terms) instead of raising.
    """

    # Locale used when nothing better can be determined.
    _FALLBACK_LOCALE = {'lang': 'en', 'country': 'US'}
    _CHAT_MODEL = "gpt-3.5-turbo"
    # Deletion table for JSON repair: BOM, zero-width space, zero-width
    # non-joiner, zero-width joiner and U+FFFE.
    _INVISIBLE_CHARS = dict.fromkeys(map(ord, '\ufeff\u200b\u200c\u200d\ufffe'))

    def __init__(self, openai_client, use_legacy_api=False):
        """Store the client and API flavour.

        Args:
            openai_client: Client object exposing ``chat.completions.create``;
                only used when ``use_legacy_api`` is false.
            use_legacy_api: When true, calls go through the module-level
                ``openai.ChatCompletion.create`` instead of ``openai_client``.
        """
        self.openai_client = openai_client
        self.use_legacy_api = use_legacy_api
        self.country_lang_cache = {}  # Cache for GPT responses

    def analyze_client_country_language(self, client_country: str, client_address: str = "") -> Dict[str, str]:
        """Use GPT to analyze client country and determine appropriate language and country codes.

        Args:
            client_country: Country name or two-letter code. Blank, 'N/A',
                'Unknown' and non-string values (e.g. NaN from a spreadsheet)
                yield the English/US default.
            client_address: Optional free-text address added to the prompt.

        Returns:
            A fresh dict ``{'lang': ..., 'country': ...}``. Results are cached
            per (country, address) pair, including fallback results.
        """
        # Spreadsheet-derived values may be NaN/None rather than strings; calling
        # .strip() on those used to raise outside the try block below.
        if not isinstance(client_country, str) or client_country.strip() in ['', 'N/A', 'Unknown']:
            return dict(self._FALLBACK_LOCALE)  # Default fallback
        if not isinstance(client_address, str):
            client_address = ""

        # Check cache first
        cache_key = f"{client_country}_{client_address}".lower().strip()
        if cache_key in self.country_lang_cache:
            return dict(self.country_lang_cache[cache_key])

        # Check if it's a known country code
        client_upper = client_country.upper().strip()
        if client_upper in DEFAULT_COUNTRY_LANG_MAP:
            # Copy so callers cannot mutate the shared module-level table.
            result = dict(DEFAULT_COUNTRY_LANG_MAP[client_upper])
            self.country_lang_cache[cache_key] = result
            return dict(result)

        try:
            prompt = f"""
You are a geographical and linguistic expert. Analyze the client location information and determine the appropriate language and country codes for Google News search.

Client Information:
- Client Country: {client_country}
- Client Address: {client_address}

Instructions:
1. Identify the specific country from the client information
2. Determine the primary language used for news and media in that country
3. Provide the appropriate Google News language and country codes

Return ONLY a JSON object with this exact format:
{{
    "lang": "language_code",
    "country": "country_code",
    "analysis": "Brief explanation of the determination",
    "primary_language_name": "Language name"
}}

Language Codes (ISO 639-1): en, es, zh, ja, ko, fr, de, it, pt, ru, ar, hi, th, vi, nl, sv, da, no, fi, pl, tr, etc.
Country Codes (ISO 3166-1): US, MX, CN, JP, KR, TW, HK, SG, IN, MY, TH, VN, PH, AU, CA, GB, DE, FR, IT, ES, BR, AR, etc.

Examples:
- Mexico → {{"lang": "es", "country": "MX"}}
- Taiwan → {{"lang": "zh", "country": "TW"}}
- Thailand → {{"lang": "th", "country": "TH"}}
- Netherlands → {{"lang": "nl", "country": "NL"}}
- Singapore → {{"lang": "en", "country": "SG"}}

Focus on the PRIMARY language used for news media in that country.
"""

            messages = [
                {"role": "system", "content": "You are a geographical and linguistic expert. Always respond with valid JSON only."},
                {"role": "user", "content": prompt}
            ]

            # Make GPT API call
            result = self._chat_completion(messages, max_tokens=300, temperature=0.1)

            # Clean and parse JSON
            result = self._clean_json_response(result)
            lang_data = json.loads(result)

            # `or` also covers explicit nulls / empty strings in the reply.
            lang_config = {
                'lang': str(lang_data.get('lang') or 'en').lower(),
                'country': str(lang_data.get('country') or 'US').upper()
            }

            print(f"   🌍 GPT Language Analysis: {client_country} → {lang_config['lang']}-{lang_config['country']} ({lang_data.get('primary_language_name', 'Unknown')})")

            # Cache the result
            self.country_lang_cache[cache_key] = lang_config
            return dict(lang_config)

        except Exception as e:
            # Deliberately broad: network errors, API errors and malformed
            # replies must all degrade to the fallback locale.
            print(f"   ⚠️ GPT language analysis failed for {client_country}: {e}")

            # Fallback to default mapping or English
            fallback = dict(DEFAULT_COUNTRY_LANG_MAP.get(client_upper, self._FALLBACK_LOCALE))
            self.country_lang_cache[cache_key] = fallback
            return dict(fallback)

    def localize_disaster_terms(self, english_terms: List[str], target_lang: str, target_country: str) -> List[str]:
        """Use GPT to translate disaster terms to local language and terminology.

        Args:
            english_terms: Terms to translate.
            target_lang: Target language code; ``'en'`` short-circuits.
            target_country: Country whose media terminology should be used.

        Returns:
            The localized terms, or ``english_terms`` unchanged when the target
            language is English, the input is empty, the call fails, or the
            model returns no usable terms.
        """
        if target_lang == 'en' or not english_terms:
            return english_terms

        try:
            terms_text = ', '.join(english_terms)

            prompt = f"""
You are a disaster terminology localization expert. Translate these English disaster terms to the local language and terminology commonly used in {target_country}.

English Terms: {terms_text}
Target Language: {target_lang}
Target Country: {target_country}

Instructions:
1. Translate each term to the local language commonly used in {target_country}
2. Use the specific disaster terminology that local news media and authorities use
3. Consider regional variations and preferred terms
4. Maintain the disaster context and severity implications

Examples of localization:
- "tropical cyclone" in Mexico → "huracán"
- "tropical cyclone" in Taiwan → "颱風" (typhoon)
- "earthquake" in Japan → "地震"
- "flood" in Germany → "Hochwasser"
- "wildfire" in Spain → "incendio forestal"

Return ONLY a JSON object:
{{
    "localized_terms": ["term1", "term2", "term3"],
    "translations": {{
        "english_term1": "local_term1",
        "english_term2": "local_term2"
    }}
}}

Focus on terms that would be used in news headlines and official communications in {target_country}.
"""

            messages = [
                {"role": "system", "content": "You are a disaster terminology localization expert. Always respond with valid JSON only."},
                {"role": "user", "content": prompt}
            ]

            result = self._chat_completion(messages, max_tokens=400, temperature=0.2)

            result = self._clean_json_response(result)
            localization_data = json.loads(result)

            # Keep only non-empty string terms; an empty or malformed list
            # would otherwise leave the caller with nothing to search for.
            raw_terms = localization_data.get('localized_terms')
            localized_terms = [
                term.strip() for term in raw_terms
                if isinstance(term, str) and term.strip()
            ] if isinstance(raw_terms, list) else []
            if not localized_terms:
                raise ValueError("no localized terms returned")

            translations = localization_data.get('translations', {})

            print(f"   🌐 Localized terms for {target_country}: {', '.join(localized_terms[:3])}...")
            # Translations are informational only; never let them discard
            # otherwise valid localized terms.
            if isinstance(translations, dict) and translations:
                print(f"   📝 Key translations: {dict(list(translations.items())[:2])}")

            return localized_terms

        except Exception as e:
            # Deliberately broad: any API or parsing failure keeps the
            # English terms so the search can still run.
            print(f"   ⚠️ Term localization failed: {e}, using English terms")
            return english_terms

    def _chat_completion(self, messages, max_tokens: int, temperature: float) -> str:
        """Send *messages* to the chat model and return the stripped reply text."""
        if self.use_legacy_api:
            response = openai.ChatCompletion.create(
                model=self._CHAT_MODEL,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature
            )
        else:
            response = self.openai_client.chat.completions.create(
                model=self._CHAT_MODEL,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature
            )
        return response.choices[0].message.content.strip()

    @staticmethod
    def _is_valid_json(text: str) -> bool:
        """Return True when *text* parses as JSON."""
        try:
            json.loads(text)
        except (ValueError, RecursionError):
            return False
        return True

    def _clean_json_response(self, response: str) -> str:
        """Clean GPT response to extract valid JSON with robust error handling.

        The outermost ``{...}`` span is tried unmodified first, so a reply that
        is already valid JSON is never altered (the repair regexes below would
        otherwise rewrite commas and zero-width characters inside string
        values). Only when that fails are the textual repairs applied.

        Returns:
            The extracted/repaired JSON text. If no repair produces valid
            JSON, the best-effort repaired text is returned and the caller's
            ``json.loads`` is expected to raise.
        """
        text = response.strip()

        # 1) Fast path: the outermost brace span, untouched. This also drops
        #    markdown fences and chatter around the object.
        start, end = text.find('{'), text.rfind('}')
        candidate = text[start:end + 1] if 0 <= start < end else text
        if self._is_valid_json(candidate):
            return candidate

        # 2) Repair pass for common model mistakes.
        # Remove markdown code blocks
        repaired = re.sub(r'```json\s*', '', candidate, flags=re.IGNORECASE)
        repaired = re.sub(r'```\s*', '', repaired)
        # Remove invisible characters
        repaired = repaired.translate(self._INVISIBLE_CHARS)
        # Collapse runs of commas first, so that ",, }" becomes ", }" ...
        repaired = re.sub(r',(?:\s*,)+', ',', repaired)
        # ... and then drop any comma left directly before } or ]
        repaired = re.sub(r',(\s*[}\]])', r'\1', repaired)
        repaired = repaired.strip()
        if self._is_valid_json(repaired):
            return repaired

        # 3) Last resort: salvage only a "client_risk_scores" object.
        #    NOTE(review): neither prompt in this class requests that key;
        #    presumably this cleaner is shared with risk-scoring prompts
        #    elsewhere in the file - confirm before removing.
        scores_match = re.search(r'"client_risk_scores":\s*\{[^}]+\}', repaired, re.DOTALL)
        if scores_match:
            minimal_json = '{ ' + scores_match.group(0) + ' }'
            if self._is_valid_json(minimal_json):
                return minimal_json

        return repaired

# ==========================================
# DATE PARSING UTILITIES (UNCHANGED)
# ==========================================

def parse_date_input(date_str: str) -> Optional[datetime]:
    """Parse user date input with multiple formats including ISO datetime.

    Args:
        date_str: A date string, or any value whose ``str()`` is one. Values
            that are already ``datetime`` instances (including subclasses such
            as pandas Timestamps) are accepted directly.

    Returns:
        A naive ``datetime``, or ``None`` when the input is empty, NaN/NaT or
        matches none of the supported formats.

    Formats are tried in order, so an ambiguous value such as ``03/04/2024``
    is read day-first (``%d/%m/%Y``) before month-first.
    """
    if not date_str or pd.isna(date_str):
        return None

    if isinstance(date_str, datetime):
        # Rebuild as a plain naive datetime. Going through str() used to fail
        # for values carrying microseconds or a UTC offset, which made callers
        # silently fall back to their default range.
        return datetime(
            date_str.year, date_str.month, date_str.day,
            date_str.hour, date_str.minute, date_str.second,
            date_str.microsecond,
        )

    date_str = str(date_str).strip()

    date_formats = (
        '%Y-%m-%dT%H:%M:%S',      # ISO format from Excel (PRIORITY)
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d-%m-%Y',
        '%d/%m/%Y',
        '%m-%d-%Y',
        '%m/%d/%Y',
        '%Y-%m-%d %H:%M:%S',
        '%Y/%m/%d %H:%M:%S',
        # ISO variants with fractional seconds and/or a literal UTC "Z"
        # suffix. Listed last so previously supported inputs parse as before.
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%d %H:%M:%S.%f',
    )

    for fmt in date_formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue

    return None

def _default_date_range() -> Tuple[str, str]:
    """Return (from, to) as 'YYYY-MM-DD' strings covering the past 7 days."""
    end_date = datetime.now()
    start_date = end_date - timedelta(days=7)
    return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')


def _parse_excel_date(raw_value, label: str) -> Optional[str]:
    """Parse one raw date value into 'YYYY-MM-DD', logging the outcome under *label*."""
    if not raw_value:
        print(f"   ⚠️ No {label} found in Excel data")
        return None

    parsed = parse_date_input(raw_value)
    if not parsed:
        print(f"   ❌ Failed to parse {label}: '{raw_value}'")
        return None

    formatted = parsed.strftime('%Y-%m-%d')
    print(f"   ✅ Successfully parsed {label}: {raw_value} -> {formatted}")
    return formatted


def extract_date_range_from_excel_row(gdacs_event) -> Tuple[Optional[str], Optional[str]]:
    """Extract and format date ranges from Excel row data.

    Args:
        gdacs_event: Object whose ``from_date`` / ``to_date`` attributes hold
            the raw date values. Either attribute may be missing or empty.

    Returns:
        ``(from_date, to_date)`` as 'YYYY-MM-DD' strings. Unless BOTH dates are
        present and parseable, the whole range is replaced by the default
        window (the past 7 days, ending now).
    """
    try:
        # getattr with a default, so a row object lacking either attribute
        # takes the "not found" path instead of raising on first access.
        raw_from = getattr(gdacs_event, 'from_date', None)
        raw_to = getattr(gdacs_event, 'to_date', None)

        print(f"   📋 Raw Excel dates - From_Date: '{raw_from}', To_Date: '{raw_to}'")

        from_date = _parse_excel_date(raw_from, 'From_Date')
        to_date = _parse_excel_date(raw_to, 'To_Date')

        # If no dates are provided, use the default range (the past 7 days).
        if not from_date or not to_date:
            from_date, to_date = _default_date_range()
            print(f"   📅 Using default date range (parsing failed): {from_date} to {to_date}")
        else:
            print(f"   📅 Using Excel date range: {from_date} to {to_date}")

        return from_date, to_date

    except Exception as e:
        # Best effort: date handling must never abort processing of a row.
        print(f"   ⚠️ Date extraction error: {e}, using default range")
        return _default_date_range()

# ==========================================
# DATA CLASSES (UNCHANGED)
# ==========================================

@dataclass
class GDACSEvent:
    """GDACS event information extracted from Excel data

    Every field is required (none has a default), so the generated
    constructor takes them positionally in the order declared below.
    """
    # Event identification and description
    event_id: str
    episode_id: str
    event_type: str
    event_name: str
    event_full_name: str
    event_description: str
    # Raw date values, kept as strings. extract_date_range_from_excel_row()
    # reads `from_date` / `to_date` off its argument (presumably an instance
    # of this class) and parses them with parse_date_input().
    from_date: str
    to_date: str
    disaster_country: str
    # Alert level/score for the event and for the episode
    alert_level: str
    alert_score: float
    episode_alert_level: str
    episode_alert_score: float
    severity_value: float
    severity_text: str
    # Client the event was matched against
    client_country: str
    client_address: str
    distance_km: float

@dataclass
class ClientInfo:
    """Client information from Detailed_Matches

    All five fields are required. Their names correspond to the columns that
    OptimizedExcelProcessor.load_client_info() requires (Event_ID,
    Client_Country, Severity_Text, Client_Distance_Summary, Client_Address);
    presumably instances are built from those columns - verify against the
    loader.
    """
    event_id: str
    client_country: str
    severity_text: str
    client_distance_summary: str
    client_address: str

@dataclass
class DisasterAnalysis:
    """Enhanced disaster analysis results with GDACS-specific understanding

    Every field has a default, so ``DisasterAnalysis()`` yields an "empty"
    result: scores 0.0, flags False, labels "" and empty containers. List and
    dict fields use ``default_factory`` so instances never share a container.
    """
    # Keyword lists (names suggest they come from the GPT analysis)
    gpt_disaster_keywords: List[str] = field(default_factory=list)
    gpt_response_keywords: List[str] = field(default_factory=list)
    gpt_timeliness_keywords: List[str] = field(default_factory=list)
    # Numeric scores; value ranges are not defined in this part of the file
    disaster_severity_score: float = 0.0
    response_capability_score: float = 0.0
    response_timeliness_score: float = 0.0
    disaster_relevance_score: float = 0.0
    gdacs_alignment_score: float = 0.0
    media_coverage_score: float = 0.0
    is_disaster_related: bool = False
    # Text labels that parallel the scores above
    severity_level: str = ""
    response_level: str = ""
    timeliness_level: str = ""
    relevance_level: str = ""
    gdacs_alignment_level: str = ""
    media_coverage_level: str = ""
    primary_disaster_type: str = ""
    countries_mentioned: List[str] = field(default_factory=list)
    detailed_locations: List[str] = field(default_factory=list)
    gpt_analysis_summary: str = ""
    gdacs_context_analysis: str = ""
    client_country_match: bool = False
    final_risk_score: float = 0.0
    confidence_score: float = 0.0
    # Client risk scoring fields
    client_risk_scores: Dict[str, float] = field(default_factory=dict)
    matched_client_addresses: List[str] = field(default_factory=list)
    matched_location: str = ""

@dataclass
class MediaItem:
    """Media item data structure

    ``title``, ``url``, ``date_text`` and ``parsed_date`` are required (in
    that order); the remaining fields are optional and default to empty/None.
    """
    title: str
    url: str
    # Date as text alongside its parsed form; `parsed_date` may be None
    date_text: str
    parsed_date: Optional[datetime]
    summary: str = ""
    # Optional links to the analysis result and the related GDACS event
    disaster_analysis: Optional[DisasterAnalysis] = None
    gdacs_event: Optional[GDACSEvent] = None

# ==========================================
# DISASTER TYPE AND GDACS KNOWLEDGE BASE (UNCHANGED)
# ==========================================

# Two-letter disaster type code -> English disaster word.
DISASTER_TYPE_MAPPING = dict(
    EQ='earthquake',
    FL='flood',
    TC='hurricane',
    ST='storm',
    DR='drought',
    WF='wildfire',
    VO='volcano',
    LS='landslide',
    TS='tsunami',
    TO='tornado',
    AV='avalanche',
    CY='cyclone',
    BL='blizzard',
    HW='heatwave',
    HU='hurricane',
    TY='typhoon',
    TH='thunderstorm',
)

# Per-disaster-type reference data describing how GDACS alert scoring works.
# Keys are two-letter disaster type codes; every entry carries the same nine
# fields (name, models, key_inputs, scoring_method, alert_thresholds,
# key_factors, scoring_logic, impact_indicators, assessment_focus).
GDACS_SCORING_KNOWLEDGE = {
    'EQ': {
        'name': 'Earthquake',
        'models': ['Shakemap Model', 'EQ Parameters Model'],
        'key_inputs': ['MMI intensity', 'magnitude', 'depth', 'exposed population', 'vulnerability', 'INFORM LCC'],
        'scoring_method': 'Shakemap intensity (MMI), exposed population, vulnerability, INFORM Lack of Coping Capacity',
        'alert_thresholds': {'RED': '≥2.0', 'ORANGE': '1.0-2.0', 'GREEN': '<1.0'},
        'key_factors': ['magnitude', 'depth', 'population_exposure', 'MMI_intensity', 'vulnerability', 'coping_capacity'],
        'scoring_logic': {
            'condition': 'If max MMI in populated area ≤ VI → Alert = GREEN (Score = 0)',
            'scaled_population': '10×Pop(MMI IX) + Pop(MMI VIII) + 0.1×Pop(MMI VII)',
            'raw_score_formula': '–0.59 + 0.53 × log₁₀(Scaled Population)',
            'final_score': 'Shakemap Score × INFORM LCC',
            'special_rule': 'If Shakemap Score >2 but LCC reduces below 1, set final to 1',
        },
        'impact_indicators': ['casualties', 'building_damage', 'infrastructure_damage', 'MMI_levels', 'population_affected', 'aftershocks'],
        'assessment_focus': 'How many people were strongly shaken? MMI IX (×10), VIII (×1), VII (×0.1)',
    },
    'TS': {
        'name': 'Tsunami',
        'models': ['Tsunami Wave Height Model'],
        'key_inputs': ['earthquake magnitude (>6.5)', 'depth', 'water depth', 'modeled wave height'],
        'scoring_method': 'Maximum wave height at coast, historical tsunami scenarios',
        'alert_thresholds': {'RED': 'wave ≥3m (≥2.0)', 'ORANGE': 'wave 1-3m (1.0-2.0)', 'GREEN': 'wave <1m'},
        'key_factors': ['wave_height', 'coastal_population', 'earthquake_magnitude', 'travel_time', 'coastal_topography'],
        'scoring_logic': {
            'red_threshold': 'wave height ≥ 3m (score ≥ 2)',
            'orange_threshold': '1–3m (1 ≤ score < 2)',
            'green_threshold': '< 1m',
            'fallback': 'If no precomputed scenario, use magnitude-based IOC matrix',
        },
        'impact_indicators': ['wave_height', 'coastal_inundation', 'evacuation_areas', 'travel_time', 'coastal_damage', 'maritime_disruption'],
        'assessment_focus': 'Wave height and coastal population exposure',
    },
    'TC': {
        'name': 'Tropical Cyclone',
        'models': ['Wind Impact Model', 'Storm Surge Model'],
        'key_inputs': ['wind speed (1-min sustained)', 'Saffir-Simpson category', 'population exposure', 'vulnerability (HDI + rural)'],
        'scoring_method': 'Wind impact zones, Saffir-Simpson scale, population exposure, HDI vulnerability',
        'alert_thresholds': {'RED': 'Cat 3+ with high exposure', 'ORANGE': 'Cat 1-2 with significant exposure', 'GREEN': 'TS <10M people'},
        'key_factors': ['wind_speed', 'category', 'population_exposure', 'storm_surge', 'vulnerability', 'track'],
        'scoring_logic': {
            'green': 'TS affecting < 10M people (all vulnerability levels)',
            'orange': 'TS >10M high vuln; Cat 1-2 >100K/10% med-high vuln; Cat 3 >1M low vuln',
            'red': 'Cat 1-2 >1M high vuln; Cat 3 >100K/10% med-high vuln; Cat 4 >1M low vuln',
            'storm_surge': 'RED ≥3m; ORANGE 1-3m; GREEN ≤1m (calculated but not in overall score yet)',
        },
        'impact_indicators': ['wind_damage', 'storm_surge_height', 'flooding', 'evacuation_numbers', 'infrastructure_damage', 'landfall_intensity'],
        'assessment_focus': 'Wind impact zones and population vulnerability',
    },
    'FL': {
        'name': 'Flood',
        'models': ['Impact-based Assessment'],
        'key_inputs': ['reported deaths', 'displaced people', 'affected area', 'duration', 'official sources'],
        'scoring_method': 'Impact-based assessment, reported casualties and displacement',
        'alert_thresholds': {'RED': '>1000 deaths or >800K displaced', 'ORANGE': '>100 deaths or >80K displaced', 'GREEN': 'Other floods'},
        'key_factors': ['deaths', 'displaced_people', 'affected_area', 'duration', 'infrastructure_damage', 'economic_impact'],
        'scoring_logic': {
            'red_criteria': '>1,000 deaths OR >800,000 displaced',
            'orange_criteria': '>100 deaths OR >80,000 displaced',
            'green_criteria': 'All other floods',
            'trigger_method': 'Based on reported impacts rather than automated modeling',
        },
        'impact_indicators': ['casualties', 'displaced_population', 'flooded_area', 'infrastructure_damage', 'economic_loss', 'rescue_operations'],
        'assessment_focus': 'Reported human impact and displacement numbers',
    },
    'VO': {
        'name': 'Volcano',
        'models': ['Volcanic Ash Advisory Model'],
        'key_inputs': ['Volcanic Ash Advisories (VAAs)', 'ash plume height', 'eruption size', 'population proximity'],
        'scoring_method': 'Volcanic Ash Advisories, eruption magnitude, populated area proximity',
        'alert_thresholds': {'RED': 'Major international attention', 'ORANGE': 'Significant activity', 'GREEN': 'VAA detected activity'},
        'key_factors': ['ash_plume_height', 'eruption_size', 'population_proximity', 'aviation_impact', 'ash_distribution'],
        'scoring_logic': {
            'green': 'VAA red/orange or new activity detected (automatic)',
            'orange_red': 'Manually assigned for major volcanic events with international attention',
            'detection': 'Automatic via VAAs; manual interpretation for significant events',
        },
        'impact_indicators': ['ash_plume_height', 'aviation_disruption', 'population_evacuation', 'ash_fall', 'air_quality', 'international_attention'],
        'assessment_focus': 'Aviation impact and international significance',
    },
    'DR': {
        'name': 'Drought',
        'models': ['RDrI-Agri Model'],
        'key_inputs': ['RDrI-Agri index', 'agricultural risk', 'socio-economic factors', 'duration ≥1 month'],
        'scoring_method': 'RDrI-Agri index, agricultural risk, socio-economic factors',
        'alert_thresholds': {'RED': 'Life-threatening impacts', 'ORANGE': 'Economic impacts', 'GREEN': 'Mild/localized'},
        'key_factors': ['duration', 'affected_area', 'agricultural_impact', 'food_security', 'economic_impact', 'coping_capacity'],
        'scoring_logic': {
            'green': '0.25–0.5: Mild, localized, no impacts or high coping capacity',
            'orange': '0.75–1.75: Relevant economic/sectoral impacts, media coverage',
            'red': '2.0–3.0: Life-threatening—displacement, famine, international aid',
            'validation': 'Expert review with independent sources for impact confirmation',
        },
        'impact_indicators': ['crop_yield', 'food_security', 'livestock_impact', 'water_scarcity', 'economic_loss', 'migration'],
        'assessment_focus': 'Agricultural impact and food security implications',
    },
    'WF': {
        'name': 'Wildfire',
        'models': ['Burnt Area and Population Proximity Model'],
        'key_inputs': ['MODIS/VIIRS satellite data', 'burned area', 'population within 5km', 'casualties'],
        'scoring_method': 'Burned area, population within 5km, casualties, infrastructure damage',
        'alert_thresholds': {'RED': '2.5 (severe impact)', 'ORANGE': '1.5', 'GREEN': '0.5 (auto)'},
        'key_factors': ['burned_area', 'population_proximity', 'fatalities', 'infrastructure_damage', 'evacuation', 'coping_capacity'],
        'scoring_logic': {
            'detection': 'Automatic when burned area ≥5,000 ha; manual if smaller but near population',
            'display': 'Displayed if ≥10,000 ha burned AND ≥10,000 people within 5km',
            'red_criteria': 'Severe impact: displacements, fatalities, UCPM activation, low coping capacity',
        },
        'impact_indicators': ['burned_area', 'casualties', 'displaced_population', 'property_damage', 'air_quality', 'evacuation_scale'],
        'assessment_focus': 'Burned area scale and population proximity',
    },
}

# Meaning of each alert colour: severity label, short definition, longer
# description and the action expected in response.
ALERT_LEVEL_DEFINITIONS = {
    alert_colour: dict(zip(('severity', 'definition', 'description', 'action_required'), details))
    for alert_colour, details in (
        ('RED', (
            'High',
            'Very likely humanitarian impact',
            'Humanitarian impact is very likely, significant international assistance may be required',
            'Immediate response and international assistance needed',
        )),
        ('ORANGE', (
            'Medium',
            'Humanitarian impact possible',
            'Humanitarian impact is possible, affected country response capabilities should be monitored',
            'Monitor closely and prepare response resources',
        )),
        ('GREEN', (
            'Low',
            'No or minimal humanitarian impact',
            'No significant humanitarian impact expected, but event is being monitored',
            'Continue monitoring situation',
        )),
    )
}

# ==========================================
# EXCEL DATA PROCESSOR CLASS (UNCHANGED)
# ==========================================

class OptimizedExcelProcessor:
    """Optimized Excel processor that supports loading customer information."""

    def __init__(self, excel_file_path="enhanced_disaster_analysis_400.0km_buffer10.0km 2.xlsx"):
        self.excel_file_path = excel_file_path
        self.disaster_data = None
        self.client_info_by_event = {}
        self.client_countries_by_event = {}
        self.load_data()
        self.load_client_info()

    def load_data(self):
        """Load disaster data from the Detailed_Matches sheet"""
        try:
            if not os.path.exists(self.excel_file_path):
                print(f"❌ Excel file not found: {self.excel_file_path}")
                return

            # Load the Detailed_Matches sheet directly
            self.disaster_data = pd.read_excel(self.excel_file_path, sheet_name="Detailed_Matches")
            print(f"📊 Successfully loaded {len(self.disaster_data)} rows from Detailed_Matches sheet")
            print(f"📋 Key columns found: Event_ID, Event_Type, Event_Name, Alert_Level, Client_Country, etc.")

        except Exception as e:
            print(f"❌ Error loading Excel data: {str(e)}")
            self.disaster_data = None

    def load_client_info(self):
        """Load complete client information from Detailed_Matches sheet"""
        try:
            logger.info("📊 Loading client information from Detailed_Matches sheet")

            if self.disaster_data is None:
                logger.warning("No disaster data available for client info loading")
                return

            required_columns = ['Event_ID', 'Client_Country', 'Severity_Text', 'Client_Distance_Summary', 'Client_Address']
            missing_columns = [col for col in required_columns if col not in self.disaster_data.columns]

            if missing_columns:
                logger.warning(f"Missing required columns for client info: {missing_columns}")
                return

            # Process each row to extract client information
            for idx, row in self.disaster_data.iterrows():
                try:
                    event_id = self.safe_string_conversion(row.get('Event_ID', ''))
                    if not event_id:
                        continue

                    client_info = ClientInfo(
                        event_id=event_id,
                        client_country=self.safe_string_conversion(row.get('Client_Country', '')),
                        severity_text=self.safe_string_conversion(row.get('Severity_Text', '')),
                        client_distance_summary=self.safe_string_conversion(row.get('Client_Distance_Summary', '')),
                        client_address=self.safe_string_conversion(row.get('Client_Address', ''))
                    )

                    if event_id not in self.client_info_by_event:
                        self.client_info_by_event[event_id] = []
                        self.client_countries_by_event[event_id] = []

                    self.client_info_by_event[event_id].append(client_info)

                    # Also maintain the old client_countries_by_event for compatibility
                    if client_info.client_country not in self.client_countries_by_event[event_id]:
                        self.client_countries_by_event[event_id].append(client_info.client_country)

                except Exception as e:
                    logger.warning(f"Error processing client info for row {idx}: {e}")
                    continue

            total_events_with_clients = len(self.client_info_by_event)
            total_client_records = sum(len(clients) for clients in self.client_info_by_event.values())

            logger.info(f"✅ Loaded client information for {total_events_with_clients} events")
            logger.info(f"📋 Total client records: {total_client_records}")

        except Exception as e:
            logger.error(f"❌ Loading client information failed: {e}")
            logger.info("Continuing without client information...")

    def safe_string_conversion(self, value, default=''):
        """Safe string conversion"""
        if pd.isna(value) or value is None:
            return default
        return str(value).strip()

    def get_row_data(self, row_index: int) -> Optional[GDACSEvent]:
        """Extract GDACS event data from a specific row"""
        if self.disaster_data is None or row_index >= len(self.disaster_data):
            return None

        try:
            row = self.disaster_data.iloc[row_index]

            # Extract and clean data with proper handling of NaN values
            def safe_str(value, default=''):
                if pd.isna(value) or value is None:
                    return default
                return str(value).strip()

            def safe_float(value, default=0.0):
                if pd.isna(value) or value is None:
                    return default
                try:
                    return float(value)
                except (ValueError, TypeError):
                    return default

            # Map event type to readable format
            event_type_raw = safe_str(row.get('Event_Type', ''))
            event_type = DISASTER_TYPE_MAPPING.get(event_type_raw.upper(), event_type_raw.lower())

            # Normalize alert level
            alert_level = safe_str(row.get('Alert_Level', '')).upper()
            if alert_level not in ['RED', 'ORANGE', 'GREEN']:
                if alert_level.lower() in ['high', 'severe', '3']:
                    alert_level = 'RED'
                elif alert_level.lower() in ['medium', 'moderate', '2']:
                    alert_level = 'ORANGE'
                else:
                    alert_level = 'GREEN'

            return GDACSEvent(
                event_id=safe_str(row.get('Event_ID', f"event_{row_index}")),
                episode_id=safe_str(row.get('Episode_ID', '')),
                event_type=event_type,
                event_name=safe_str(row.get('Event_Name', f'Event {row_index + 1}')),
                event_full_name=safe_str(row.get('Event_Full_Name', '')),
                event_description=safe_str(row.get('Event_Description', ''))[:500],
                from_date=safe_str(row.get('From_Date', '')),
                to_date=safe_str(row.get('To_Date', '')),
                disaster_country=safe_str(row.get('Disaster_Country', '')),
                alert_level=alert_level,
                alert_score=safe_float(row.get('Alert_Score', 1.0)),
                episode_alert_level=safe_str(row.get('Episode_Alert_Level', alert_level)),
                episode_alert_score=safe_float(row.get('Episode_Alert_Score', 1.0)),
                severity_value=safe_float(row.get('Severity_Value', 0.0)),
                severity_text=safe_str(row.get('Severity_Text', '')),
                client_country=safe_str(row.get('Client_Country', '')),
                client_address=safe_str(row.get('Client_Address', '')),
                distance_km=safe_float(row.get('Distance_KM', 0.0))
            )

        except Exception as e:
            print(f"❌ Error processing row {row_index}: {e}")
            return None

    def get_total_rows(self):
        """Get total number of rows available for processing"""
        return len(self.disaster_data) if self.disaster_data is not None else 0

    def get_client_countries_for_event(self, event_id: str) -> List[str]:
        """Get client countries for a specific Event_ID"""
        return self.client_countries_by_event.get(event_id, [])

    def get_client_info_for_event(self, event_id: str) -> List[ClientInfo]:
        """Get client info for a specific Event_ID"""
        return self.client_info_by_event.get(event_id, [])

# ==========================================
# PYGOOGLENEWS CLIENT WITH LOCALIZATION (NEW)
# ==========================================

class LocalizedPyGoogleNewsClient:
    """Enhanced PyGoogleNews client with localization and date filtering"""

    def __init__(self, lang='en', country='US'):
        """Create a GoogleNews client for the requested language and country.

        Args:
            lang: Language code for the feed; stored lower-cased.
            country: Country code for the feed; stored upper-cased.

        If the requested locale cannot be initialised, the client falls back
        to ``en``/``US`` and ``self.lang``/``self.country`` are reset to match,
        so the ``lang_country`` value reported by ``search`` describes the
        client that is actually in use.
        """
        self.lang = lang.lower()
        self.country = country.upper()
        try:
            self.gn = GoogleNews(lang=self.lang, country=self.country)
            print(f"✅ PyGoogleNews initialized with lang={self.lang}, country={self.country}")
        except Exception as e:
            print(f"❌ PyGoogleNews initialization failed: {e}")
            # Fallback to default, keeping the recorded locale in sync with it
            self.lang, self.country = 'en', 'US'
            self.gn = GoogleNews(lang=self.lang, country=self.country)
            print("✅ PyGoogleNews fallback to lang=en, country=US")

    def reinitialize_for_country(self, lang: str, country: str):
        """Reinitialize GoogleNews with new language and country settings"""
        try:
            self.lang = lang.lower()
            self.country = country.upper()
            self.gn = GoogleNews(lang=self.lang, country=self.country)
            print(f"🌍 PyGoogleNews reinitialized for lang={self.lang}, country={self.country}")
            return True
        except Exception as e:
            print(f"❌ PyGoogleNews reinitialization failed: {e}")
            return False

    def _parse_pubdate(self, entry) -> datetime:
        """Convert the entry's published field to datetime"""
        try:
            if hasattr(entry, 'published') and entry.published:
                # Try parsing various formats
                pub_date = entry.published

                # Handle different date formats from PyGoogleNews
                for fmt in ['%a, %d %b %Y %H:%M:%S %Z',
                           '%Y-%m-%dT%H:%M:%SZ',
                           '%Y-%m-%d %H:%M:%S',
                           '%d %b %Y %H:%M:%S']:
                    try:
                        return datetime.strptime(pub_date, fmt)
                    except ValueError:
                        continue

                # Fallback parsing
                from email.utils import parsedate_to_datetime
                return parsedate_to_datetime(pub_date).replace(tzinfo=None)

            return None
        except Exception:
            return None

    def _within_range(self, dt: datetime, start_dt: datetime, end_dt: datetime) -> bool:
        """Check if datetime is within range"""
        if dt is None:
            return False
        return start_dt <= dt <= end_dt

    def _to_dt(self, s, default=None):
        """Convert date string to datetime"""
        if not s:
            return default
        for fmt in ('%Y-%m-%d', '%Y/%m/%d'):
            try:
                return datetime.strptime(s, fmt)
            except ValueError:
                continue
        for fmt in ('%Y-%m-%d %H:%M:%S', '%Y/%m/%d %H:%M:%S'):
            try:
                return datetime.strptime(s, fmt)
            except ValueError:
                continue
        return default

    def search(self, query: str, max_results=20, from_=None, to_=None):
        """Search with PyGoogleNews and apply date filtering"""
        try:
            if not query.strip():
                return {'entries': [], 'status': 'error', 'message': 'Empty query'}

            # Prepare date bounds
            now = datetime.now()
            start_dt = self._to_dt(from_, datetime(1900, 1, 1))
            end_dt = self._to_dt(to_, now)
            if end_dt > now:
                end_dt = now
            if start_dt > end_dt:
                start_dt, end_dt = end_dt, end_dt

            print(f"      Date filtering: {start_dt.strftime('%Y-%m-%d')} to {end_dt.strftime('%Y-%m-%d')}")

            # Perform search with PyGoogleNews
            search_results = self.gn.search(query)

            processed_entries = []
            total_entries = 0
            filtered_out = 0

            for entry in search_results['entries']:
                total_entries += 1

                # Parse publication date and apply local filtering
                pub_dt = self._parse_pubdate(entry)
                if not self._within_range(pub_dt, start_dt, end_dt):
                    filtered_out += 1
                    continue

                # Process entry content
                title = self._clean_html(getattr(entry, 'title', '') or '')
                summary = self._clean_html(getattr(entry, 'summary', '') or '')
                link = getattr(entry, 'link', '') or ''
                published = getattr(entry, 'published', '') or ''

                source = 'Unknown'
                if hasattr(entry, 'source') and entry.source:
                    source = getattr(entry.source, 'title', 'Unknown')

                if not title or not link or len(title.strip()) < 5:
                    continue

                processed_entries.append({
                    'title': title[:300],
                    'summary': summary[:500],
                    'link': link,
                    'published': published,
                    'parsed_date': pub_dt.strftime('%Y-%m-%d %H:%M:%S') if pub_dt else '',
                    'source': source[:100]
                })

                if len(processed_entries) >= max_results:
                    break

            print(f"      PyGoogleNews entries: {total_entries}, Filtered out: {filtered_out}, Selected: {len(processed_entries)}")

            return {
                'entries': processed_entries,
                'status': 'ok',
                'total_results': len(processed_entries),
                'query': query,
                'search_query': query,
                'date_filter_stats': {
                    'total_entries': total_entries,
                    'filtered_out': filtered_out,
                    'selected': len(processed_entries)
                },
                'debug_query': query,
                'lang_country': f"{self.lang}-{self.country}"
            }

        except Exception as e:
            return {'entries': [], 'status': 'error', 'message': str(e)}

    def _clean_html(self, text: str) -> str:
        """Clean HTML tags and normalize text"""
        if not text or pd.isna(text):
            return ""

        try:
            text = html.unescape(str(text))
            soup = BeautifulSoup(text, 'html.parser')
            clean_text = soup.get_text()
            clean_text = re.sub(r'\s+', ' ', clean_text).strip()
            return clean_text
        except Exception:
            return str(text).strip() if text else ""

# ==========================================
# GPT URL CONTENT ANALYZER
# ==========================================

class GPTURLAnalyzer:
    """GPT-based URL content analyzer to replace BeautifulSoup web scraping"""

    def __init__(self, openai_client, use_legacy_api=False):
        """Keep the OpenAI handle and open an HTTP session for page downloads.

        Args:
            openai_client: Client object used for chat completions when the
                legacy API is not selected.
            use_legacy_api: When true, completions go through the module-level
                ``openai.ChatCompletion`` interface instead.
        """
        self.openai_client = openai_client
        self.use_legacy_api = use_legacy_api

        # Every page request made through this session carries this User-Agent
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        http_session = requests.Session()
        http_session.headers.update(browser_headers)
        self.session = http_session

    def extract_url_content_with_gpt(self, url: str, title: str = "", timeout: int = 20) -> Dict[str, str]:
        """
        Use GPT to analyze and extract comprehensive content from a URL.
        Returns dict with 'content', 'summary', and 'status' keys.
        """
        try:
            if not url or url == 'No URL':
                return {
                    'content': title,
                    'summary': title[:300] if title else "No content available",
                    'status': 'no_url'
                }

            print(f"            🤖 GPT analyzing URL for comprehensive content: {url[:60]}...")

            # First, fetch the raw HTML
            try:
                response = self.session.get(url, timeout=timeout)
                if response.status_code != 200:
                    print(f"            ❌ HTTP {response.status_code} for URL")
                    return {
                        'content': title,
                        'summary': title[:300] if title else "URL not accessible",
                        'status': 'http_error'
                    }

                # Get raw HTML (increased limit for better extraction)
                raw_html = response.text[:15000]  # Increased from 8000 to 15000 chars

            except Exception as fetch_error:
                print(f"            ❌ URL fetch failed: {fetch_error}")
                return {
                    'content': title,
                    'summary': title[:300] if title else "URL fetch failed",
                    'status': 'fetch_error'
                }

            # Use GPT to extract comprehensive content
            prompt = f"""
You are an expert news article content extractor specialized in disaster and emergency news analysis. Your task is to extract comprehensive, detailed article content for thorough analysis.

URL: {url}
Article Title: {title}

HTML Content:
{raw_html}

You are an expert news article content extractor specialized in disaster and emergency news analysis. Your task is to extract comprehensive, detailed article content for thorough analysis.

URL: {url}
Article Title: {title}

HTML Content:
{raw_html}

Please extract and return ONLY a JSON object with this exact format:
{{
    "main_content": "COMPREHENSIVE main article text content including ALL countries disruption happening, paragraphs, quotes, statistics, and details. Remove HTML tags, ads, navigation, but KEEP ALL article text including background information, casualty figures, response details, timeline information, expert quotes, and humanitarian impacts.",
    "summary": "A detailed 13-15 sentence summary capturing disaster severity, human/economic impact, response measures, recovery efforts, and resilience factors. If certain categories are missing, explicitly state their absence in natural language (e.g., 'The article does not mention economic losses' or 'There is no information about school closures or preparedness drills').",
    "extraction_success": true/false,
    "content_length": <number of words in main_content>,
    "key_details_found": [
        "list of categories covered",
        "plus natural notes for missing items (e.g., 'no information on economic losses')"
    ]
}}

---------------------------------------------------
CATEGORY SET 1 – DISASTER SEVERITY & IMPACT
(Focus on the seriousness of the disaster and its direct consequences)

1. Casualties & Affected Population
   - Deaths, injuries, missing persons, displaced people (exact numbers or estimates).

2. Property & Infrastructure Damage
   - Homes, schools, hospitals, businesses, transportation systems.

3. Economic Losses
   - Monetary estimates, industry-specific impacts.

4. Public Service Disruptions
   - School closures, work stoppages, transportation shutdowns, electricity/water/communication outages (with duration).

5. Disaster Timeline
   - Event onset, sequence of events, secondary hazards.

6. Geographic & Contextual Background
   - Locations, geography, comparisons with past disasters.

---------------------------------------------------
CATEGORY SET 2 – RESPONSE & RESILIENCE
(Focus on government, community, and systemic response measures)

1. Critical Infrastructure Functionality
   - Hospitals, power, water, communication systems; speed of restoration.

2. Government Crisis Management
   - Effectiveness of response, staffing, leadership, emergency declarations.

3. Evacuation & Preparedness Measures
   - Evacuation orders, warning systems, drills, preparedness programs.

4. Emergency Resources & Relief Efforts
   - Food, water, fuel, medical resources, shelters, search-and-rescue operations.

5. Coordination & Communication
   - Agency cooperation, command structures, info sharing, international support.

6. Learning from Past Disasters
   - Building codes, land-use planning, retrofitted infrastructure, lessons applied.

7. Economic Continuity & Recovery
   - Essential services (banks, supply chains, utilities), business continuity, recovery capacity.

8. Community & Volunteer Support
   - Volunteer groups, NGOs, social/religious organizations.

9. Risk Reduction & Public Education
   - Protective infrastructure, public education, awareness campaigns.

10. Specific Resilience Evidence
   - Concrete examples (e.g., “backup generators kept hospitals running 72 hours,” “80% residents received SMS alerts”).

---------------------------------------------------
CRITICAL ENFORCEMENT
- For EVERY category in both sets, either extract the actual information OR explicitly state its absence in natural language.
- Do NOT leave categories blank. Always confirm presence or absence.
- Be explicit and concrete. Example: “Three bridges collapsed” instead of “some damage occurred.”
- Maintain structured completeness: all categories must be addressed.

---------------------------------------------------
EXAMPLE OUTPUT
{{
  "main_content": "A magnitude 6.9 earthquake struck City Z, destroying hundreds of homes and cutting electricity for 48 hours. Officials confirmed 210 deaths and more than 1,000 injuries. Around 30,000 residents were displaced. The government declared a state of emergency, deploying 3,500 soldiers for search and rescue. Hospitals reported overcrowding, and relief camps were established in schools and stadiums. International aid agencies announced they were sending supplies. However, the article does not mention economic losses or details about preparedness drills.",
  "summary": "A powerful earthquake hit City Z, causing 210 deaths, 1,000 injuries, and displacing 30,000 residents. Major damage occurred to housing and infrastructure, with power outages lasting 48 hours. The government deployed 3,500 soldiers, and shelters were opened in schools and stadiums. Hospitals struggled with capacity, while international aid groups mobilized assistance. The article does not mention economic losses. There is also no information about preparedness drills or resilience measures.",
  "extraction_success": true,
  "content_length": 162,
  "key_details_found": [
    "casualties (210 dead, 1,000 injured, 30,000 displaced)",
    "infrastructure damage (housing destroyed, 48-hour power outage)",
    "government response (emergency declared, 3,500 soldiers deployed)",
    "relief efforts (shelters, international aid)",
    "the article does not mention economic losses",
    "no mention of preparedness drills or resilience measures"
  ]
}}

"""

            messages = [
                {"role": "system", "content": "You are a comprehensive news content extraction expert focused on disaster analysis. Extract complete article content, not summaries. Prioritize thoroughness over brevity."},
                {"role": "user", "content": prompt}
            ]

            try:
                # Make GPT API call with increased token limit for comprehensive extraction
                if self.use_legacy_api:
                    gpt_response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=messages,
                        max_tokens=2000,  # Increased from 1000 to 2000
                        temperature=0.1
                    )
                    result = gpt_response.choices[0].message.content.strip()
                else:
                    gpt_response = self.openai_client.chat.completions.create(
                        model="gpt-3.5-turbo",
                        messages=messages,
                        max_tokens=2000,  # Increased from 1000 to 2000
                        temperature=0.1
                    )
                    result = gpt_response.choices[0].message.content.strip()

                # Clean and parse JSON response
                result = self._clean_json_response(result)

                try:
                    content_data = json.loads(result)

                    if content_data.get('extraction_success', False):
                        main_content = content_data.get('main_content', '').strip()
                        summary = content_data.get('summary', '').strip()
                        key_details = content_data.get('key_details_found', [])

                        if main_content and len(main_content) > 100:  # Reduced minimum from 50 to 100
                            word_count = len(main_content.split())
                            print(f"            ✅ GPT extracted comprehensive content: {len(main_content)} chars, {word_count} words")
                            if key_details:
                                print(f"            📋 Key details found: {', '.join(key_details[:5])}")

                            return {
                                'content': main_content[:5000],  # Increased from 2000 to 5000 chars for analysis
                                'summary': summary[:800],        # Increased from 500 to 800 chars
                                'status': 'gpt_success',
                                'word_count': word_count,
                                'key_details': key_details[:10]  # Store key details for tracking
                            }

                    print(f"            ⚠️ GPT extraction unsuccessful or content too short")

                except json.JSONDecodeError as je:
                    print(f"            ❌ GPT response JSON parsing failed: {str(je)[:100]}")

            except Exception as gpt_error:
                print(f"            ❌ GPT API error: {gpt_error}")

            # Fallback to title/summary if GPT extraction fails
            return {
                'content': title,
                'summary': title[:300] if title else "Content extraction failed",
                'status': 'gpt_fallback'
            }

        except Exception as e:
            print(f"            ❌ URL content analysis failed: {e}")
            return {
                'content': title,
                'summary': title[:300] if title else "Analysis failed",
                'status': 'error'
            }

    def _clean_json_response(self, response: str) -> str:
        """Clean GPT response to extract valid JSON"""
        try:
            response = re.sub(r'```json\s*', '', response)
            response = re.sub(r'```\s*', '', response)

            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                return json_match.group(0)

            return response.strip()
        except:
            return response

# ==========================================
# ENHANCED DISASTER ANALYZER WITH LOCALIZATION
# ==========================================

class EnhancedDisasterAnalyzer:
    """Complete disaster analyzer with PyGoogleNews localization, GPT-powered intelligence, and client risk scoring"""

    def __init__(self, openai_api_key: str, excel_file_path: str):
        """Wire up the Excel reader, news client, OpenAI access, GPT helpers and run counters.

        Args:
            openai_api_key: Key given to the OpenAI SDK (a client object for the
                v1+ SDK, the module-level ``openai.api_key`` for the legacy SDK).
            excel_file_path: Path passed to ``OptimizedExcelProcessor``.

        Raises:
            Exception: Any error raised while setting up OpenAI access is
                printed and then re-raised unchanged.
        """
        self.excel_processor = OptimizedExcelProcessor(excel_file_path)

        # Created with its default settings here.
        self.news_client = LocalizedPyGoogleNewsClient()

        # Pick the SDK flavour detected at import time.
        try:
            if not OPENAI_V1:
                # Legacy SDK: the key lives on the module and there is no client object.
                openai.api_key = openai_api_key
                self.openai_client = None
                self.use_legacy_api = True
            else:
                self.openai_client = OpenAI(api_key=openai_api_key)
                self.use_legacy_api = False

            print("✅ OpenAI client initialized successfully")
        except Exception as init_error:
            print(f"❌ OpenAI client initialization failed: {init_error}")
            raise

        # Both GPT helpers receive the same client handle and SDK flag.
        self.country_lang_analyzer = CountryLanguageAnalyzer(self.openai_client, self.use_legacy_api)
        print("✅ Country Language Analyzer initialized - GPT-powered localization active")

        self.url_analyzer = GPTURLAnalyzer(self.openai_client, self.use_legacy_api)
        print("✅ GPT URL Analyzer initialized")

        # Run counters all start at zero; the start timestamp is appended last.
        counter_names = (
            'rows_processed',
            'articles_fetched',
            'gpt_analyses',
            'successful_analyses',
            'client_risk_calculations',
            'errors',
            'api_calls',
            'gpt_url_extractions',
            'gpt_url_successes',
            'country_localizations',
            'term_localizations',
        )
        self.stats = dict.fromkeys(counter_names, 0)
        self.stats['start_time'] = time.time()

    def process_single_row(self, row_index: int) -> Optional[Dict]:
        """Process a single disaster event row with localized search based on client country"""
        try:
            gdacs_event = self.excel_processor.get_row_data(row_index)
            if not gdacs_event:
                print(f"❌ Failed to load row {row_index}")
                return None

            print(f"\n🔄 Processing Row {row_index + 1}/{self.excel_processor.get_total_rows()}")
            print(f"   Event: {gdacs_event.event_name} ({gdacs_event.event_type})")
            print(f"   Alert: {gdacs_event.alert_level} | Country: {gdacs_event.disaster_country}")
            print(f"   Client: {gdacs_event.client_country} | Address: {gdacs_event.client_address[:50]}...")

            # Extract date range from Excel data
            from_date, to_date = extract_date_range_from_excel_row(gdacs_event)

            # Determine language/country for localized search (NEW)
            lang_config = self.country_lang_analyzer.analyze_client_country_language(
                gdacs_event.client_country, gdacs_event.client_address
            )

            # Reinitialize news client with localized settings
            self.news_client.reinitialize_for_country(lang_config['lang'], lang_config['country'])
            self.stats['country_localizations'] += 1

            # Search for news articles with localized terms and date filtering (MODIFIED)
            articles = self.search_disaster_news_localized(gdacs_event, from_date, to_date, lang_config)

            if not articles:
                print("   ⚠️ No articles found")
                return self._create_empty_result(row_index, gdacs_event)

            # Process articles with GPT analysis
            print(f"   📰 Processing {len(articles)} articles with GPT analysis...")
            processed_articles = []
            analyses = []

            for i, article in enumerate(articles[:20], 1):  # Limit to 20 articles
                try:
                    print(f"      Article {i}: {article['title'][:40]}...")

                    # Extract content using GPT URL analyzer
                    content_result = self.extract_article_content_gpt(article)
                    content = content_result['content']
                    article_summary = content_result['summary']

                    # Analyze with GPT
                    analysis = self.analyze_with_gpt(article['title'], content, gdacs_event)

                    # Calculate client risk scores with proper address display (MODIFIED)
                    client_risk_scores = self.calculate_client_risk_scores_with_addresses(
                        article['title'], content, gdacs_event.event_id)
                    analysis.client_risk_scores = client_risk_scores

                    if client_risk_scores:
                        self.stats['client_risk_calculations'] += 1
                        print(f"         📊 Client Risk Scores: {client_risk_scores}")

                    processed_articles.append({
                        'title': article['title'],
                        'summary': article_summary,
                        'link': article.get('link', ''),
                        'source': article.get('source', 'Unknown'),
                        'content': content[:500],
                        'extraction_method': content_result['status'],
                        'word_count': content_result.get('word_count', 0),
                        'key_details': content_result.get('key_details', []),
                        'lang_country': article.get('lang_country', f"{lang_config['lang']}-{lang_config['country']}")  # NEW
                    })

                    analyses.append(analysis)

                except Exception as e:
                    print(f"         ❌ Error processing article: {e}")
                    continue

            # Calculate summary
            summary = self._calculate_summary(analyses)
            print(f"   ✅ Completed: {len(analyses)} articles analyzed with localized search")
            print(f"   🌍 Language/Country: {lang_config['lang']}-{lang_config['country']}")

            result = {
                'row_index': row_index,
                'gdacs_event': gdacs_event,
                'articles': processed_articles,
                'analyses': analyses,
                'summary': summary,
                'date_range': {'from_date': from_date, 'to_date': to_date},
                'localization': lang_config  # NEW: Track localization used
            }

            self.stats['rows_processed'] += 1
            return result

        except Exception as e:
            print(f"   ❌ Row processing failed: {e}")
            self.stats['errors'] += 1
            return None

    def search_disaster_news_localized(self, gdacs_event: GDACSEvent, from_date: str = None, to_date: str = None, lang_config: Dict = None) -> List[Dict]:
        """Search for relevant disaster news with localized terms and PyGoogleNews"""

        if not lang_config:
            lang_config = {'lang': 'en', 'country': 'US'}

        # Create basic English search terms
        english_terms = []

        # Create targeted search terms
        if gdacs_event.disaster_country and gdacs_event.disaster_country.strip():
            country_terms = gdacs_event.disaster_country.split(',')[:2]  # Limit countries
            for country in country_terms:
                country = country.strip()
                if country and len(country) > 1:
                    english_terms.extend([
                        f"{gdacs_event.event_type} {country}",
                        f"{country} {gdacs_event.event_type}"
                    ])

        # Add event name if available
        if gdacs_event.event_name and len(gdacs_event.event_name.strip()) > 5:
            english_terms.append(gdacs_event.event_name)

        # Add generic disaster type terms
        english_terms.extend([
            gdacs_event.event_type,
            f"{gdacs_event.event_type} disaster",
            f"{gdacs_event.event_type} emergency"
        ])

        # Localize terms using GPT (NEW)
        if lang_config['lang'] != 'en':
            print(f"   🌐 Localizing search terms to {lang_config['lang']} for {lang_config['country']}")
            localized_terms = self.country_lang_analyzer.localize_disaster_terms(
                english_terms[:5], lang_config['lang'], lang_config['country']
            )
            self.stats['term_localizations'] += 1
            search_terms = localized_terms[:6]  # Use localized terms
        else:
            search_terms = english_terms[:6]  # Use English terms

        all_articles = []
        for term in search_terms:
            try:
                # Use PyGoogleNews with date filtering
                results = self.news_client.search(
                    query=term,
                    max_results=15,
                    from_=from_date,
                    to_=to_date
                )

                if results['status'] == 'ok':
                    # Add lang_country info to each article
                    for article in results['entries']:
                        article['lang_country'] = results.get('lang_country', f"{lang_config['lang']}-{lang_config['country']}")
                    all_articles.extend(results['entries'])

                time.sleep(1)  # Rate limiting

            except Exception as e:
                print(f"         Search error for '{term}': {e}")
                continue

        # Remove duplicates and limit results
        unique_articles = self._remove_duplicates(all_articles)
        final_articles = unique_articles[:20]

        self.stats['articles_fetched'] += len(final_articles)
        return final_articles

    def extract_article_content_gpt(self, article: Dict) -> Dict[str, str]:
        """Extract content from article URL using GPT analysis"""
        url = article.get('link', '')
        title = article.get('title', '')

        self.stats['gpt_url_extractions'] += 1

        # Use GPT URL analyzer
        content_result = self.url_analyzer.extract_url_content_with_gpt(url, title)

        if content_result['status'] == 'gpt_success':
            self.stats['gpt_url_successes'] += 1

        return content_result

    def analyze_with_gpt(self, title: str, content: str, gdacs_event: GDACSEvent) -> DisasterAnalysis:
        """Analyze article with GPT using enhanced prompt"""
        try:
            disaster_info = GDACS_SCORING_KNOWLEDGE.get(gdacs_event.event_type, {})
            alert_info = ALERT_LEVEL_DEFINITIONS.get(gdacs_event.alert_level, {})

            prompt = f"""
You are a GDACS disaster expert analyzing media coverage of a specific GDACS event with deep understanding of GDACS scoring methodology.

GDACS EVENT CONTEXT:
- Event ID: {gdacs_event.event_id}
- Event Type: {gdacs_event.event_type} ({disaster_info.get('name', 'Unknown')})
- Event Name: {gdacs_event.event_name}
- GDACS Alert Level: {gdacs_event.alert_level} ({alert_info.get('severity', 'Unknown')} Severity)
- GDACS Alert Score: {gdacs_event.alert_score}
- GDACS Countries: {gdacs_event.disaster_country}
- GDACS Description: {gdacs_event.event_description}
- Severity Value: {gdacs_event.severity_value}
- Severity Text: {gdacs_event.severity_text}
- Date Range: {gdacs_event.from_date} to {gdacs_event.to_date}

GDACS {gdacs_event.event_type} SCORING METHODOLOGY:
- Disaster Name: {disaster_info.get('name', 'Unknown')}
- GDACS Models Used: {', '.join(disaster_info.get('models', []))}
- Key Assessment Inputs: {', '.join(disaster_info.get('key_inputs', []))}
- GDACS Scoring Method: {disaster_info.get('scoring_method', 'Standard methodology')}
- GDACS Alert Thresholds: {disaster_info.get('alert_thresholds', {})}
- Key Assessment Factors: {', '.join(disaster_info.get('key_factors', []))}
- Expected Impact Indicators: {', '.join(disaster_info.get('impact_indicators', []))}
- Assessment Focus: {disaster_info.get('assessment_focus', 'Standard assessment')}

ALERT LEVEL MEANING:
- {gdacs_event.alert_level} Alert Definition: {alert_info.get('definition', 'No definition')}
- Description: {alert_info.get('description', 'No description')}
- Required Action: {alert_info.get('action_required', 'Unknown')}

MEDIA ARTICLE TO ANALYZE:
Title: {title}
Content: {content[:3000]}  {f"... (Content truncated from {len(content)} characters for analysis)" if len(content) > 3000 else ""}

Please analyze this media article about the GDACS {gdacs_event.event_type} event considering the GDACS context and scoring methodology. Return ONLY a JSON object:

{{
    "disaster_severity_score": <0-100>,
    "response_capability_score": <0-100>,
    "response_timeliness_score": <0-100>,
    "disaster_relevance_score": <0-100>,
    "gdacs_alignment_score": <0-100>,
    "media_coverage_score": <0-100>,
    "is_disaster_related": <true/false>,
    "primary_disaster_type": "<type>",
    "disaster_keywords": ["keyword1", "keyword2", ...],
    "response_keywords": ["specific response measure 1", "specific response measure 2", ...],
    "timeliness_keywords": ["specific timeframe 1", "specific timeframe 2", ...],
    "analysis_summary": "<150 words>",
    "gdacs_context_analysis": "<100 words>",
    "final_risk_score": <0-100>,
    "confidence_score": <0-100>
}}

ENHANCED SCORING GUIDELINES (with GDACS context):

Do not return any zeros. Analyze all articles and list the scoring content relevant to each category.

1. disaster_severity_score (0-100):
   - Use GDACS alert score {gdacs_event.alert_score} as authoritative baseline
   - Consider {gdacs_event.event_type} specific factors: {', '.join(disaster_info.get('key_factors', []))}
   - Look for impact indicators: {', '.join(disaster_info.get('impact_indicators', []))}
   - Expected for {gdacs_event.alert_level}: {alert_info.get('description', 'standard impact')}
   - GDACS Assessment Focus: {disaster_info.get('assessment_focus', 'standard assessment')}

2. response_capability_score (0-100) - ENHANCED RESILIENCE ASSESSMENT:
   - INFRASTRUCTURE RESILIENCE: Are critical infrastructure systems (hospitals, power, water, communication) functioning or quickly restored?
   - INSTITUTIONAL CAPACITY: Do local/regional authorities demonstrate effective crisis management? Are emergency services adequately staffed and equipped?
   - COMMUNITY PREPAREDNESS: Evidence of evacuation plans, early warning systems, disaster drills, or community resilience programs
   - RESOURCE AVAILABILITY: Sufficient emergency supplies, medical resources, shelter capacity, search and rescue capabilities
   - COORDINATION EFFECTIVENESS: Multi-agency cooperation, clear command structure, information sharing between organizations
   - ADAPTATION MEASURES: Evidence of previous disaster experience improving current response, building codes, land use planning
   - ECONOMIC RESILIENCE: Local economy's ability to maintain essential services, business continuity plans, rapid recovery capacity
   - SOCIAL COHESION: Community networks, volunteer organizations, social support systems helping with disaster response
   - VULNERABILITY REDUCTION: Pre-disaster risk reduction measures, protective infrastructure, public education programs
   - Look for specific evidence of resilience factors rather than general statements about government response

3. response_timeliness_score (0-100):
   - Speed of response relative to {gdacs_event.event_type} onset characteristics
   - Warning system effectiveness and advance notice provided
   - Proactive vs reactive measures and preparedness activation timing

4. disaster_relevance_score (0-100):
   - How much this article focuses on actual disaster impacts vs other topics
   - Operational vs academic content
   - IMPORTANT: Score should be VERY LOW (0-20) if article is about business disruptions, supply chain issues, economic impacts, or general news unrelated to actual natural disasters or emergency events
   - HIGH scores (70-100) only for articles directly about natural disasters, emergency responses, casualties, damage, evacuations, or rescue operations

5. gdacs_alignment_score (0-100):
   - How well the media report aligns with GDACS {gdacs_event.alert_level} assessment
   - Consistency with GDACS scoring methodology for {gdacs_event.event_type}
   - Agreement with expected {gdacs_event.event_type} impact patterns

6. media_coverage_score (0-100):
   - Quality and depth of media coverage
   - Accuracy of disaster information reported
   - Completeness of impact assessment

7. final_risk_score (0-100):
   - Calculate a weighted composite score using the following methodology:
   - Primary Weight (60%): disaster_severity_score (most critical for risk assessment - the inherent severity of the disaster itself)
   - Secondary Weight (30%): response_capability_score (significantly affects actual risk to populations - government/institutional response capability)
   - Quality Weight (10%): media_coverage_score (reliability and completeness of information source affects assessment accuracy)
   - Formula: (disaster_severity × 0.6) + (response_capability × 0.3) + (media_coverage × 0.1)
   - Consider GDACS alert level as validation: RED events should generally score 70-100, ORANGE 40-80, GREEN 10-50
   - Higher disaster severity with poor response capability should yield higher risk scores
   - Strong response capability can moderately reduce overall risk even for severe disasters

8. confidence_score (0-100):

Calculate the confidence_score as a weighted composite of the following factors:

confidence_score = int(
  Formula: 0.3 * disaster_severity_score + 0.3 * response_capability_score + 0.2 * media_coverage_score + 0.2 * gdacs_alignment_score
)

Adjustment rules:
- If disaster_relevance_score < 30, subtract 12 points (penalty for weak disaster relevance).
- If media_coverage_score < 40, subtract 8 points (penalty for poor information quality).
- If gdacs_alignment_score < 50, subtract 5 points (penalty for inconsistency with GDACS).
- Final score must be clamped between 1 and 100 (no zeros).

Confidence levels:
- 80–100 → High confidence
- 50–79 → Medium confidence
- 20–49 → Low confidence
- 1–19 → Very Low confidence

Key idea: confidence_score reflects how reliable the article analysis is, based on clarity of disaster impact, completeness of response information, quality of media coverage, disaster relevance, and alignment with GDACS classification.

CRITICAL DISASTER RELEVANCE CHECK:
Before assigning is_disaster_related as true, verify that the article is ACTUALLY about:
- Natural disasters (earthquakes, floods, typhoons, etc.)
- Emergency situations requiring humanitarian response
- Rescue and evacuation operations
- Casualty reports and damage assessments
- Government disaster response activities
- Relief and recovery efforts

DO NOT classify as disaster-related if the article is primarily about:
- Business disruptions or economic impacts without actual disaster focus
- Supply chain issues without disaster context
- General business news or market reports
- Political news unrelated to disaster response
- Technology or infrastructure issues without disaster context
- General social issues not related to specific disasters

Set is_disaster_related to FALSE if the article is not genuinely focused on disaster events or emergency response.

SPECIAL INSTRUCTIONS FOR KEYWORDS:

For response_keywords - Extract SPECIFIC response measures mentioned in the article, such as:
- "deployed 500 rescue personnel to disaster area"
- "opened 50 emergency shelters"
- "activated national emergency medical system"
- "deployed military helicopters for rescue operations"
- "established temporary water supply stations"
- "distributed emergency food supplies to 10,000 families"
- "set up field hospitals with 200 beds"
Instead of generic terms like "rescue", "evacuation", provide the actual detailed measures described.

For timeliness_keywords - Extract SPECIFIC timeframes and deadlines mentioned:
- "evacuation completed within 24 hours"
- "power expected to be restored within 3 days"
- "rescue operations started 2 hours after disaster"
- "next 48 hours critical for rescue efforts"
- "temporary housing rebuilt within one week"
- "emergency supplies delivered within 6 hours"
- "assessment teams deployed within 12 hours"
Instead of generic terms like "immediate", "soon", "quickly", provide actual timeframes mentioned.

For gdacs_context_analysis: Analyze how well this media coverage reflects the GDACS {gdacs_event.alert_level} classification and {gdacs_event.event_type} scoring methodology.

This media article is about the GDACS {gdacs_event.event_type} event with {gdacs_event.alert_level} alert level (score: {gdacs_event.alert_score}).

It’s possible that {gdacs_event.disaster_country} and {gdacs_event.client_country} are not the same, so we must evaluate both the geographic distance between the location and the client country as well as the news content to determine the final score.
"""

            messages = [
                {
                    "role": "system",
                    "content": "You are a GDACS-trained disaster expert who understands GDACS scoring methodologies and analyzes media coverage with this context. Always respond with valid JSON."
                },
                {"role": "user", "content": prompt}
            ]

            # Make API call
            try:
                self.stats['api_calls'] += 1

                if self.use_legacy_api:
                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=messages,
                        max_tokens=1500,
                        temperature=0.2
                    )
                    result = response.choices[0].message.content.strip()
                else:
                    response = self.openai_client.chat.completions.create(
                        model="gpt-3.5-turbo",
                        messages=messages,
                        max_tokens=1500,
                        temperature=0.2
                    )
                    result = response.choices[0].message.content.strip()

                # Clean and parse JSON response
                result = self._clean_json_response(result)

                try:
                    analysis_data = json.loads(result)
                except json.JSONDecodeError:
                    return self._create_fallback_analysis(gdacs_event)

                # Create analysis object with validation
                analysis = DisasterAnalysis()

                analysis.disaster_severity_score = self._validate_score(analysis_data.get('disaster_severity_score', 50))
                analysis.response_capability_score = self._validate_score(analysis_data.get('response_capability_score', 40))
                analysis.response_timeliness_score = self._validate_score(analysis_data.get('response_timeliness_score', 40))
                analysis.disaster_relevance_score = self._validate_score(analysis_data.get('disaster_relevance_score', 60))
                analysis.gdacs_alignment_score = self._validate_score(analysis_data.get('gdacs_alignment_score', 50))
                analysis.media_coverage_score = self._validate_score(analysis_data.get('media_coverage_score', 50))
                analysis.final_risk_score = self._validate_score(analysis_data.get('final_risk_score', 50))
                analysis.confidence_score = self._validate_score(analysis_data.get('confidence_score', 60))

                analysis.is_disaster_related = bool(analysis_data.get('is_disaster_related', True))
                analysis.primary_disaster_type = str(analysis_data.get('primary_disaster_type', gdacs_event.event_type))
                analysis.gpt_analysis_summary = str(analysis_data.get('analysis_summary', ''))[:300]
                analysis.gdacs_context_analysis = str(analysis_data.get('gdacs_context_analysis', ''))[:200]

                analysis.gpt_disaster_keywords = self._validate_keywords(analysis_data.get('disaster_keywords', []))
                analysis.gpt_response_keywords = self._validate_keywords(analysis_data.get('response_keywords', []))
                analysis.gpt_timeliness_keywords = self._validate_keywords(analysis_data.get('timeliness_keywords', []))

                # Set classification levels
                analysis.severity_level = self._get_severity_level(analysis.disaster_severity_score)
                analysis.response_level = self._get_response_level(analysis.response_capability_score)
                analysis.timeliness_level = self._get_timeliness_level(analysis.response_timeliness_score)
                analysis.relevance_level = self._get_relevance_level(analysis.disaster_relevance_score)
                analysis.gdacs_alignment_level = self._get_gdacs_alignment_level(analysis.gdacs_alignment_score)
                analysis.media_coverage_level = self._get_media_coverage_level(analysis.media_coverage_score)

                # Check client country match
                analysis.client_country_match = self._check_client_country_match(gdacs_event, content)

                self.stats['gpt_analyses'] += 1
                self.stats['successful_analyses'] += 1
                return analysis

            except Exception as api_error:
                print(f"            API error: {api_error}")
                return self._create_fallback_analysis(gdacs_event)

        except Exception as e:
            print(f"            Analysis failed: {e}")
            self.stats['errors'] += 1
            return self._create_fallback_analysis(gdacs_event)

    def calculate_client_risk_scores_with_addresses(self, title: str, content: str, event_id: str) -> Dict[str, float]:
        """Calculate risk scores for clients using actual Client_Address names instead of codes"""
        clients = self.excel_processor.get_client_info_for_event(event_id)

        if not clients:
            return {}

        try:
            # Prepare client information for GPT with actual addresses
            client_details = []
            for client in clients:
                client_details.append({
                    'client_address': client.client_address,  # Use full address as identifier
                    'client_country': client.client_country,
                    'severity_text': client.severity_text,
                    'distance_summary': client.client_distance_summary
                })

            prompt = f"""
You are a disaster risk assessment expert. Please analyze this news article and calculate risk scores (1-10) for each client based on their geographical location and the disaster impact.

Article Title: {title}
Article Content: {content[:4000]}  {f"... (Content truncated from {len(content)} characters for analysis)" if len(content) > 4000 else ""}

Client Information:
{json.dumps(client_details, indent=2)}

Instructions:
1. Analyze the disaster severity and impact described in the news article
2. For each client, use their Client_Address as the primary identifier
3. For each client, FIRST determine the exact geographical location of their Client_Address:
   - Identify the specific city, district, province/state, or region where the client is located
   - Consider the detailed geographical context of the address
   - Use your geographical knowledge to understand the precise location relative to the disaster center
4. For each client, consider:
   - Severity_Text: How severe is the impact in their specific area/region?
   - Client_Distance_Summary: How close are they to the disaster center?
   - The detailed geographical location derived from Client_Address
   - The overall disaster impact described in the news relative to their specific location
5. Assign risk scores (1-10):
   - 1-2: Minimal risk (very far from disaster center, minimal regional impact)
   - 3-4: Low risk (moderate distance, light regional impact)
   - 5-6: Medium risk (closer regional proximity, moderate regional impact)
   - 7-8: High risk (close regional proximity, significant regional impact)
   - 9-10: Critical risk (very close to disaster center, severe regional impact)

Return ONLY a JSON object with this exact format:
{{
    "client_risk_scores": {{
        "Full_Client_Address_1": risk_score,
        "Full_Client_Address_2": risk_score,
        ...
    }},
    "detailed_location_analysis": {{
        "Full_Client_Address_1": "Detailed geographical location analysis and risk assessment reasoning",
        "Full_Client_Address_2": "Detailed geographical location analysis and risk assessment reasoning",
        ...
    }},
    "risk_assessment_summary": "Brief explanation of overall risk assessment logic based on detailed locations"
}}

IMPORTANT:
- Use the FULL Client_Address as the key in client_risk_scores (not country codes like MX, CN, etc.)
- Base your risk assessment on the DETAILED GEOGRAPHICAL LOCATIONS derived from Client_Address
- Provide clear reasoning for each location's risk assessment
"""

            messages = [
                {"role": "system", "content": "You are a geographical risk assessment expert. Always respond with valid JSON only, no markdown formatting."},
                {"role": "user", "content": prompt}
            ]

            try:
                self.stats['api_calls'] += 1

                if self.use_legacy_api:
                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=messages,
                        max_tokens=800,
                        temperature=0.1
                    )
                    result = response.choices[0].message.content.strip()
                else:
                    response = self.openai_client.chat.completions.create(
                        model="gpt-3.5-turbo",
                        messages=messages,
                        max_tokens=800,
                        temperature=0.1
                    )
                    result = response.choices[0].message.content.strip()

                # Clean and parse JSON response
                cleaned_result = self._clean_json_response(result)

                try:
                    risk_data = json.loads(cleaned_result)
                    risk_scores = risk_data.get('client_risk_scores', {})
                    location_analysis = risk_data.get('detailed_location_analysis', {})

                    # Ensure all scores are floats between 1-10
                    for address, score in risk_scores.items():
                        try:
                            risk_scores[address] = max(1.0, min(10.0, float(score)))
                        except (ValueError, TypeError):
                            risk_scores[address] = 5.0  # Default medium risk

                    logger.info(f"✅ Calculated risk scores for {len(risk_scores)} clients using addresses")

                    # Log detailed location analysis
                    if location_analysis:
                        logger.info("📍 Detailed Location Analysis:")
                        for address, analysis in location_analysis.items():
                            logger.info(f"   {address[:50]}...: {analysis}")

                    return risk_scores

                except json.JSONDecodeError as e:
                    logger.warning(f"JSON parsing failed: {e}")
                    # Try to extract just the client_risk_scores part as fallback
                    try:
                        scores_match = re.search(r'"client_risk_scores":\s*(\{[^}]+\})', cleaned_result)
                        if scores_match:
                            scores_json = scores_match.group(1)
                            risk_scores = json.loads(scores_json)
                            # Ensure all scores are floats between 1-10
                            for address, score in risk_scores.items():
                                try:
                                    risk_scores[address] = max(1.0, min(10.0, float(score)))
                                except (ValueError, TypeError):
                                    risk_scores[address] = 5.0
                            logger.info(f"✅ Fallback: Extracted {len(risk_scores)} client risk scores")
                            return risk_scores
                    except:
                        pass

                    logger.warning(f"Complete JSON parsing failure. Raw response: {cleaned_result[:200]}...")
                    return {}

            except Exception as api_error:
                logger.error(f"❌ Client risk API call failed: {api_error}")
                return {}

        except Exception as e:
            logger.error(f"❌ Client risk score calculation failed: {e}")
            return {}

    def process_batch_rows(self, start_row: int, num_rows: int) -> List[Dict]:
        """Run ``process_single_row`` over a contiguous window of rows.

        The window starts at ``start_row`` and covers at most ``num_rows`` rows;
        it is clamped to the row count reported by ``self.excel_processor``.

        Returns:
            The truthy per-row results, in row order. An empty list is returned
            when ``start_row`` is at or past the last row.

        A row that raises is reported on stdout and skipped. A
        ``KeyboardInterrupt`` stops the loop early and whatever was collected
        up to that point is still returned.
        """
        row_count = self.excel_processor.get_total_rows()
        if start_row >= row_count:
            print(f"Start row {start_row} exceeds total rows {row_count}")
            return []

        stop_row = min(start_row + num_rows, row_count)
        print(f"\n📊 Processing batch: rows {start_row} to {stop_row - 1} ({stop_row - start_row} rows)")
        print("🌍 Using PyGoogleNews with GPT-powered localization")

        collected = []
        began_at = time.time()

        for current_row in range(start_row, stop_row):
            try:
                outcome = self.process_single_row(current_row)
                # Rows that produced nothing (falsy result) are not part of the batch output
                if outcome:
                    collected.append(outcome)
            except KeyboardInterrupt:
                print(f"\n⚠️ Batch processing interrupted at row {current_row}")
                break
            except Exception as e:
                # One bad row must not abort the rest of the batch
                print(f"❌ Error processing row {current_row}: {e}")

        elapsed_time = time.time() - began_at
        print(f"\n✅ Batch completed: {len(collected)} rows processed in {elapsed_time/60:.1f} minutes")
        print(f"🌍 Localization Stats: {self.stats['country_localizations']} countries, {self.stats['term_localizations']} term sets")

        return collected

    # Helper methods (mostly unchanged, keeping all existing helper methods)
    def _remove_duplicates(self, articles: List[Dict]) -> List[Dict]:
        """Remove duplicate articles based on title similarity"""
        if len(articles) <= 1:
            return articles

        unique_articles = []
        seen_titles = set()

        for article in articles:
            title = article.get('title', '').lower().strip()

            if not title or len(title) < 10:
                continue

            normalized = re.sub(r'[^\w\s]', '', title)
            normalized = re.sub(r'\s+', ' ', normalized).strip()

            if normalized not in seen_titles:
                unique_articles.append(article)
                seen_titles.add(normalized)

                if len(unique_articles) >= 50:
                    break

        return unique_articles

    def _clean_json_response(self, response: str) -> str:
        """Clean GPT response to extract valid JSON"""
        try:
            response = re.sub(r'```json\s*', '', response)
            response = re.sub(r'```\s*', '', response)

            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                return json_match.group(0)

            return response.strip()
        except:
            return response

    def _validate_score(self, score) -> float:
        """Validate and normalize score to 0-100 range"""
        try:
            score = float(score)
            return max(0.0, min(100.0, score))
        except (ValueError, TypeError):
            return 50.0

    def _validate_keywords(self, keywords) -> List[str]:
        """Validate and clean keywords list"""
        if not isinstance(keywords, list):
            return []

        valid_keywords = []
        for keyword in keywords:
            if isinstance(keyword, str) and len(keyword.strip()) > 0:
                valid_keywords.append(str(keyword).strip()[:100])

        return valid_keywords[:5]

    def _get_severity_level(self, score: float) -> str:
        if score >= 80: return "Critical"
        elif score >= 60: return "High"
        elif score >= 40: return "Medium"
        elif score >= 20: return "Low"
        else: return "Minimal"

    def _get_response_level(self, score: float) -> str:
        if score >= 80: return "Excellent"
        elif score >= 60: return "Good"
        elif score >= 40: return "Adequate"
        elif score >= 20: return "Limited"
        else: return "Poor"

    def _get_timeliness_level(self, score: float) -> str:
        if score >= 80: return "Immediate"
        elif score >= 60: return "Rapid"
        elif score >= 40: return "Timely"
        elif score >= 20: return "Delayed"
        else: return "Slow"

    def _get_relevance_level(self, score: float) -> str:
        if score >= 80: return "Highly Relevant"
        elif score >= 60: return "Relevant"
        elif score >= 40: return "Moderately Relevant"
        elif score >= 20: return "Somewhat Relevant"
        else: return "Not Relevant"

    def _get_gdacs_alignment_level(self, score: float) -> str:
        if score >= 80: return "Highly Aligned"
        elif score >= 60: return "Well Aligned"
        elif score >= 40: return "Moderately Aligned"
        elif score >= 20: return "Poorly Aligned"
        else: return "Not Aligned"

    def _get_media_coverage_level(self, score: float) -> str:
        if score >= 80: return "Excellent Coverage"
        elif score >= 60: return "Good Coverage"
        elif score >= 40: return "Adequate Coverage"
        elif score >= 20: return "Limited Coverage"
        else: return "Poor Coverage"

    def _expand_client_country_names(self, client_countries: List[str]) -> List[str]:
        """Convert client country codes/addresses to full country names based on actual client warehouse locations"""
        # Country code mappings based on actual client warehouse data
        country_mappings = {
            'MX': ['mexico', 'mexican'],
            'CN': ['china', 'prc', 'chinese'],
            'HK': ['hong kong', 'hongkong'],
            'SG': ['singapore'],
            'IN': ['india', 'indian'],
            'US': ['united states', 'usa', 'america', 'american'],
            'MY': ['malaysia', 'malaysian'],
            'NL': ['netherlands', 'holland', 'dutch'],
            'RO': ['romania', 'romanian'],
            'HU': ['hungary', 'hungarian']
        }

        expanded_names = []

        for client in client_countries:
            client_clean = client.strip()

            # If it's a known country code from our client data, add the full names
            if client_clean.upper() in country_mappings:
                expanded_names.extend(country_mappings[client_clean.upper()])
                expanded_names.append(client_clean.lower())  # Also keep original
            else:
                # For longer addresses, extract potential country names
                client_lower = client_clean.lower()

                # Specific patterns based on actual client addresses
                if any(term in client_lower for term in ['mexico', 'mexican', 'carretera', 'lopez mateos']):
                    expanded_names.extend(['mexico', 'mexican'])
                elif any(term in client_lower for term in ['china', 'chinese', 'suhong road', 'guan pu rd', 'suqian rd']):
                    expanded_names.extend(['china', 'prc', 'chinese'])
                elif any(term in client_lower for term in ['hong kong', 'hongkong', 'kam pok road', 'yeung uk road', 'san tin']):
                    expanded_names.extend(['hong kong', 'hongkong'])
                elif any(term in client_lower for term in ['singapore', 'alps avenue']):
                    expanded_names.extend(['singapore'])
                elif any(term in client_lower for term in ['india', 'indian', 'sipcot']):
                    expanded_names.extend(['india', 'indian'])
                elif any(term in client_lower for term in ['united states', 'america', 'usa', 'yosemite dr']):
                    expanded_names.extend(['united states', 'usa', 'america', 'american'])
                elif any(term in client_lower for term in ['malaysia', 'malaysian', 'mukim']):
                    expanded_names.extend(['malaysia', 'malaysian'])
                elif any(term in client_lower for term in ['netherlands', 'holland', 'dutch']):
                    expanded_names.extend(['netherlands', 'holland', 'dutch'])
                elif any(term in client_lower for term in ['romania', 'romanian']):
                    expanded_names.extend(['romania', 'romanian'])
                elif any(term in client_lower for term in ['hungary', 'hungarian']):
                    expanded_names.extend(['hungary', 'hungarian'])
                else:
                    # Add the original as is for other cases
                    expanded_names.append(client_lower)

        return list(set(expanded_names))  # Remove duplicates

    def _check_client_country_match(self, gdacs_event: GDACSEvent, content: str) -> bool:
        """Check if content mentions client country using expanded country names"""
        if not content:
            return False

        # Get all client countries for this event from the Excel processor
        event_clients = self.excel_processor.get_client_countries_for_event(gdacs_event.event_id)

        if not event_clients:
            # Fallback to the individual event's client_country field
            if not gdacs_event.client_country:
                return False
            event_clients = [country.strip() for country in gdacs_event.client_country.split(',') if country.strip()]

        if not event_clients:
            return False

        # Expand client country codes/addresses to full country names
        expanded_country_names = self._expand_client_country_names(event_clients)

        content_lower = content.lower()

        # Check each expanded country name
        for country_name in expanded_country_names:
            if country_name and country_name in content_lower:
                print(f"            ✅ Client country match found: '{country_name}' in content (from {event_clients})")
                return True

        print(f"            ⚠️ No client country match found. Original: {event_clients}, Expanded: {expanded_country_names[:5]}")
        return False

    def _create_fallback_analysis(self, gdacs_event: GDACSEvent) -> DisasterAnalysis:
        """Build a placeholder analysis from GDACS data alone when GPT fails.

        The severity score is derived from the event's alert score
        (``alert_score * 30`` capped at 100, or 50 when there is no alert
        score). Every other score is a fixed constant, with a low confidence
        of 30 to mark the result as a fallback. A level label is derived for
        every score that has a label helper, so the labels agree with the
        scores stored here.
        """
        analysis = DisasterAnalysis()

        analysis.disaster_severity_score = min(gdacs_event.alert_score * 30, 100) if gdacs_event.alert_score else 50
        analysis.response_capability_score = 45.0
        analysis.response_timeliness_score = 40.0
        analysis.disaster_relevance_score = 70.0
        analysis.gdacs_alignment_score = 60.0
        analysis.media_coverage_score = 50.0
        analysis.final_risk_score = 50.0
        analysis.confidence_score = 30.0

        # The event came from GDACS, so it is treated as disaster related
        analysis.is_disaster_related = True
        analysis.primary_disaster_type = gdacs_event.event_type
        analysis.gpt_analysis_summary = f"Fallback analysis for {gdacs_event.event_name}"
        analysis.gdacs_context_analysis = f"GDACS {gdacs_event.alert_level} alert event"

        analysis.severity_level = self._get_severity_level(analysis.disaster_severity_score)
        analysis.response_level = self._get_response_level(analysis.response_capability_score)
        analysis.timeliness_level = self._get_timeliness_level(analysis.response_timeliness_score)
        # Keep the remaining labels in step with the scores set above; the
        # Excel export writes all six level columns.
        analysis.relevance_level = self._get_relevance_level(analysis.disaster_relevance_score)
        analysis.gdacs_alignment_level = self._get_gdacs_alignment_level(analysis.gdacs_alignment_score)
        analysis.media_coverage_level = self._get_media_coverage_level(analysis.media_coverage_score)

        return analysis

    def _create_empty_result(self, row_index: int, gdacs_event: GDACSEvent) -> Dict:
        """Create empty result structure when no articles found"""
        return {
            'row_index': row_index,
            'gdacs_event': gdacs_event,
            'articles': [],
            'analyses': [],
            'summary': {
                'total_articles': 0,
                'avg_disaster_severity': 0,
                'avg_response_capability': 0,
                'avg_response_timeliness': 0,
                'avg_disaster_relevance': 0,
                'avg_gdacs_alignment': 0,
                'avg_media_coverage': 0,
                'avg_final_risk': 0,
                'avg_confidence': 0,
                'disaster_related_count': 0,
                'client_country_matches': 0,
                'avg_client_risk': 0,
                'total_client_assessments': 0
            },
            'date_range': {'from_date': None, 'to_date': None},
            'localization': {'lang': 'en', 'country': 'US'}
        }

    def _calculate_summary(self, analyses: List[DisasterAnalysis]) -> Dict:
        """Calculate summary statistics from analyses including client risk data"""
        if not analyses:
            return self._create_empty_result(0, None)['summary']

        disaster_related = [a for a in analyses if a.is_disaster_related]
        client_matches = [a for a in analyses if a.client_country_match]

        # Calculate client risk statistics
        all_client_risks = []
        for analysis in analyses:
            if analysis.client_risk_scores:
                all_client_risks.extend(analysis.client_risk_scores.values())

        return {
            'total_articles': len(analyses),
            'avg_disaster_severity': np.mean([a.disaster_severity_score for a in analyses]),
            'avg_response_capability': np.mean([a.response_capability_score for a in analyses]),
            'avg_response_timeliness': np.mean([a.response_timeliness_score for a in analyses]),
            'avg_disaster_relevance': np.mean([a.disaster_relevance_score for a in analyses]),
            'avg_gdacs_alignment': np.mean([a.gdacs_alignment_score for a in analyses]),
            'avg_media_coverage': np.mean([a.media_coverage_score for a in analyses]),
            'avg_final_risk': np.mean([a.final_risk_score for a in analyses]),
            'avg_confidence': np.mean([a.confidence_score for a in analyses]),
            'disaster_related_count': len(disaster_related),
            'client_country_matches': len(client_matches),
            'avg_client_risk': np.mean(all_client_risks) if all_client_risks else 0,
            'total_client_assessments': len(all_client_risks)
        }

# ==========================================
# EXCEL EXPORT FUNCTIONALITY WITH LOCALIZATION TRACKING (MODIFIED)
# ==========================================

def _shorten_address(address: str, limit: int = 30) -> str:
    """Cut *address* to *limit* characters, appending "..." only when it was longer."""
    return address[:limit] + "..." if len(address) > limit else address


def _risk_level_label(score: float) -> str:
    """Bucket a client risk score: >= 8 Critical, >= 6 High, >= 4 Medium, else Low."""
    if score >= 8:
        return 'Critical'
    if score >= 6:
        return 'High'
    if score >= 4:
        return 'Medium'
    return 'Low'


def _build_article_rows(results: List[Dict]) -> List[Dict]:
    """Flatten every (article, analysis) pair into one row for the main sheet."""
    rows = []

    for result in results:
        gdacs_event = result['gdacs_event']
        articles = result['articles']
        analyses = result['analyses']
        date_range = result.get('date_range', {})
        localization = result.get('localization', {'lang': 'en', 'country': 'US'})

        # zip() pairs by position; surplus articles or analyses are ignored
        for i, (article, analysis) in enumerate(zip(articles, analyses)):
            try:
                row = {
                    # Article information
                    'title': str(article.get('title', ''))[:500],
                    'url': str(article.get('link', '')),
                    'date_text': str(article.get('published', '')),
                    'parsed_date': str(article.get('parsed_date', '')),
                    'summary': str(article.get('summary', ''))[:1000],
                    'extraction_method': str(article.get('extraction_method', 'unknown')),
                    'lang_country': str(article.get('lang_country', 'unknown')),

                    # GDACS event information
                    'gdacs_event_id': str(gdacs_event.event_id),
                    'gdacs_event_type': str(gdacs_event.event_type),
                    'gdacs_event_name': str(gdacs_event.event_name),
                    'gdacs_alert_level': str(gdacs_event.alert_level),
                    'gdacs_alert_score': float(gdacs_event.alert_score) if gdacs_event.alert_score else 0.0,
                    'gdacs_country': str(gdacs_event.disaster_country),
                    'gdacs_description': str(gdacs_event.event_description)[:500],
                    'gdacs_severity_value': float(gdacs_event.severity_value) if gdacs_event.severity_value else 0.0,
                    'gdacs_severity_text': str(gdacs_event.severity_text),
                    'gdacs_from_date': str(gdacs_event.from_date),
                    'gdacs_to_date': str(gdacs_event.to_date),

                    # Search window and localization used for this event
                    'search_from_date': str(date_range.get('from_date', '')),
                    'search_to_date': str(date_range.get('to_date', '')),
                    'search_language': str(localization.get('lang', 'en')),
                    'search_country': str(localization.get('country', 'US')),

                    # Client information
                    'client_country': str(gdacs_event.client_country),
                    'client_address': str(gdacs_event.client_address),
                    'distance_km': float(gdacs_event.distance_km) if gdacs_event.distance_km else 0.0,

                    # Analysis scores
                    'disaster_severity_score': float(analysis.disaster_severity_score),
                    'response_capability_score': float(analysis.response_capability_score),
                    'response_timeliness_score': float(analysis.response_timeliness_score),
                    'disaster_relevance_score': float(analysis.disaster_relevance_score),
                    'gdacs_alignment_score': float(analysis.gdacs_alignment_score),
                    'media_coverage_score': float(analysis.media_coverage_score),
                    'final_risk_score': float(analysis.final_risk_score),
                    'confidence_score': float(analysis.confidence_score),

                    # Analysis results
                    'is_disaster_related': bool(analysis.is_disaster_related),
                    'severity_level': str(analysis.severity_level),
                    'response_level': str(analysis.response_level),
                    'timeliness_level': str(analysis.timeliness_level),
                    'relevance_level': str(analysis.relevance_level),
                    'gdacs_alignment_level': str(analysis.gdacs_alignment_level),
                    'media_coverage_level': str(analysis.media_coverage_level),
                    'primary_disaster_type': str(analysis.primary_disaster_type),
                    'client_country_match': bool(analysis.client_country_match),

                    # GPT analysis details
                    'gpt_disaster_keywords': ', '.join(analysis.gpt_disaster_keywords),
                    'gpt_response_keywords': ', '.join(analysis.gpt_response_keywords),
                    'gpt_timeliness_keywords': ', '.join(analysis.gpt_timeliness_keywords),
                    'gpt_analysis_summary': str(analysis.gpt_analysis_summary)[:500],
                    'gdacs_context_analysis': str(analysis.gdacs_context_analysis)[:300],

                    # Client risk scores keyed by address
                    'client_risk_scores_json': json.dumps(analysis.client_risk_scores) if analysis.client_risk_scores else '{}',
                    'matched_client_addresses': ', '.join(analysis.matched_client_addresses) if hasattr(analysis, 'matched_client_addresses') else '',
                    'matched_location': str(analysis.matched_location) if hasattr(analysis, 'matched_location') else ''
                }

                if analysis.client_risk_scores:
                    risk_summary = []
                    for client_address, risk_score in analysis.client_risk_scores.items():
                        score_value = float(risk_score)
                        # One column per address: punctuation and whitespace
                        # become '_' and the name is limited to 50 characters
                        clean_address = re.sub(r'[^\w\s]', '_', client_address)[:50]
                        clean_address = re.sub(r'\s+', '_', clean_address)
                        row[f'risk_{clean_address}'] = score_value
                        risk_summary.append(f"{_shorten_address(client_address)}: {score_value:.1f}")
                    row['client_risk_scores_summary'] = '; '.join(risk_summary)
                else:
                    row['client_risk_scores_summary'] = 'No client risk scores calculated'

                rows.append(row)

            except Exception as e:
                # A malformed article/analysis pair is reported and skipped so
                # the remaining rows are still exported
                print(f"❌ Error processing article {i} for event {gdacs_event.event_id}: {e}")
                continue

    return rows


def _build_event_summaries(results: List[Dict]) -> List[Dict]:
    """Build one summary row per event that has at least one analysis."""
    summaries = []

    for result in results:
        gdacs_event = result['gdacs_event']
        analyses = result['analyses']
        articles = result['articles']
        date_range = result.get('date_range', {})
        localization = result.get('localization', {'lang': 'en', 'country': 'US'})

        if not analyses:
            continue

        gpt_success_count = sum(
            1 for article in articles
            if article.get('extraction_method', 'unknown') == 'gpt_success'
        )

        all_client_risks = []
        unique_clients = set()
        for analysis in analyses:
            if analysis.client_risk_scores:
                for address, score in analysis.client_risk_scores.items():
                    all_client_risks.append(float(score))
                    unique_clients.add(address)

        summaries.append({
            'Event_ID': gdacs_event.event_id,
            'Event_Name': gdacs_event.event_name,
            'Event_Type': gdacs_event.event_type,
            'Alert_Level': gdacs_event.alert_level,
            'Alert_Score': gdacs_event.alert_score,
            'Countries': gdacs_event.disaster_country,
            'Search_From_Date': date_range.get('from_date', ''),
            'Search_To_Date': date_range.get('to_date', ''),
            'Search_Language': localization.get('lang', 'en'),
            'Search_Country': localization.get('country', 'US'),
            'Articles_Count': len(analyses),
            'GPT_URL_Extractions': len(articles),
            'GPT_URL_Successes': gpt_success_count,
            'GPT_Success_Rate': f"{(gpt_success_count/len(articles)*100):.1f}%" if articles else "0%",
            'Avg_Disaster_Severity': np.mean([a.disaster_severity_score for a in analyses]),
            'Avg_Response_Capability': np.mean([a.response_capability_score for a in analyses]),
            'Avg_Response_Timeliness': np.mean([a.response_timeliness_score for a in analyses]),
            'Avg_Final_Risk': np.mean([a.final_risk_score for a in analyses]),
            'Avg_Confidence': np.mean([a.confidence_score for a in analyses]),
            'Client_Matches': len([a for a in analyses if a.client_country_match]),
            'Unique_Client_Addresses': len(unique_clients),
            'Client_Addresses_List': ', '.join(_shorten_address(addr) for addr in sorted(unique_clients)) if unique_clients else 'None',
            'Avg_Client_Risk_Score': np.mean(all_client_risks) if all_client_risks else 0.0,
            'Max_Client_Risk_Score': max(all_client_risks) if all_client_risks else 0.0,
            'Min_Client_Risk_Score': min(all_client_risks) if all_client_risks else 0.0,
            'Total_Client_Risk_Assessments': len(all_client_risks)
        })

    return summaries


def _build_client_risk_rows(results: List[Dict]) -> List[Dict]:
    """Build one row per (event, article, client address) risk score."""
    details = []

    for result in results:
        gdacs_event = result['gdacs_event']
        for i, analysis in enumerate(result['analyses']):
            if not analysis.client_risk_scores:
                continue
            for client_address, risk_score in analysis.client_risk_scores.items():
                # Label the same numeric value that is written to the sheet
                score_value = float(risk_score)
                details.append({
                    'Event_ID': gdacs_event.event_id,
                    'Event_Name': gdacs_event.event_name,
                    'Event_Type': gdacs_event.event_type,
                    'Alert_Level': gdacs_event.alert_level,
                    'Article_Index': i,
                    'Client_Address': client_address,
                    'Risk_Score': score_value,
                    'Risk_Level': _risk_level_label(score_value)
                })

    return details


def export_results_to_excel(results: List[Dict], filename: Optional[str] = None) -> Optional[str]:
    """Export analysis results to a multi-sheet Excel workbook.

    Sheets written: ``Disaster_Analysis`` (one row per article/analysis
    pair), ``Summary_Statistics`` (one row per event with analyses),
    ``Client_Risk_Details`` (one row per client risk score) and
    ``Export_Info`` (metadata). The two middle sheets are omitted when they
    would be empty.

    Args:
        results: Per-event result dicts with ``gdacs_event``, ``articles``
            and ``analyses`` keys, and optionally ``date_range`` and
            ``localization``.
        filename: Target path. When omitted, a timestamped
            ``disaster_analysis_localized_*.xlsx`` name in the current
            directory is used.

    Returns:
        The path that was written. If the workbook cannot be created the
        main sheet is written as CSV next to the requested file (same name,
        ``.csv`` extension) and that path is returned. ``None`` is returned
        when there are no rows to export.
    """
    if not filename:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"disaster_analysis_localized_{timestamp}.xlsx"

    print(f"📊 Exporting results to Excel: {filename}")

    export_rows = _build_article_rows(results)
    if not export_rows:
        print("❌ No data to export")
        return None

    df_export = pd.DataFrame(export_rows)
    df_summary = pd.DataFrame(_build_event_summaries(results))
    client_risk_details = _build_client_risk_rows(results)
    df_client_risks = pd.DataFrame(client_risk_details)

    try:
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            # Main analysis results
            df_export.to_excel(writer, sheet_name='Disaster_Analysis', index=False)

            # Summary statistics
            if not df_summary.empty:
                df_summary.to_excel(writer, sheet_name='Summary_Statistics', index=False)

            # Client risk details
            if not df_client_risks.empty:
                df_client_risks.to_excel(writer, sheet_name='Client_Risk_Details', index=False)
                print(f"✅ Client Risk Details sheet created with {len(df_client_risks)} risk assessments using addresses")

            # Metadata sheet
            metadata = {
                'Export_Date': [datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
                'Total_Articles': [len(df_export)],
                'Total_Events': [len(results)],
                'Total_Client_Risk_Assessments': [len(df_client_risks)],
                'System_Version': ['Enhanced Disaster Analysis v3.3 - PyGoogleNews with Localization'],
                'Description': ['GDACS Event Media Analysis with PyGoogleNews, GPT Country/Language Detection, Localized Search, and Client Address Risk Scoring'],
                'News_API': ['PyGoogleNews with country/language localization'],
                'Localization_Features': ['GPT-powered country/language detection, Localized disaster terminology, Client address-based risk scoring'],
                'Date_Range_Feature': ['Automatic extraction from Excel From_Date/To_Date for targeted news search'],
                'New_Features': ['PyGoogleNews API integration, GPT language localization, Client address risk scoring instead of country codes']
            }
            pd.DataFrame(metadata).to_excel(writer, sheet_name='Export_Info', index=False)

        print(f"✅ Successfully exported {len(df_export)} article analyses with localization")
        print("🌍 Localization: PyGoogleNews with GPT country/language detection")
        print(f"📊 Client Risk Scores: {len(df_client_risks)} assessments using client addresses" if client_risk_details else "⚠️ No client risk scores found")
        return filename

    except Exception as e:
        print(f"❌ Error creating Excel file: {e}")

        # Fallback to CSV. Swap only the final extension: a plain
        # str.replace('.xlsx', '.csv') leaves other extensions untouched (so
        # CSV text would land in the requested file) and also rewrites
        # '.xlsx' inside directory names.
        csv_filename = os.path.splitext(filename)[0] + '.csv'
        df_export.to_csv(csv_filename, index=False, encoding='utf-8')
        print(f"📄 Exported to CSV instead: {csv_filename}")
        return csv_filename

# ==========================================
# MAIN EXECUTION FUNCTIONS (UPDATED)
# ==========================================

def run_enhanced_disaster_analysis():
    """Run the interactive, menu-driven disaster analysis session.

    Prompts for an OpenAI API key and an Excel workbook path, builds an
    ``EnhancedDisasterAnalyzer`` and then loops over a six-option menu
    (single row, batch, quick demo, export, statistics, exit) until the
    user chooses to exit.

    The results of the most recent successful single-row, batch or demo run
    are kept in memory so that menu option 4 can export them later.

    Returns:
        None. All feedback is printed to stdout. Any exception raised while
        the session is running is reported and ends the session.
    """

    print("ENHANCED DISASTER NEWS ANALYSIS SYSTEM v3.3")
    print("=" * 80)
    print("Complete system with PyGoogleNews, GPT localization, client address risk scoring")
    print("NEW: PyGoogleNews API with GPT-powered country/language detection and localized search")
    print("Features: Localized news search, GPT country analysis, Client address risk assessment, Excel export")
    print("=" * 80)

    # Get API key
    api_key = input("\nEnter your OpenAI API key: ").strip()
    if not api_key:
        print("API key is required. Exiting...")
        return

    # Get Excel file path
    excel_path = input("\nEnter Excel file path (or press Enter for default): ").strip()
    if not excel_path:
        excel_path = "enhanced_disaster_analysis_400.0km_buffer10.0km 2.xlsx"

    try:
        # Initialize analyzer
        print("\nInitializing Enhanced Disaster Analyzer with PyGoogleNews localization...")
        analyzer = EnhancedDisasterAnalyzer(api_key, excel_path)

        if analyzer.excel_processor.get_total_rows() == 0:
            print("No data found in Excel file. Please check the file path and format.")
            return

        print("✅ PyGoogleNews Localization: System will detect client country language and localize search terms")
        print("🌍 Client address-based risk scoring replaces country codes")

        # Results of the latest successful run; consumed by menu option 4
        stored_results = []

        # Main menu loop
        while True:
            print("\n" + "=" * 60)
            print("ENHANCED DISASTER ANALYSIS MENU - PYGOOGLENEWS LOCALIZED VERSION")
            print("=" * 60)
            print("1. Process single row")
            print("2. Process batch of rows")
            print("3. Quick demo (first 3 rows)")
            print("4. Export results to Excel")
            print("5. System statistics")
            print("6. Exit")
            print("=" * 60)

            choice = input("\nSelect option (1-6): ").strip()

            if choice == '1':
                # Single row processing
                total_rows = analyzer.excel_processor.get_total_rows()
                print(f"\nTotal rows available: {total_rows}")

                try:
                    row_index = int(input(f"Enter row index (0-{total_rows-1}): "))
                    if 0 <= row_index < total_rows:
                        result = analyzer.process_single_row(row_index)
                        if result:
                            stored_results = [result]
                            print("\nSingle row analysis completed!")
                            print(f"Articles found: {len(result['articles'])}")
                            print(f"Localization: {result['localization']['lang']}-{result['localization']['country']}")
                            print(f"Analyses completed: {len(result['analyses'])}")
                            print(f"Date range used: {result['date_range']['from_date']} to {result['date_range']['to_date']}")

                            summary = result['summary']
                            print(f"Average severity score: {summary['avg_disaster_severity']:.1f}")
                            print(f"Average response score: {summary['avg_response_capability']:.1f}")
                            print(f"Average client risk: {summary['avg_client_risk']:.1f}")
                        else:
                            print("Single row processing failed")
                    else:
                        print(f"Invalid row index. Must be 0-{total_rows-1}")
                except ValueError:
                    print("Please enter a valid number")

            elif choice == '2':
                # Batch processing
                total_rows = analyzer.excel_processor.get_total_rows()
                print(f"\nTotal rows available: {total_rows}")

                try:
                    start_row = int(input(f"Enter start row (0-{total_rows-1}): "))
                    num_rows = int(input("Enter number of rows to process: "))

                    if 0 <= start_row < total_rows and num_rows > 0:
                        print(f"\nProcessing {num_rows} rows starting from row {start_row}...")
                        print("🌍 Using PyGoogleNews with GPT localization and client address risk scoring")
                        results = analyzer.process_batch_rows(start_row, num_rows)

                        if results:
                            stored_results = results
                            print("\nBatch processing completed!")
                            print(f"Processed {len(results)} rows successfully")
                            print(f"Localizations: {analyzer.stats['country_localizations']} countries, {analyzer.stats['term_localizations']} term sets")

                            # Auto-export to Excel
                            print("\nExporting results to Excel...")
                            filename = export_results_to_excel(results)
                            if filename:
                                print(f"Results saved to: {filename}")
                        else:
                            print("Batch processing failed")
                    else:
                        print("Invalid parameters")
                except ValueError:
                    print("Please enter valid numbers")

            elif choice == '3':
                # Quick demo
                print("\nRunning quick demo (first 3 rows) with PyGoogleNews localization...")
                results = analyzer.process_batch_rows(0, 3)

                if results:
                    stored_results = results
                    print(f"\nDemo completed! Processed {len(results)} rows")

                    for result in results:
                        event = result['gdacs_event']
                        summary = result['summary']
                        date_range = result['date_range']
                        localization = result['localization']
                        print(f"\nEvent: {event.event_name}")
                        print(f"  Type: {event.event_type} | Alert: {event.alert_level}")
                        print(f"  Client: {event.client_country} | Address: {event.client_address[:30]}...")
                        print(f"  Localization: {localization['lang']}-{localization['country']}")
                        print(f"  Search dates: {date_range['from_date']} to {date_range['to_date']}")
                        print(f"  Articles: {summary['total_articles']}")
                        print(f"  Avg Risk Score: {summary['avg_final_risk']:.1f}")

                    # Export demo results
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    filename = export_results_to_excel(results, f"demo_localized_{timestamp}.xlsx")
                    if filename:
                        print(f"Demo results saved to: {filename}")
                else:
                    print("Demo failed")

            elif choice == '4':
                # Export to Excel
                if stored_results:
                    print(f"\nExporting {len(stored_results)} processed results to Excel...")
                    custom_filename = input("Enter filename (or press Enter for auto-generated): ").strip()

                    if custom_filename and not custom_filename.endswith('.xlsx'):
                        custom_filename += '.xlsx'

                    # An empty answer becomes None so the exporter picks its own name
                    filename = export_results_to_excel(stored_results, custom_filename or None)
                    if filename:
                        print(f"Results exported successfully to: {filename}")
                        print("Format: Complete analysis with PyGoogleNews localization and client address risk scoring")
                else:
                    print("No results available for export. Please process some data first.")

            elif choice == '5':
                # System statistics
                stats = analyzer.stats
                url_attempts = stats['gpt_url_extractions']
                url_successes = stats['gpt_url_successes']
                # Compute the rate up front so the label is printed even when
                # nothing was attempted (and division by zero is avoided).
                url_success_rate = (url_successes / url_attempts * 100) if url_attempts > 0 else 0.0

                print("\nSYSTEM STATISTICS - PYGOOGLENEWS LOCALIZED VERSION")
                print("=" * 50)
                print(f"Rows processed: {stats['rows_processed']}")
                print(f"Articles fetched: {stats['articles_fetched']}")
                print(f"Country localizations: {stats['country_localizations']}")
                print(f"Term localizations: {stats['term_localizations']}")
                print(f"GPT URL extractions attempted: {url_attempts}")
                print(f"GPT URL extractions successful: {url_successes}")
                print(f"GPT URL success rate: {url_success_rate:.1f}%")
                print(f"GPT analyses: {stats['gpt_analyses']}")
                print(f"Client risk calculations: {stats['client_risk_calculations']}")
                print(f"API calls made: {stats['api_calls']}")
                print(f"Successful analyses: {stats['successful_analyses']}")
                print(f"Errors encountered: {stats['errors']}")
                print(f"Results in memory: {len(stored_results)} events")
                print("News API: PyGoogleNews with localization")
                print("Risk Scoring: Client address-based (not country codes)")

                elapsed = time.time() - stats['start_time']
                print(f"Total runtime: {elapsed/60:.1f} minutes")

            elif choice == '6':
                print("\nExiting Enhanced Disaster Analysis System")
                break

            else:
                print("\nInvalid choice. Please select 1-6.")

    except Exception as e:
        # Top-level boundary of the interactive session: report and stop
        print(f"\nSystem error: {e}")
        print("Please check your inputs and try again")

def quick_demo_analysis():
    """Run a short demonstration on the first two rows of the default workbook.

    Prompts for an OpenAI API key, analyzes rows 0-1 of
    ``enhanced_disaster_analysis_400.0km_buffer10.0km 2.xlsx``, prints a
    per-event summary plus run statistics, and exports the results to a
    timestamped ``demo_localized_*.xlsx`` file.

    Returns:
        None. All feedback is printed to stdout; errors raised during the
        demo are reported rather than propagated.
    """

    print("QUICK DEMO - Enhanced Disaster Analysis System v3.3")
    print("=" * 60)

    api_key = input("Enter OpenAI API key for demo: ").strip()
    if not api_key:
        print("API key required for demo")
        return

    excel_path = "enhanced_disaster_analysis_400.0km_buffer10.0km 2.xlsx"

    try:
        print("\nInitializing system for demo with PyGoogleNews localization...")
        analyzer = EnhancedDisasterAnalyzer(api_key, excel_path)

        if analyzer.excel_processor.get_total_rows() == 0:
            print("No data found. Please check Excel file path.")
            return

        print("\nRunning demo analysis on first 2 rows...")
        print("News API: PyGoogleNews with GPT country/language detection")
        print("Risk Scoring: Client addresses instead of country codes")
        results = analyzer.process_batch_rows(0, 2)

        if results:
            print("\nDemo completed successfully!")
            print(f"Processed {len(results)} events")

            for i, result in enumerate(results, 1):
                event = result['gdacs_event']
                summary = result['summary']
                date_range = result['date_range']
                localization = result['localization']

                print(f"\nEVENT {i}: {event.event_name}")
                print(f"  Type: {event.event_type} | Alert: {event.alert_level}")
                print(f"  Country: {event.disaster_country}")
                print(f"  Client Address: {event.client_address[:50]}...")
                print(f"  Localization Used: {localization['lang']}-{localization['country']}")
                print(f"  Search From_Date: {date_range['from_date']}")
                print(f"  Search To_Date: {date_range['to_date']}")
                print(f"  Articles Found: {summary['total_articles']}")
                print(f"  Disaster Severity: {summary['avg_disaster_severity']:.1f}/100")
                print(f"  Response Capability: {summary['avg_response_capability']:.1f}/100")
                print(f"  Final Risk Score: {summary['avg_final_risk']:.1f}/100")
                print(f"  Client Risk Score: {summary['avg_client_risk']:.1f}/10")

            stats = analyzer.stats
            url_attempts = stats['gpt_url_extractions']
            # Compute the rate up front so the label is printed even when
            # nothing was attempted (and division by zero is avoided).
            url_success_rate = (stats['gpt_url_successes'] / url_attempts * 100) if url_attempts > 0 else 0.0

            print("\nDEMO STATISTICS:")
            print(f"API calls: {stats['api_calls']}")
            print(f"Country localizations: {stats['country_localizations']}")
            print(f"Term localizations: {stats['term_localizations']}")
            print(f"GPT URL analysis success rate: {url_success_rate:.1f}%")
            print(f"Client risk calculations: {stats['client_risk_calculations']}")
            print(f"Processing time: {(time.time() - stats['start_time'])/60:.1f} minutes")
            print("News API: PyGoogleNews with localization")
            print("Risk Scoring: Client address-based")

            # Export demo results
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = export_results_to_excel(results, f"demo_localized_{timestamp}.xlsx")
            if filename:
                print(f"Demo results exported to: {filename}")
                print("Export includes PyGoogleNews localization tracking and client address risk data")

            print("\nDemo completed! System ready for full analysis with PyGoogleNews localization.")

        else:
            print("Demo failed - no results generated")

    except Exception as e:
        # Top-level boundary of the demo: report instead of crashing
        print(f"Demo error: {e}")

def batch_analysis(api_key: str, excel_path: str, start_row: int = 0, num_rows: Optional[int] = None) -> Optional[Dict]:
    """Analyze a range of workbook rows non-interactively and export the results.

    Args:
        api_key: OpenAI API key passed to ``EnhancedDisasterAnalyzer``.
        excel_path: Path of the Excel workbook passed to the analyzer.
        start_row: Zero-based index of the first row to process. Must satisfy
            ``0 <= start_row < total rows``, matching the check used by the
            interactive menu.
        num_rows: Number of rows to process; ``None`` means every row from
            ``start_row`` to the end of the workbook. Must be positive when
            given.

    Returns:
        A dict with the keys ``results``, ``stats``, ``total_processed``,
        ``processing_time`` (minutes), ``export_filename``,
        ``country_localizations``, ``term_localizations``,
        ``gpt_url_extractions``, ``gpt_url_successes`` and
        ``gpt_url_success_rate`` (percent), or ``None`` when the arguments are
        invalid, the workbook has no rows, no results are produced, or an
        exception is raised.
    """

    try:
        print("Starting batch analysis with PyGoogleNews localization...")
        print(f"Excel file: {excel_path}")
        print(f"Start row: {start_row}")
        # Compare against None so an explicit 0 is not reported as "all rows"
        print(f"Number of rows: {num_rows if num_rows is not None else 'All remaining'}")
        print("News API: PyGoogleNews with GPT country/language detection")
        print("Risk Scoring: Client address-based")

        # Reject impossible ranges before paying for analyzer start-up
        if start_row < 0:
            print(f"Start row {start_row} must be 0 or greater")
            return None
        if num_rows is not None and num_rows <= 0:
            print(f"Number of rows must be positive, got {num_rows}")
            return None

        analyzer = EnhancedDisasterAnalyzer(api_key, excel_path)
        total_rows = analyzer.excel_processor.get_total_rows()

        if total_rows == 0:
            print("No data found in Excel file")
            return None

        if start_row >= total_rows:
            print(f"Start row {start_row} exceeds total rows {total_rows}")
            return None

        if num_rows is None:
            num_rows = total_rows - start_row

        print(f"Processing {num_rows} rows starting from row {start_row}...")
        results = analyzer.process_batch_rows(start_row, num_rows)

        if not results:
            print("Batch analysis failed - no results")
            return None

        # Export results automatically
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = export_results_to_excel(results, f"batch_localized_{timestamp}.xlsx")

        stats = analyzer.stats
        url_attempts = stats['gpt_url_extractions']
        url_successes = stats['gpt_url_successes']

        return {
            'results': results,
            'stats': stats,
            'total_processed': len(results),
            'processing_time': (time.time() - stats['start_time']) / 60,
            'export_filename': filename,
            'country_localizations': stats['country_localizations'],
            'term_localizations': stats['term_localizations'],
            'gpt_url_extractions': url_attempts,
            'gpt_url_successes': url_successes,
            'gpt_url_success_rate': (url_successes / url_attempts * 100) if url_attempts > 0 else 0
        }

    except Exception as e:
        # Programmatic entry point: callers get None instead of an exception
        print(f"Batch analysis error: {e}")
        return None

# ==========================================
# SYSTEM VALIDATION AND HEALTH CHECK (UPDATED)
# ==========================================

def validate_system_requirements(required_packages: Optional[Dict[str, str]] = None) -> bool:
    """Check that every required package can be imported.

    Args:
        required_packages: Optional mapping of pip distribution name to the
            module name used in ``import`` statements. The two differ for
            some packages (``beautifulsoup4`` is imported as ``bs4``), so
            the pip name cannot be used for the import test. When omitted,
            the system's default dependency list is checked.

    Returns:
        True when every module imports; False otherwise, after printing the
        missing pip names and a ready-to-run ``pip install`` command.
    """
    import importlib  # local import: not among the file-level imports

    if required_packages is None:
        required_packages = {
            'openai': 'openai',
            'pygooglenews': 'pygooglenews',
            'beautifulsoup4': 'bs4',
            'pandas': 'pandas',
            'plotly': 'plotly',
            'requests': 'requests',
            'lxml': 'lxml',
            'openpyxl': 'openpyxl',
            'numpy': 'numpy',
        }

    missing_packages = []
    for pip_name, module_name in required_packages.items():
        try:
            importlib.import_module(module_name)
        except ImportError:
            # Report the pip name so the install hint below is usable as-is
            missing_packages.append(pip_name)

    if missing_packages:
        print(f"Missing required packages: {', '.join(missing_packages)}")
        print("Please install them using: pip install " + ' '.join(missing_packages))
        return False

    print("All required packages are available")
    return True

def system_health_check():
    """Run the environment checks and print a health report.

    Checks performed: required packages importable, default Excel workbook
    present in the working directory, ``https://news.google.com`` reachable
    with HTTP 200, ``GoogleNews`` can be instantiated, and
    ``parse_date_input`` accepts three sample date formats.

    Returns:
        bool: True when the package, internet, PyGoogleNews and date-parsing
        checks all pass. The Excel file check is reported and, when the file
        is missing, listed under recommendations, but it does not affect the
        returned value.
    """
    print("SYSTEM HEALTH CHECK - Enhanced Disaster Analysis System v3.3")
    print("=" * 60)

    packages_ok = validate_system_requirements()

    # Check for sample Excel file
    excel_file = "enhanced_disaster_analysis_400.0km_buffer10.0km 2.xlsx"
    excel_exists = os.path.exists(excel_file)
    print(f"\nExcel File Status: {'✓ Found' if excel_exists else '✗ Missing'} ({excel_file})")

    # Test internet connection
    try:
        response = requests.get("https://news.google.com", timeout=10)
        internet_ok = response.status_code == 200
        print(f"Internet Connection: {'✓ OK' if internet_ok else '✗ Failed'}")
    except Exception:
        # Best-effort probe: any failure counts as "offline", but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        internet_ok = False
        print("Internet Connection: ✗ Failed")

    # Test PyGoogleNews initialization
    try:
        GoogleNews(lang='en', country='US')
        pygooglenews_ok = True
        print("PyGoogleNews API: ✓ OK")
    except Exception:
        # Also covers NameError when the pygooglenews import failed at load
        pygooglenews_ok = False
        print("PyGoogleNews API: ✗ Failed")

    # Test date parsing functionality
    test_dates = ["2024-01-15", "15/01/2024", "01-15-2024"]
    date_parsing_ok = all(parse_date_input(test_date) is not None for test_date in test_dates)
    print(f"Date Parsing: {'✓ OK' if date_parsing_ok else '✗ Failed'}")

    overall_health = packages_ok and internet_ok and pygooglenews_ok and date_parsing_ok
    print(f"\nOVERALL SYSTEM HEALTH: {'✓ GOOD' if overall_health else '✗ ISSUES DETECTED'}")
    print(f"PyGoogleNews Localization: {'✓ ACTIVE' if pygooglenews_ok else '✗ INACTIVE'}")
    print("GPT Country/Language Detection: ✓ ACTIVE")
    print("Client Address Risk Scoring: ✓ ACTIVE")
    print("DATE RANGE INTEGRATION: ✓ ACTIVE (Excel From_Date/To_Date)")

    # The Excel file is not part of overall_health, so test it separately;
    # otherwise its recommendation is never shown when everything else passes.
    if not overall_health or not excel_exists:
        print("\nRECOMMENDATIONS:")
        if not packages_ok:
            print("   - Install missing packages with pip install")
        if not internet_ok:
            print("   - Check internet connection for news search")
        if not pygooglenews_ok:
            print("   - Install PyGoogleNews: pip install pygooglenews")
        if not excel_exists:
            print("   - Ensure Excel file is in the correct location")
        if not date_parsing_ok:
            print("   - Date parsing functionality issue detected")

    return overall_health

# ==========================================
# STARTUP AND MAIN EXECUTION (UPDATED)
# ==========================================

# Startup banner: printed whenever this code is executed or imported.
# It lists the public entry points; no validation happens here.
print("\n".join((
    "\n" + "=" * 80,
    "ENHANCED DISASTER NEWS ANALYSIS SYSTEM v3.3 - PYGOOGLENEWS WITH LOCALIZATION",
    "=" * 80,
    "Features: PyGoogleNews API • GPT Country/Language Detection • Localized Search • Client Address Risk Assessment",
    "NEW: PyGoogleNews replaces RSS feeds with GPT-powered localization and client address-based risk scoring",
    "Capabilities: Localized news search • GPT country analysis • Client address risk scoring • Comprehensive Excel export",
    "Status: Production Ready • PyGoogleNews Active • GPT Localization Active • Client Address Risk Scoring Active",
    "=" * 80,
    "Quick Start: run_enhanced_disaster_analysis()",
    "Demo Mode: quick_demo_analysis()",
    "Batch Mode: batch_analysis(api_key, excel_path, start_row, num_rows)",
    "Health Check: system_health_check()",
    "=" * 80,
)))

# Main execution block
if __name__ == "__main__":
    # Command-line entry point for the disaster analysis system.
    # With no arguments the interactive menu starts; otherwise the first
    # argument selects --demo, --batch, --health or --help.

    print("ENHANCED DISASTER NEWS ANALYSIS SYSTEM v3.3")
    print("=" * 80)
    print("Complete system with PyGoogleNews, GPT localization, and client address risk scoring")
    print("PyGoogleNews API with GPT country/language detection and localized disaster terminology")
    print("Automatic From_Date/To_Date extraction from Excel for targeted news search")
    print("Client address-based risk assessment replacing country codes")
    print("=" * 80)

    # Handle command line arguments
    if len(sys.argv) > 1:
        if sys.argv[1] == "--demo":
            quick_demo_analysis()
        elif sys.argv[1] == "--batch":
            if len(sys.argv) >= 4:
                api_key = sys.argv[2]
                excel_path = sys.argv[3]
                try:
                    start_row = int(sys.argv[4]) if len(sys.argv) > 4 else 0
                    num_rows = int(sys.argv[5]) if len(sys.argv) > 5 else None
                except ValueError:
                    # Non-numeric row arguments: explain instead of a traceback
                    print("Error: start_row and num_rows must be integers")
                    print("Usage: python script.py --batch <api_key> <excel_path> [start_row] [num_rows]")
                else:
                    result = batch_analysis(api_key, excel_path, start_row, num_rows)
                    if result:
                        print("Batch analysis completed successfully!")
                        print(f"Processed {result['total_processed']} rows in {result['processing_time']:.1f} minutes")
                        print(f"Localizations: {result['country_localizations']} countries, {result['term_localizations']} term sets")
                        print(f"GPT URL Analysis: {result['gpt_url_successes']}/{result['gpt_url_extractions']} successful ({result['gpt_url_success_rate']:.1f}%)")
                        print(f"Results exported to: {result['export_filename']}")
                    else:
                        print("Batch analysis failed")
            else:
                print("Usage: python script.py --batch <api_key> <excel_path> [start_row] [num_rows]")
        elif sys.argv[1] == "--health":
            system_health_check()
        elif sys.argv[1] == "--help":
            print("""
DISASTER ANALYSIS SYSTEM HELP v3.3 - PYGOOGLENEWS WITH LOCALIZATION
====================================================================

USAGE OPTIONS:
1. Interactive Mode (Default):
   python disaster_analysis.py

2. Demo Mode:
   python disaster_analysis.py --demo

3. Batch Processing:
   python disaster_analysis.py --batch API_KEY excel_file.xlsx [start_row] [num_rows]

4. Health Check:
   python disaster_analysis.py --health

5. Help:
   python disaster_analysis.py --help

NEW FEATURES IN v3.3:
- PyGoogleNews API: Replaces RSS-based Google News search with proper API
- GPT Country/Language Detection: AI-powered analysis of client countries and languages
- Localized Search Terms: GPT translates disaster terms to local languages (e.g., "huracán" in Mexico)
- Client Address Risk Scoring: Uses full client addresses instead of country codes in reports
- Enhanced Localization: Supports 20+ languages with proper country/language combinations

PYGOOGLENEWS LOCALIZATION FUNCTIONALITY:
- Analyzes Client_Country and Client_Address to determine appropriate language/country
- Uses GPT to translate disaster terms to local terminology
- Initializes PyGoogleNews with correct lang/country parameters
- Searches with localized terms for better local news coverage
- Tracks localization usage in Excel exports

CLIENT ADDRESS RISK SCORING:
- Risk scores now use full Client_Address names instead of codes like "MX_1"
- Provides clear geographical context for risk assessment
- Excel export shows actual warehouse/facility addresses
- Better identification of specific locations at risk

EXAMPLES:
- Run demo with localization: python disaster_analysis.py --demo
- Process all rows with localization: python disaster_analysis.py --batch sk-xxx... data.xlsx
- Process rows 5-15: python disaster_analysis.py --batch sk-xxx... data.xlsx 5 10
- Check system including PyGoogleNews: python disaster_analysis.py --health

REQUIREMENTS:
- OpenAI API key (for GPT localization and analysis)
- Excel file with Client_Country, Client_Address, From_Date and To_Date columns
- Internet connection for PyGoogleNews API
- pip install pygooglenews

LOCALIZATION PROCESS:
1. Read Client_Country and Client_Address from Excel
2. Use GPT to determine appropriate language and country codes
3. Initialize PyGoogleNews with localized settings
4. Translate disaster terms to local language using GPT
5. Search with localized terms for better regional coverage
6. Track localization methods in Excel export

EXCEL OUTPUT ENHANCEMENTS:
- search_language and search_country columns show localization used
- client_risk_scores use actual Client_Address names
- Client_Addresses_List replaces Client_Countries_List in summary
- Localization tracking in metadata sheet

SUPPORTED LANGUAGES:
English, Spanish, Chinese, Japanese, Korean, French, German, Italian, Portuguese,
Russian, Arabic, Hindi, Thai, Vietnamese, Dutch, Swedish, Danish, Norwegian,
Finnish, Polish, Turkish, and more via GPT detection

SYSTEM ADVANTAGES:
- Better local news coverage through proper localization
- More accurate disaster terminology for each region
- Clear client address identification in risk reports
- Improved cultural and linguistic relevance
- Future-proof API-based news access
            """)
        else:
            print(f"Unknown option: {sys.argv[1]}. Use --help for available options.")
    else:
        # Run interactive system
        run_enhanced_disaster_analysis()

# System validation when imported as module
if __name__ != "__main__":
    try:
        # Only announce a successful load when the dependency check passed;
        # on failure validate_system_requirements() has already printed the
        # missing packages.
        if validate_system_requirements():
            print("Enhanced Disaster News Analysis System v3.3 loaded successfully!")
            print("PyGoogleNews Localization: Active - GPT-powered country/language detection and localized search")
            print("Client Address Risk Scoring: Active - Full addresses replace country codes in reports")
            print("Date Range Integration: Active - Automatic extraction from Excel From_Date/To_Date")
            print("System uses PyGoogleNews API with GPT localization for superior regional news coverage")
            print("Ready to analyze disaster news with complete localization, client address risk assessment, and GDACS intelligence")
            print("Use run_enhanced_disaster_analysis() to start the interactive system")
            print("Or use batch_analysis() for programmatic processing")
            print("System validated and ready for production use with PyGoogleNews localization!")
        else:
            print("System validation warning: required packages are missing (see list above)")
            print("System may still function, but some features might be limited")
    except Exception as e:
        print(f"System validation warning: {e}")
        print("System may still function, but some features might be limited")

✅ Using OpenAI v1.0+ API
✅ Using PyGoogleNews API

ENHANCED DISASTER NEWS ANALYSIS SYSTEM v3.3 - PYGOOGLENEWS WITH LOCALIZATION
Features: PyGoogleNews API • GPT Country/Language Detection • Localized Search • Client Address Risk Assessment
NEW: PyGoogleNews replaces RSS feeds with GPT-powered localization and client address-based risk scoring
Capabilities: Localized news search • GPT country analysis • Client address risk scoring • Comprehensive Excel export
Status: Production Ready • PyGoogleNews Active • GPT Localization Active • Client Address Risk Scoring Active
Quick Start: run_enhanced_disaster_analysis()
Demo Mode: quick_demo_analysis()
Batch Mode: batch_analysis(api_key, excel_path, start_row, num_rows)
Health Check: system_health_check()
ENHANCED DISASTER NEWS ANALYSIS SYSTEM v3.3
Complete system with PyGoogleNews, GPT localization, and client address risk scoring
PyGoogleNews API with GPT country/language detection and localized disaster terminology
Automatic From_Date/To_Date extraction from Excel for targeted news search

In [None]:
# Start the interactive menu (prompts for the OpenAI API key and Excel path)
run_enhanced_disaster_analysis()

ENHANCED DISASTER NEWS ANALYSIS SYSTEM v3.3
Complete system with PyGoogleNews, GPT localization, client address risk scoring
NEW: PyGoogleNews API with GPT-powered country/language detection and localized search
Features: Localized news search, GPT country analysis, Client address risk assessment, Excel export
