In [None]:
import pandas as pd
import re
import requests
from opencage.geocoder import OpenCageGeocode
import googlemaps
import json
import time

In [None]:
def load_and_clean_data(file_path):
    """
    Read the raw CSV and prepare it for geocoding.

    Rows whose ``address`` is missing are discarded, and ``latitude`` /
    ``longitude`` columns filled with ``None`` are added (overwriting any
    columns of the same name) so a later step can fill them in.

    Args:
        file_path (str): Path to the CSV file; it is decoded as latin1.

    Returns:
        pd.DataFrame: The rows that have an address, plus the two empty
        coordinate columns.
    """
    raw = pd.read_csv(file_path, encoding='latin1')
    with_address = raw.dropna(subset=['address'])
    # None marks "not geocoded yet" in both coordinate columns.
    return with_address.assign(latitude=None, longitude=None)

In [None]:
# Street-type abbreviations and Valencian terms mapped to the Spanish word
# used in the cleaned addresses. Keys are matched against lower-cased text.
prefix_mapping = {
    'carrer': 'calle',
    'c.': 'calle',
    'c/': 'calle',
    'avinguda': 'avenida',
    'avda': 'avenida',
    'av.': 'avenida',
    'plaça': 'plaza',
    'pza': 'plaza',
    'plz': 'plaza',
    'cra': 'carretera',
    'ctra': 'carretera',
    'cmno': 'camino',
    'paseig': 'paseo',
    # Correct Valencian spelling; the single-"s" variant above is kept in case
    # it appears in the data.
    'passeig': 'paseo',
    'camí': 'camino',
    'psje': 'pasaje',
    # The key consumes the surrounding spaces, so the replacement has to put
    # them back; otherwise the neighbouring words get glued together
    # ("calle urb x" -> "calleurbanizaciónx").
    ' urb ': ' urbanización '
}

In [None]:
def clean_address_column(addresses):
    """
    Clean and standardize addresses.

    Each address is lower-cased and then processed in this order:

    1. BORME page headers that leaked into the text are removed.
    2. "número" / "sin número" fillers are removed.
    3. Street-type terms from ``prefix_mapping`` are translated.
    4. ``&`` characters are dropped.
    5. Parenthesised city names are unwrapped ("(gandia)" -> "gandia",
       "(a (b))" -> "b a").
    6. Whitespace is collapsed, trailing punctuation is trimmed and
       ", valencia" is appended.

    Args:
        addresses (pd.Series): Series containing address strings.

    Returns:
        pd.Series: Series with cleaned addresses.
    """
    page_title_pattern = re.compile(
        r'bolet[íi]n oficial del registro mercantil núm\. \d+ '
        r'(?:\w+ \d+ de \w+ de \d+|[a-z]+ \d+ [a-z]+ de \d+) '
        r'pág\. \d+ cve: borme-\w+-\d+-\d+(-\d+)?'
    )
    city_pattern = re.compile(r'\(([^()]+(?:\([^()]+\))?)\)')
    nested_city_pattern = re.compile(r'(.+?)\((.+?)\)')

    # Longest phrases first: removing "número" before "sin número" would leave
    # a dangling "sin" behind. Everything is lower-case because the address is
    # lower-cased before matching.
    unwanted_phrases = (
        "sin número", "sin numero", "sinnúmero", "sinnumero",
        "número", "numero",
    )

    # Compile one pattern per prefix so that a term only matches as a token of
    # its own and never in the middle of a word ('cra' must not rewrite
    # "sacramento"). Keys and values are stripped so padded entries behave the
    # same as unpadded ones.
    letter = r'[^\W\d_]'
    prefix_patterns = []
    for term, replacement in prefix_mapping.items():
        term = term.strip()
        if not term:
            continue
        replacement = replacement.strip()
        pattern = rf'(?<!{letter})' + re.escape(term)
        if term[-1].isalpha():
            pattern += rf'(?!{letter})'
        else:
            # Terms such as 'c/' or 'c.' are often written glued to the street
            # name ("c/mayor"); add a separator (extra spaces are collapsed).
            replacement += ' '
        prefix_patterns.append((re.compile(pattern), replacement))

    def unwrap_city(match):
        # Called per parenthesised group, so every group keeps its own text.
        city = match.group(1)
        nested_match = nested_city_pattern.match(city)
        if nested_match:
            city = f"{nested_match.group(2)} {nested_match.group(1)}"
        # Padding prevents the city from being glued to neighbouring words.
        return f" {city.strip()} "

    def standardize_address(address):
        address = address.lower()
        address = page_title_pattern.sub('', address)

        for phrase in unwanted_phrases:
            address = address.replace(phrase, '')

        for pattern, replacement in prefix_patterns:
            # A callable replacement is inserted literally (no escape handling).
            address = pattern.sub(lambda _match, text=replacement: text, address)

        address = address.replace('&', '')
        address = city_pattern.sub(unwrap_city, address)

        address = re.sub(r'\s+', ' ', address).strip()
        address = re.sub(r'[\s,.]+$', '', address)
        return f"{address}, valencia"

    return addresses.apply(standardize_address)

In [None]:
def load_api_keys(config_path):
    """
    Read the JSON configuration file that holds the API keys.

    Args:
        config_path (str): Path to the JSON configuration file.

    Returns:
        dict: The parsed JSON document (expected to map key names to keys).
    """
    with open(config_path, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [None]:
def in_province(lat, lng):
    """
    Tell whether a coordinate pair lies inside the fixed bounding box that
    this notebook uses as its "inside the province" check.

    Args:
        lat: Latitude; anything ``float()`` accepts.
        lng: Longitude; anything ``float()`` accepts. It is only converted
            when the latitude is already within range.

    Returns:
        bool: True when 38.8 <= lat <= 39.74 and -1.1 <= lng <= -0.075.
    """
    latitude = float(lat)
    if not (38.8 <= latitude <= 39.74):
        return False
    longitude = float(lng)
    return -1.1 <= longitude <= -0.075


In [None]:
def geocode_with_nominatim(address, cache, in_province, max_retries=3, timeout=10):
    """
    Geocode an address with the public Nominatim search endpoint.

    Only the first result is considered. A result accepted by
    ``in_province`` is stored in ``cache``; nothing else is cached.

    Args:
        address (str): Address to look up.
        cache (dict): Maps address -> (lat, lng); consulted first and updated
            on success.
        in_province (callable): ``in_province(lat, lng) -> bool`` filter.
        max_retries (int): How many times a non-200 response is retried
            before giving up (the original recursed without bound on 429).
        timeout (float): Seconds to wait for the HTTP response.

    Returns:
        tuple: ``(lat, lng)`` as floats on success, otherwise
        ``(None, None)`` (no result, result outside the province, network or
        decoding error, or retries exhausted).
    """
    if address in cache:
        return cache[address]

    url = "https://nominatim.openstreetmap.org/search"
    # Passed as params so requests URL-encodes the address; characters such
    # as '#', '?' or '%' would otherwise corrupt the query string.
    params = {'q': address, 'format': 'json', 'addressdetails': 1}
    headers = {
        # NOTE(review): placeholder contact; Nominatim expects a real one.
        'User-Agent': 'MyGeocodingApp/1.0 (your.email@example.com)'
    }

    attempts = max(1, max_retries + 1)
    for attempt in range(attempts):
        try:
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as error:
            # Same best-effort contract as the other geocoders: report and
            # let the caller fall back to the next provider.
            print(f"Nominatim: Request failed - {error}")
            return None, None

        if response.status_code == 200:
            try:
                data = response.json()
            except ValueError as error:
                print(f"Nominatim: Could not decode response - {error}")
                return None, None

            if not data:
                print(f"Nominatim: No data found for address '{address}'.")
                return None, None

            lat = float(data[0]['lat'])
            lng = float(data[0]['lon'])
            if in_province(lat, lng):
                result = (lat, lng)
                cache[address] = result
                return result

            print(f"Nominatim: Coordinates ({lat}, {lng}) are outside Valencia province.")
            return None, None

        if attempt == attempts - 1:
            break

        if response.status_code == 429:
            print("Nominatim: Rate limit exceeded. Retrying in 5 seconds...")
            time.sleep(5)
        else:
            print(f"Nominatim: Error {response.status_code}. Retrying...")
            time.sleep(1)

    print(f"Nominatim: Giving up on address '{address}' after {attempts} attempts.")
    return None, None

In [None]:
def geocode_with_opencage(address, opencage_geocoder, max_requests, requests_made, in_province):
    """
    Geocode an address through an OpenCage client.

    Args:
        address (str): Address to look up.
        opencage_geocoder: Object exposing ``geocode(address)`` that returns
            a sequence of results with ``['geometry']['lat'/'lng']``.
        max_requests (int): Request budget.
        requests_made (int): Requests already spent; no lookup is made once
            this reaches ``max_requests``.
        in_province (callable): ``in_province(lat, lng) -> bool`` filter.

    Returns:
        tuple: ``(lat, lng)`` as floats when the first result passes the
        filter; ``("fuera de provincia", "fuera de provincia")`` when it does
        not; ``(None, None)`` when the budget is exhausted, nothing is found
        or any exception is raised.
    """
    if requests_made >= max_requests:
        print("OpenCage: Daily request limit reached.")
        return None, None

    try:
        matches = opencage_geocoder.geocode(address)
        if not (matches and len(matches)):
            print("OpenCage: No location found for address.")
            return None, None

        geometry = matches[0]['geometry']
        lat = float(geometry['lat'])
        lng = float(geometry['lng'])
        if not in_province(lat, lng):
            print(f"OpenCage: Coordinates ({lat}, {lng}) are outside Valencia province.")
            return "fuera de provincia", "fuera de provincia"

        return lat, lng
    except Exception as e:
        # Best effort: any client/parsing failure is reported, not raised.
        print(f"OpenCage: Error occurred - {e}")
        return None, None
    

In [None]:
def geocode_with_google_maps(address, gmaps, max_requests, requests_made, in_province):
    """
    Geocode an address through a Google Maps client.

    Args:
        address (str): Address to look up.
        gmaps: Object exposing ``geocode(address)`` that returns a sequence
            of results with ``['geometry']['location']['lat'/'lng']``.
        max_requests (int): Request budget.
        requests_made (int): Requests already spent; no lookup is made once
            this reaches ``max_requests``.
        in_province (callable): ``in_province(lat, lng) -> bool`` filter.

    Returns:
        tuple: ``(lat, lng)`` exactly as reported by the client when the
        first result passes the filter;
        ``("fuera de provincia", "fuera de provincia")`` when it does not;
        ``(None, None)`` when the budget is exhausted, nothing is found or
        any exception is raised.
    """
    if requests_made >= max_requests:
        print("Google Maps: Request limit reached.")
        return None, None

    try:
        candidates = gmaps.geocode(address)
        if not candidates:
            print("Google Maps: No location found for address.")
            return None, None

        location = candidates[0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
        if not in_province(lat, lng):
            print(f"Google Maps: Coordinates ({lat}, {lng}) are outside Valencia province.")
            return "fuera de provincia", "fuera de provincia"

        return lat, lng
    except Exception as e:
        # Best effort: any client/parsing failure is reported, not raised.
        print(f"Google Maps: Error occurred - {e}")
        return None, None


In [None]:
def geocode_address(address, cache, geocoders, in_province):
    """
    Geocode one address, trying Nominatim, then OpenCage, then Google Maps.

    The first provider that returns something other than ``(None, None)``
    wins. Every such answer, including the "fuera de provincia" sentinel, is
    stored in ``cache`` so a repeated address never spends another request
    on the paid providers.

    The per-provider counters in ``geocoders`` are increased for every lookup
    that was actually attempted, whether or not it produced a result, so the
    ``max_*_requests`` limits reflect real usage.

    Args:
        address (str): Address to look up.
        cache (dict): Maps address -> (lat, lng); consulted first, updated on
            every non-empty answer.
        geocoders (dict): Holds the clients (``'opencage'``,
            ``'google_maps'``), the limits (``'max_opencage_requests'``,
            ``'max_google_requests'``) and the mutable counters
            (``'requests_made_opencage'``, ``'requests_made_google'``).
        in_province (callable): ``in_province(lat, lng) -> bool`` filter.

    Returns:
        tuple: ``(lat, lng)``,
        ``("fuera de provincia", "fuera de provincia")`` or ``(None, None)``.
    """
    if address in cache:
        return cache[address]

    lat, lng = geocode_with_nominatim(address, cache, in_province)
    if lat is not None and lng is not None:
        return lat, lng

    # Decide before the call whether a request will be attempted: once the
    # limit is reached the provider returns immediately without a lookup.
    opencage_attempted = geocoders['requests_made_opencage'] < geocoders['max_opencage_requests']
    lat, lng = geocode_with_opencage(address, geocoders['opencage'], geocoders['max_opencage_requests'], geocoders['requests_made_opencage'], in_province)
    if opencage_attempted:
        geocoders['requests_made_opencage'] += 1
    if lat is not None and lng is not None:
        # NOTE(review): an out-of-province answer from OpenCage is final here
        # (Google is not consulted), as in the original flow - confirm intent.
        cache[address] = (lat, lng)
        return lat, lng

    google_attempted = geocoders['requests_made_google'] < geocoders['max_google_requests']
    lat, lng = geocode_with_google_maps(address, geocoders['google_maps'], geocoders['max_google_requests'], geocoders['requests_made_google'], in_province)
    if google_attempted:
        geocoders['requests_made_google'] += 1
    if lat is not None and lng is not None:
        cache[address] = (lat, lng)
        return lat, lng

    return None, None


In [None]:
def update_geocode_df(df, geocoders, in_province):
    """
    Fill in the coordinate columns of ``df`` in place, row by row.

    Rows where ``latitude`` or ``longitude`` is missing are geocoded from
    their ``cleaned_address`` and both columns are overwritten with the
    result (coordinates, the "fuera de provincia" marker, or ``None``).
    Rows that already have both values are left untouched. A progress line
    is printed for every row.

    Args:
        df (pd.DataFrame): Frame with ``cleaned_address``, ``latitude`` and
            ``longitude`` columns; modified in place.
        geocoders (dict): Clients, limits and counters forwarded to
            ``geocode_address``.
        in_province (function): Predicate forwarded to ``geocode_address``.
    """
    outside_marker = "fuera de provincia"
    # Shared across rows for the duration of this call.
    cache = {}

    for index, row in df.iterrows():
        address = row['cleaned_address']
        needs_geocoding = pd.isna(row['latitude']) or pd.isna(row['longitude'])

        if not needs_geocoding:
            print(f"Row {index}: Address '{address}' already has coordinates.")
            continue

        lat, lng = geocode_address(address, cache, geocoders, in_province)
        df.at[index, 'latitude'] = lat
        df.at[index, 'longitude'] = lng

        if lat == outside_marker and lng == outside_marker:
            outcome = "Coordinates outside provincia"
        elif lat is None and lng is None:
            outcome = "Geocoding failed"
        else:
            outcome = f"Latitude: {lat}, Longitude: {lng}"
        print(f"Updated row {index}: Address '{address}' -> {outcome}")

In [None]:
def save_geocoded_data(df, file_path):
    """
    Write the geocoded DataFrame to disk as CSV.

    The file is encoded as latin1 and the DataFrame index is not written.

    Args:
        df (pd.DataFrame): DataFrame to save.
        file_path (str): Destination path of the CSV file.
    """
    csv_options = {'index': False, 'encoding': 'latin1'}
    df.to_csv(file_path, **csv_options)

In [None]:
# Locations of the credentials file and of the input/output datasets.
CONFIG_PATH = 'config.json'
INPUT_CSV = '/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Processed/valencia_data_merged_survival.csv'
OUTPUT_CSV = '/mnt/c/Users/clayt/Data Science/UCM/TFM/Datos/Processed/data_valencia_geocoded_final_survival.csv'

# API clients plus the request limits and running counters that
# geocode_address reads and updates.
config = load_api_keys(CONFIG_PATH)
opencage_client = OpenCageGeocode(config['OPENCAGE_API_KEY'])
google_maps_client = googlemaps.Client(config['GOOGLE_MAPS_API_KEY'])
geocoders = dict(
    opencage=opencage_client,
    google_maps=google_maps_client,
    max_opencage_requests=2500,
    max_google_requests=10000,
    requests_made_opencage=0,
    requests_made_google=0,
)

# Pipeline: load -> clean addresses -> geocode in place -> save.
lifecycle = load_and_clean_data(INPUT_CSV)
lifecycle['cleaned_address'] = clean_address_column(lifecycle['address'])
update_geocode_df(lifecycle, geocoders, in_province)
save_geocoded_data(lifecycle, OUTPUT_CSV)