## TODO: 
- Re-run the algorithm, skipping entries that already have a previously computed location

## This notebook calculates the latitude and longitude of a given location

In [246]:
import ast
import time
import warnings
from functools import cache

import numpy as np
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

In [247]:
# Load the listings and turn each "location" cell from its string form back
# into a Python object (later cells join and measure it like a list of parts).
df = pd.read_csv("../working_data/properties_vlc.csv")
df["location"] = df["location"].map(ast.literal_eval)

In [248]:
# Discard, in place, every listing whose price is missing.
df.dropna(axis="index", subset=["price"], inplace=True)

In [249]:
# Is the first single-component location just an empty string?
has_single_part = df["location"].map(len) == 1
df.loc[has_single_part, "location"].iloc[0] == [""]

False

In [250]:
# Number of listings whose concatenated location never mentions "València".
joined_location = df["location"].map("".join)
lacks_valencia = joined_location.str.contains("València").eq(False)
len(df.loc[lacks_valencia, "location"])

1

In [251]:
# Single Nominatim client for the notebook, identified by its own user agent.
geolocator = Nominatim(user_agent="valencia_price_predictor")

# Throttled wrapper around the client's geocode call: at least one second
# between requests, up to three retries with a two-second pause after an
# error, and errors are re-raised rather than swallowed.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
    max_retries=3,
    error_wait_seconds=2,
    swallow_exceptions=False,
)

In [252]:
# Coordinates Nominatim returns for the bare city name. Any listing that ends
# up with exactly these coordinates was only resolved at city level.
# The lookups go through the rate-limited `geocode` wrapper, and the first
# response is reused instead of querying the same address a second time.
city_location = geocode("Valencia, València")
if city_location is None:
    raise RuntimeError("Nominatim returned no result for 'Valencia, València'")
default = (city_location.latitude, city_location.longitude)

# Also confirm that the Spanish-suffixed spelling resolves to the same place.
default, city_location == geocode("València, España")

((39.4697065, -0.3763353), True)

#### The problem with using the city center as the default value is that, if you run a clustering algorithm, a lot of properties will be assigned to that cluster without actually being there. Those properties might get a predicted price above the real one, since the city center is one of the most sought-after places to live, while the properties that really are in the center will get a predicted price that is too low.

I need a way to tell when an address is so general that it doesn't represent the real location of the property. I think the easiest way is to turn the default coordinates into NaN, as there is not going to be a real property at that exact latitude and longitude.

In [253]:
@cache
def cached_geocode(address):
    """Geocode ``address`` and memoise the resulting coordinates.

    The lookup goes through the rate-limited ``geocode`` wrapper instead of
    calling ``geolocator.geocode`` directly, so requests are spaced out and
    retried as configured on that wrapper.

    Parameters
    ----------
    address : str
        Address to look up. It must be hashable, because it is the cache key.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match.

    Raises
    ------
    ValueError
        If the geocoder returns no match. Raising, rather than returning NaN,
        keeps the miss out of the cache: ``functools.cache`` only stores
        returned values, so the address can be retried later.

    Any exception raised by ``geocode`` itself propagates unchanged.
    """
    match = geocode(address)
    if not match:
        raise ValueError("No location found")
    return (match.latitude, match.longitude)

In [254]:
def non_cached_geocode(address):
    """Geocode ``address`` without memoising the result.

    The lookup goes through the rate-limited ``geocode`` wrapper instead of
    calling ``geolocator.geocode`` directly, so requests are spaced out and
    retried as configured on that wrapper.

    Parameters
    ----------
    address : str
        Address to look up.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match.

    Raises
    ------
    ValueError
        If the geocoder returns no match.

    Any exception raised by ``geocode`` itself propagates unchanged.
    """
    match = geocode(address)
    if not match:
        raise ValueError("No location found")
    return (match.latitude, match.longitude)

In [256]:
def selective_geocode(location, cache_max_len=50):
    """Geocode a listing, falling back from the most specific address to broader ones.

    The components of ``location`` are joined with ``", "`` and ``", España"``
    is appended (Idealista doesn't store the country, as every country has a
    different Idealista site). The full address is tried first; after every
    failure the leading component is dropped and the remainder is tried, until
    no comma is left in the address.

    Parameters
    ----------
    location : iterable of str
        Address components ordered from most specific to most general.
    cache_max_len : int, optional
        Addresses shorter than this many characters are resolved through
        ``cached_geocode``; longer ones go through ``non_cached_geocode``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the first candidate that resolves, or
        ``(np.nan, np.nan)`` when none does.
    """
    address = ", ".join(location) + ", España"
    is_full_address = True

    while "," in address:
        if not is_full_address:
            # Drop the most specific component and retry with what is left.
            address = address.partition(", ")[2].strip()
            # A remainder starting with a digit begins with a street number,
            # which is not a useful query on its own: skip straight past it.
            if not address or address[0].isdigit():
                continue
        is_full_address = False

        lookup = cached_geocode if len(address) < cache_max_len else non_cached_geocode
        try:
            return lookup(address)
        except ValueError:
            # No match at this granularity: try a broader address.
            continue
        except Exception as exc:
            # Unexpected failure (network, rate limiting, ...). Stay
            # best-effort and fall back, but report it instead of silently
            # treating it as "no match".
            warnings.warn(
                f"Geocoding {address!r} failed ({exc!r}); falling back to a broader address",
                RuntimeWarning,
            )

    return (np.nan, np.nan)

#### Handling Caching with Exceptions 

If you cache automatically with Python's functools library, every returned value is cached. So if you return np.nan after an exception, that value is stored and subsequent calls will get it back, even though the exception may have been raised because you lost the connection to the server, hit rate limiting, or ran into some other factor you don't control. This is why `cached_geocode` raises instead of returning NaN: a call that raises stores nothing in the cache. My solution to avoid this behaviour is to set a number of retries before caching when the value comes from an exception.

In [259]:
# Geocode every listing, then split the (latitude, longitude) pairs into
# two columns.
coordinate_pairs = df["location"].apply(selective_geocode)
latitudes, longitudes = zip(*coordinate_pairs)
df["latitude"] = latitudes
df["longitude"] = longitudes

In [261]:
# How many listings were resolved to exactly the city-level default coordinates?
matches_default = df["latitude"].eq(default[0]) & df["longitude"].eq(default[1])
len(df.loc[matches_default, ["latitude", "longitude"]])

914

In [262]:
# Replace the city-level default coordinates with NaN so those listings are
# treated as having no known location.
is_city_fallback = df["latitude"].eq(default[0]) & df["longitude"].eq(default[1])
df.loc[is_city_fallback, ["latitude", "longitude"]] = np.nan

In [263]:
# Persist only the two coordinate columns.
# NOTE(review): the row index is not written, and rows with a missing price
# were dropped after loading, so this file presumably cannot be matched to
# properties_vlc.csv by row position. Confirm how downstream code joins it.
coordinates = df.loc[:, ["latitude", "longitude"]]
coordinates.to_csv("../working_data/lat_lon3_vlc.csv", index=False)

In [264]:
# Cache statistics for cached_geocode (hits, misses, maxsize, currsize),
# showing how many lookups were served from memory.
cached_geocode.cache_info()

CacheInfo(hits=12839, misses=495, maxsize=None, currsize=319)

In [270]:
# Listings that ended up without coordinates, shown next to their raw location.
lacks_coordinates = df["latitude"].isna()
df.loc[lacks_coordinates, ["latitude", "longitude", "location"]]

Unnamed: 0,latitude,longitude,location
127,,,"[L'Horta Sud, València]"
2053,,,"[Valencia, València]"
2055,,,"[Valencia, València]"
2122,,,"[Valencia, València]"
2129,,,"[Valencia, València]"
...,...,...,...
20797,,,"[Barrio La Raiosa, Distrito Jesús, València, V..."
20798,,,"[carcagente, 14, Barrio La Raiosa, Distrito Je..."
20800,,,"[Valencia, València]"
20804,,,"[Valencia, València]"
