# Load the data from CSV

In [1]:
import pandas as pd

# Lift pandas' display limits so frames are shown with every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
import os
# DJANGO_ALLOW_ASYNC_UNSAFE is Django's documented switch for permitting
# synchronous-only operations from an async context.
# NOTE(review): presumably required because the notebook kernel runs an event
# loop and later cells call Django-backed services — confirm.
os.environ.update(DJANGO_ALLOW_ASYNC_UNSAFE="true")

In [3]:
# Read the semicolon-separated CSV export; `adresse_uai` is loaded with the
# pandas "string" dtype.
csv_path = '/app/notebooks/rapprochements/Ramses/fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre.csv'
df = pd.read_csv(csv_path, sep=";", dtype={"adresse_uai": "string"})

# Report how many rows were loaded.
row_count = len(df)
print('The length of the dataframe is: ', row_count)

The length of the dataframe is:  64460


In [4]:
# Show the column labels available in the loaded frame.
column_labels = df.columns
print(column_labels)

Index(['numero_uai', 'appellation_officielle', 'denomination_principale',
       'patronyme_uai', 'secteur_public_prive_libe', 'adresse_uai',
       'lieu_dit_uai', 'boite_postale_uai', 'code_postal_uai',
       'localite_acheminement_uai', 'libelle_commune', 'coordonnee_x',
       'coordonnee_y', 'EPSG', 'latitude', 'longitude', 'appariement',
       'localisation', 'nature_uai', 'nature_uai_libe', 'etat_etablissement',
       'etat_etablissement_libe', 'code_departement', 'code_region',
       'code_academie', 'code_commune', 'libelle_departement',
       'libelle_region', 'libelle_academie', 'position',
       'secteur_prive_code_type_contrat', 'secteur_prive_libelle_type_contrat',
       'code_ministere', 'libelle_ministere', 'date_ouverture', 'sigle'],
      dtype='object')


# Map rows to guesser inputs

In [5]:
# No filtering is applied here: this binds a second name to the same frame
# object (it is not a copy), so every row of `df` is processed downstream.
filtered_df = df

In [6]:
def row_to_input(df_row_raw):
    """Convert one row of the establishments frame into an input dict.

    Args:
        df_row_raw: Mapping-like row (for example the ``pandas.Series`` that
            ``DataFrame.apply(..., axis=1)`` passes in) providing the keys
            ``numero_uai``, ``appellation_officielle``, ``adresse_uai``,
            ``localite_acheminement_uai``, ``latitude`` and ``longitude``.

    Returns:
        dict: ``ext_id``, ``name``, ``address``, ``lat`` and ``lng``.
        ``address`` is ``"<adresse_uai>, <localite_acheminement_uai>"``
        (stripped), or ``None`` when the street is missing, the locality is
        missing, or the resulting text does not start with a digit.

    Raises:
        KeyError: If a required key is absent from the row.
        TypeError, ValueError: If latitude/longitude cannot be converted by
            ``float()``.
    """
    df_row = dict(df_row_raw)

    address = None
    street = df_row["adresse_uai"]
    if not pd.isna(street):
        locality = df_row["localite_acheminement_uai"]
        # Formatting a missing locality would inject the literal text
        # "nan"/"<NA>" into the address, so such rows get no address at all.
        if not pd.isna(locality):
            candidate = f"{street}, {locality}".strip()
            # Only keep addresses whose first character is a digit.
            # NOTE(review): presumably this selects addresses that begin
            # with a street number — confirm.
            if candidate[:1].isdigit():
                address = candidate

    return {
        "ext_id": df_row["numero_uai"],
        "name": df_row["appellation_officielle"],
        "address": address,
        "lat": float(df_row["latitude"]),
        "lng": float(df_row["longitude"]),
    }

# Apply the row converter to every row and collect the results in a list.
inputs = [*filtered_df.apply(row_to_input, axis=1)]

In [7]:
import random
# Optional: uncomment to run on a random subset of 1000 inputs instead of all.
#inputs = random.sample(inputs, 1000)

# Sanity check: number of inputs, then the first one.
input_count = len(inputs)
print(input_count)
first_input = inputs[0]
print(first_input)

64460
{'ext_id': '0131201K', 'name': "Ecole élémentaire d'application Frédéric Mistral", 'address': '2 avenue Doyen Guyon, AIX EN PROVENCE', 'lat': 43.534916145595616, 'lng': 5.434260408273862}


In [8]:
from batid.services.guess_bdg_new import Guesser, GeocodeNameHandler, GeocodeAddressHandler, ClosestFromPointHandler
# Work file that the guesser writes and then processes.
guesses_path = "./guesses_v6.json"

guesser = Guesser(batch_size=1000)

# Handlers are listed name-first, then address.
# NOTE(review): presumably they run in list order — confirm in Guesser.
name_handler = GeocodeNameHandler(
    sleep_time=0,
    photon_url="http://host.docker.internal:2322/api/",
    bbox_apothem_in_meters=1000,
)
address_handler = GeocodeAddressHandler(closest_radius=200)
guesser.handlers = [name_handler, address_handler]

# A copy of the inputs list is handed over, as in the original call.
guesser.create_work_file(list(inputs), guesses_path)
guesser.guess_work_file(guesses_path)

  0%|          | 0/65 [00:00<?, ?it/s]

In [9]:
# Rebind `guesser` to a fresh default-constructed Guesser, load the work file
# stored at `guesses_path`, and print its summary report.
guesser = Guesser()
guesser.load_work_file(guesses_path)
guesser.report()

-- Report --
Number of rows: 64460
Number of match: 43846 (68.02%)

-- finished_steps --
Rows with finished_steps closest_from_point: 0 (0.00%)
Rows with finished_steps geocode_address: 64460 (100.00%)
Rows with finished_steps geocode_name: 64460 (100.00%)
Rows with empty finished_steps: 0 (0.00%)

-- match_reasons : absolute --
match_reason
found_name_in_osm_point_on_bdg            20888
precise_address_match                     17758
found_name_in_osm_isolated_closest_bdg     5200
Name: count, dtype: int64

-- match_reasons : % --
match_reason
found_name_in_osm_point_on_bdg            32.404592
precise_address_match                     27.548868
found_name_in_osm_isolated_closest_bdg     8.067018
Name: count, dtype: float64

-- Inputs --
rows with ban_id: 21242 (32.95%)


In [10]:
# Export the loaded guesses to CSV; the external id column is named "ext_id".
output_csv_path = "/app/notebooks/rapprochements/Ramses/Ramses_out_2.csv"
guesser.to_csv(output_csv_path, ext_id_col_name="ext_id")