# Load the data from the CSV export

In [1]:
import pandas as pd

# Disable pandas' display truncation: with both limits set to None, every
# column and every row is shown when a dataframe is printed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
import os
# Django blocks synchronous-only calls (such as ORM queries) made from a thread
# with a running event loop, which is the case inside a Jupyter kernel.
# This flag switches that safety check off for the notebook session.
os.environ.update({"DJANGO_ALLOW_ASYNC_UNSAFE": "true"})

In [3]:
# Read the Saint-Malo source file, a semicolon-separated CSV.
# "POINT GPS" is forced to str so that the raw coordinate text is kept as-is
# and can be split into latitude / longitude later in the notebook.
df = pd.read_csv(
    '/app/notebooks/rapprochements/Saint-Malo/SM.csv',
    sep=";",
    dtype={"POINT GPS": str},
)

# Report how many rows were loaded
print('The length of the dataframe is: ', df.shape[0])

The length of the dataframe is:  426


In [4]:
# Print the column labels of the loaded dataframe
print(df.columns)

Index(['SERVICE / DIRECTION PILOTE', 'NUM', 'QUARTIER', 'CODE QUARTIER',
       'NOM SITE', 'CODE SITE', 'CODE BÂTIMENT', 'CODE COMPLET',
       'NOM BÂTIMENT', 'TYPOLOGIE ST MALO',
       ...
       '2020', '2021', '2022', '2023', '2024', '2025', '2026', '2027', '2028',
       'Unnamed: 256'],
      dtype='object', length=257)


# map to inputs

In [5]:
# Preview the first rows of the dataframe (head() returns 5 rows by default)
print(df.head())

  SERVICE / DIRECTION PILOTE    NUM           QUARTIER CODE QUARTIER  \
0                        NaN  183.0  Q01 : Intra-Muros           Q01   
1                        NaN  235.0  Q01 : Intra-Muros           Q01   
2                        NaN    NaN  Q01 : Intra-Muros           Q01   
3                        NaN    NaN  Q01 : Intra-Muros           Q01   
4                        NaN    NaN  Q01 : Intra-Muros           Q01   

   NOM SITE CODE SITE CODE BÂTIMENT   CODE COMPLET  \
0  Remparts   Q01-000    Q01-000-01     Q01-000-01   
1  Remparts   Q01-000    Q01-000-02     Q01-000-02   
2  Remparts   Q01-000    Q01-000-03     Q01-000-03   
3  Remparts   Q01-000    Q01-000-04  Q01-000-04-01   
4  Remparts   Q01-000    Q01-000-04  Q01-000-04-02   

                                        NOM BÂTIMENT  \
0              Sanitaires publics - Porte St Vincent   
1     Dépôt ville ex local jardinier au R+1 remparts   
2                 Remparts st Vincent à Grand' Porte   
3  Grand Porte - e

In [6]:
# Keep only the rows whose "BATI / NON BATI" column is exactly "Bâti"
filtered_df = df.loc[df["BATI / NON BATI"].eq("Bâti")]

In [7]:
def row_to_input(df_row_raw):
    """Convert one row of the source dataframe into a guesser input dict.

    Args:
        df_row_raw: A dataframe row (pandas Series) or any mapping holding at
            least the "CODE COMPLET", "NOM BÂTIMENT", "ADRESSE TOTALE" and
            "POINT GPS" fields. "POINT GPS" must be a "latitude, longitude"
            string.

    Returns:
        dict: ``ext_id``, ``name`` and ``address`` copied from the row, plus
        ``lat`` and ``lng`` parsed from "POINT GPS" as floats.

    Raises:
        ValueError: If "POINT GPS" is missing (NaN / None) or cannot be parsed
            as two comma-separated numbers. The message contains the row's
            "CODE COMPLET" so the offending line can be located in the CSV.
    """
    df_row = dict(df_row_raw)
    ext_id = df_row["CODE COMPLET"]
    position = df_row["POINT GPS"]

    # An empty CSV cell is read as NaN (a float), which has no .split():
    # fail with an explicit message instead of an opaque AttributeError.
    if not isinstance(position, str):
        raise ValueError(
            f"Row {ext_id!r}: missing 'POINT GPS' value ({position!r})"
        )

    # Split once; as before, only the first two comma-separated parts are used.
    parts = position.split(",")
    try:
        lat = float(parts[0].strip())
        lng = float(parts[1].strip())
    except (IndexError, ValueError) as exc:
        raise ValueError(
            f"Row {ext_id!r}: cannot parse 'POINT GPS' value {position!r} "
            "as 'latitude, longitude'"
        ) from exc

    return {
        "ext_id": ext_id,
        "name": df_row["NOM BÂTIMENT"],
        "address": df_row["ADRESSE TOTALE"],
        "lat": lat,
        "lng": lng,
    }

# Build one input dict per remaining row.
# A list comprehension is used rather than list(filtered_df.apply(..., axis=1)):
# on an empty dataframe apply() hands back a DataFrame, and list() of a
# DataFrame yields its column labels instead of an empty list.
inputs = [row_to_input(row) for _, row in filtered_df.iterrows()]

In [8]:
import random
# Optional: uncomment the next line to keep only a random sample of 10 inputs
#inputs = random.sample(inputs, 10)
# Show how many inputs were built and what the first one looks like
print(len(inputs))
print(inputs[0])

402
{'ext_id': 'Q01-000-01', 'name': 'Sanitaires publics - Porte St Vincent', 'address': 'porte saint-vincent 35400 SAINT-MALO', 'lat': 48.65078991372582, 'lng': -2.023215289871935}


In [9]:
from batid.services.guess_bdg_new import Guesser
# Path of the JSON work file holding the guesses.
# NOTE(review): the path is relative, so the file lands in the kernel's current working directory.
guesses_path = "./guesses.json"
guesser = Guesser()
# Create the work file from a copy of the inputs list
guesser.create_work_file(list(inputs), guesses_path)
# Run the guessing on the work file. Judging by the logs printed below, it
# reloads the file, processes the guesses in batches and saves the file again.
guesser.guess_work_file(guesses_path)

- saving work file
- work file saved
- loading work file
- converting guesses to batches
- converted 402 guesses to 1 batches
Batch 1/1
Batch changed
- saving work file
- work file saved


In [10]:
# Start from a fresh Guesser, reload the saved work file and print the matching report
guesser = Guesser()
guesser.load_work_file(guesses_path)
guesser.report()

- loading work file
-- Report --
Number of rows: 402
Number of match: 331 (82.34%)

-- finished_steps --
Rows with finished_steps closest_from_point: 402 (100.00%)
Rows with finished_steps geocode_address: 402 (100.00%)
Rows with finished_steps geocode_name: 402 (100.00%)
Rows with empty finished_steps: 0 (0.00%)

-- match_reasons : absolute --
match_reason
point_on_bdg             259
isolated_closest_bdg      48
precise_address_match     21
found_name_in_osm          3
Name: count, dtype: int64

-- match_reasons : % --
match_reason
point_on_bdg             64.427861
isolated_closest_bdg     11.940299
precise_address_match     5.223881
found_name_in_osm         0.746269
Name: count, dtype: float64

-- Inputs --
rows with ban_id: 23 (5.72%)


In [11]:
# NOTE(review): this cell repeats the previous cell verbatim and prints the same
# report; it looks like a leftover and could be removed.
# Start from a fresh Guesser, reload the saved work file and print the matching report
guesser = Guesser()
guesser.load_work_file(guesses_path)
guesser.report()

- loading work file
-- Report --
Number of rows: 402
Number of match: 331 (82.34%)

-- finished_steps --
Rows with finished_steps closest_from_point: 402 (100.00%)
Rows with finished_steps geocode_address: 402 (100.00%)
Rows with finished_steps geocode_name: 402 (100.00%)
Rows with empty finished_steps: 0 (0.00%)

-- match_reasons : absolute --
match_reason
point_on_bdg             259
isolated_closest_bdg      48
precise_address_match     21
found_name_in_osm          3
Name: count, dtype: int64

-- match_reasons : % --
match_reason
point_on_bdg             64.427861
isolated_closest_bdg     11.940299
precise_address_match     5.223881
found_name_in_osm         0.746269
Name: count, dtype: float64

-- Inputs --
rows with ban_id: 23 (5.72%)


In [12]:
# Export the guesses to a CSV file stored next to the source file.
# ext_id_col_name presumably sets the header of the column holding each input's
# ext_id (taken from "CODE COMPLET" in row_to_input) — TODO confirm against Guesser.to_csv.
guesser.to_csv("/app/notebooks/rapprochements/Saint-Malo/SM_out.csv", ext_id_col_name="CODE COMPLET")